From f5d1c24760d90003c1a577c696ac5de23a289e64 Mon Sep 17 00:00:00 2001 From: Ray Douglass Date: Mon, 20 May 2024 17:38:30 -0400 Subject: [PATCH 001/340] DOC v24.08 Updates [skip ci] --- .../cuda11.8-conda/devcontainer.json | 6 +-- .devcontainer/cuda11.8-pip/devcontainer.json | 6 +-- .../cuda12.2-conda/devcontainer.json | 6 +-- .devcontainer/cuda12.2-pip/devcontainer.json | 6 +-- .github/workflows/build.yaml | 16 ++++---- .github/workflows/pandas-tests.yaml | 2 +- .github/workflows/pr.yaml | 40 +++++++++---------- .github/workflows/test.yaml | 22 +++++----- README.md | 2 +- VERSION | 2 +- .../all_cuda-118_arch-x86_64.yaml | 10 ++--- .../all_cuda-122_arch-x86_64.yaml | 10 ++--- cpp/examples/versions.cmake | 2 +- dependencies.yaml | 32 +++++++-------- java/ci/README.md | 4 +- java/pom.xml | 2 +- python/cudf/pyproject.toml | 4 +- python/cudf_kafka/pyproject.toml | 2 +- python/cudf_polars/pyproject.toml | 2 +- python/custreamz/pyproject.toml | 4 +- python/dask_cudf/pyproject.toml | 6 +-- 21 files changed, 93 insertions(+), 93 deletions(-) diff --git a/.devcontainer/cuda11.8-conda/devcontainer.json b/.devcontainer/cuda11.8-conda/devcontainer.json index 944a73ecc98..c62e18512a0 100644 --- a/.devcontainer/cuda11.8-conda/devcontainer.json +++ b/.devcontainer/cuda11.8-conda/devcontainer.json @@ -5,17 +5,17 @@ "args": { "CUDA": "11.8", "PYTHON_PACKAGE_MANAGER": "conda", - "BASE": "rapidsai/devcontainers:24.06-cpp-cuda11.8-mambaforge-ubuntu22.04" + "BASE": "rapidsai/devcontainers:24.08-cpp-cuda11.8-mambaforge-ubuntu22.04" } }, "runArgs": [ "--rm", "--name", - "${localEnv:USER}-rapids-${localWorkspaceFolderBasename}-24.06-cuda11.8-conda" + "${localEnv:USER}-rapids-${localWorkspaceFolderBasename}-24.08-cuda11.8-conda" ], "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.6": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.8": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" diff --git a/.devcontainer/cuda11.8-pip/devcontainer.json b/.devcontainer/cuda11.8-pip/devcontainer.json index 8b802333bda..4ab4bd75643 100644 --- a/.devcontainer/cuda11.8-pip/devcontainer.json +++ b/.devcontainer/cuda11.8-pip/devcontainer.json @@ -5,17 +5,17 @@ "args": { "CUDA": "11.8", "PYTHON_PACKAGE_MANAGER": "pip", - "BASE": "rapidsai/devcontainers:24.06-cpp-cuda11.8-ubuntu22.04" + "BASE": "rapidsai/devcontainers:24.08-cpp-cuda11.8-ubuntu22.04" } }, "runArgs": [ "--rm", "--name", - "${localEnv:USER}-rapids-${localWorkspaceFolderBasename}-24.06-cuda11.8-pip" + "${localEnv:USER}-rapids-${localWorkspaceFolderBasename}-24.08-cuda11.8-pip" ], "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.6": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.8": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" diff --git a/.devcontainer/cuda12.2-conda/devcontainer.json b/.devcontainer/cuda12.2-conda/devcontainer.json index 886b07025cc..2b50454410f 100644 --- a/.devcontainer/cuda12.2-conda/devcontainer.json +++ b/.devcontainer/cuda12.2-conda/devcontainer.json @@ -5,17 +5,17 @@ "args": { "CUDA": "12.2", "PYTHON_PACKAGE_MANAGER": "conda", - "BASE": "rapidsai/devcontainers:24.06-cpp-mambaforge-ubuntu22.04" + "BASE": "rapidsai/devcontainers:24.08-cpp-mambaforge-ubuntu22.04" } }, "runArgs": [ "--rm", "--name", - 
"${localEnv:USER}-rapids-${localWorkspaceFolderBasename}-24.06-cuda12.2-conda" + "${localEnv:USER}-rapids-${localWorkspaceFolderBasename}-24.08-cuda12.2-conda" ], "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.6": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.8": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" diff --git a/.devcontainer/cuda12.2-pip/devcontainer.json b/.devcontainer/cuda12.2-pip/devcontainer.json index 86df56ada19..fc5abc56094 100644 --- a/.devcontainer/cuda12.2-pip/devcontainer.json +++ b/.devcontainer/cuda12.2-pip/devcontainer.json @@ -5,17 +5,17 @@ "args": { "CUDA": "12.2", "PYTHON_PACKAGE_MANAGER": "pip", - "BASE": "rapidsai/devcontainers:24.06-cpp-cuda12.2-ubuntu22.04" + "BASE": "rapidsai/devcontainers:24.08-cpp-cuda12.2-ubuntu22.04" } }, "runArgs": [ "--rm", "--name", - "${localEnv:USER}-rapids-${localWorkspaceFolderBasename}-24.06-cuda12.2-pip" + "${localEnv:USER}-rapids-${localWorkspaceFolderBasename}-24.08-cuda12.2-pip" ], "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.6": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.8": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 6942ef0009d..c5679cc5141 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -28,7 +28,7 @@ concurrency: jobs: cpp-build: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.08 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -37,7 +37,7 @@ jobs: python-build: needs: [cpp-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.08 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -46,7 +46,7 @@ jobs: upload-conda: needs: [cpp-build, python-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.08 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -57,7 +57,7 @@ jobs: if: github.ref_type == 'branch' needs: python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08 with: arch: "amd64" branch: ${{ inputs.branch }} @@ -69,7 +69,7 @@ jobs: sha: ${{ inputs.sha }} wheel-build-cudf: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -79,7 +79,7 @@ jobs: wheel-publish-cudf: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.08 with: build_type: ${{ inputs.build_type || 'branch' }} branch: 
${{ inputs.branch }} @@ -89,7 +89,7 @@ jobs: wheel-build-dask-cudf: needs: wheel-publish-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -101,7 +101,7 @@ jobs: wheel-publish-dask-cudf: needs: wheel-build-dask-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.08 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/pandas-tests.yaml b/.github/workflows/pandas-tests.yaml index 60544294809..a8643923a4d 100644 --- a/.github/workflows/pandas-tests.yaml +++ b/.github/workflows/pandas-tests.yaml @@ -17,7 +17,7 @@ jobs: pandas-tests: # run the Pandas unit tests secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 with: matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.9" and .CUDA_VER == "12.2.2" )) build_type: nightly diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index f9d5976f1fe..cb582df21e0 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -32,41 +32,41 @@ jobs: - pandas-tests - pandas-tests-diff secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.08 checks: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.08 with: enable_check_generated_files: false conda-cpp-build: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.08 with: build_type: pull-request conda-cpp-checks: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.08 with: build_type: pull-request enable_check_symbols: true conda-cpp-tests: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.08 with: build_type: pull-request conda-python-build: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.08 with: build_type: pull-request conda-python-cudf-tests: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.08 with: build_type: pull-request script: "ci/test_python_cudf.sh" @@ -74,14 +74,14 @@ jobs: # Tests for dask_cudf, custreamz, 
cudf_kafka are separated for CI parallelism needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.08 with: build_type: pull-request script: "ci/test_python_other.sh" conda-java-tests: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08 with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -91,7 +91,7 @@ jobs: static-configure: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08 with: build_type: pull-request # Use the wheel container so we can skip conda solves and since our @@ -101,7 +101,7 @@ jobs: conda-notebook-tests: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08 with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -111,7 +111,7 @@ jobs: docs-build: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08 with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -121,21 +121,21 @@ jobs: wheel-build-cudf: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08 with: build_type: pull-request script: "ci/build_wheel_cudf.sh" wheel-tests-cudf: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 with: build_type: pull-request script: ci/test_wheel_cudf.sh wheel-build-dask-cudf: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -144,7 +144,7 @@ jobs: wheel-tests-dask-cudf: needs: wheel-build-dask-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". 
matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -152,7 +152,7 @@ jobs: script: ci/test_wheel_dask_cudf.sh devcontainer: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.08 with: arch: '["amd64"]' cuda: '["12.2"]' @@ -163,7 +163,7 @@ jobs: unit-tests-cudf-pandas: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 with: matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) build_type: pull-request @@ -172,7 +172,7 @@ jobs: # run the Pandas unit tests using PR branch needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 with: matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.9" and .CUDA_VER == "12.2.2" )) build_type: pull-request @@ -182,7 +182,7 @@ jobs: pandas-tests-diff: # diff the results of running the Pandas unit tests and publish a job summary needs: pandas-tests - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08 with: node_type: cpu4 build_type: pull-request diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 170f45e23fd..36c9088d93c 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -16,7 +16,7 @@ on: jobs: conda-cpp-checks: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.08 with: build_type: nightly branch: ${{ inputs.branch }} @@ -25,7 +25,7 @@ jobs: enable_check_symbols: true conda-cpp-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.08 with: build_type: nightly branch: ${{ inputs.branch }} @@ -33,7 +33,7 @@ jobs: sha: ${{ inputs.sha }} conda-cpp-memcheck-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08 with: build_type: nightly branch: ${{ inputs.branch }} @@ -45,7 +45,7 @@ jobs: run_script: "ci/test_cpp_memcheck.sh" static-configure: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08 with: build_type: pull-request # Use the wheel container so we can skip conda solves and since our @@ -54,7 +54,7 @@ jobs: run_script: "ci/configure_cpp_static.sh" conda-python-cudf-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.08 with: build_type: nightly branch: ${{ inputs.branch }} @@ -64,7 +64,7 @@ 
jobs: conda-python-other-tests: # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.08 with: build_type: nightly branch: ${{ inputs.branch }} @@ -73,7 +73,7 @@ jobs: script: "ci/test_python_other.sh" conda-java-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08 with: build_type: nightly branch: ${{ inputs.branch }} @@ -85,7 +85,7 @@ jobs: run_script: "ci/test_java.sh" conda-notebook-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08 with: build_type: nightly branch: ${{ inputs.branch }} @@ -97,7 +97,7 @@ jobs: run_script: "ci/test_notebooks.sh" wheel-tests-cudf: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 with: build_type: nightly branch: ${{ inputs.branch }} @@ -106,7 +106,7 @@ jobs: script: ci/test_wheel_cudf.sh wheel-tests-dask-cudf: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -117,7 +117,7 @@ jobs: script: ci/test_wheel_dask_cudf.sh unit-tests-cudf-pandas: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 with: build_type: nightly branch: ${{ inputs.branch }} diff --git a/README.md b/README.md index 205e16ea0e5..377998cd991 100644 --- a/README.md +++ b/README.md @@ -93,7 +93,7 @@ cuDF can be installed with conda (via [miniconda](https://docs.conda.io/projects ```bash conda install -c rapidsai -c conda-forge -c nvidia \ - cudf=24.06 python=3.11 cuda-version=12.2 + cudf=24.08 python=3.11 cuda-version=12.2 ``` We also provide [nightly Conda packages](https://anaconda.org/rapidsai-nightly) built from the HEAD diff --git a/VERSION b/VERSION index 0bff6981a3d..ec8489fda92 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -24.06.00 +24.08.00 diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 48699b81eed..2ce1d9597e8 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -26,7 +26,7 @@ dependencies: - cupy>=12.0.0 - cxx-compiler - cython>=3.0.3 -- dask-cuda==24.6.* +- dask-cuda==24.8.* - dlpack>=0.8,<1.0 - doxygen=1.9.1 - fastavro>=0.22.9 @@ -43,10 +43,10 @@ dependencies: - libcufile=1.4.0.31 - libcurand-dev=10.3.0.86 - libcurand=10.3.0.86 -- libkvikio==24.6.* +- libkvikio==24.8.* - libparquet==16.0.0.* - librdkafka>=1.9.0,<1.10.0a0 -- librmm==24.6.* +- librmm==24.8.* - make - moto>=4.0.8 - msgpack-python @@ -76,9 +76,9 @@ dependencies: - python-confluent-kafka>=1.9.0,<1.10.0a0 - python>=3.9,<3.12 - pytorch>=2.1.0 -- 
rapids-dask-dependency==24.6.* +- rapids-dask-dependency==24.8.* - rich -- rmm==24.6.* +- rmm==24.8.* - s3fs>=2022.3.0 - scikit-build-core>=0.7.0 - scipy diff --git a/conda/environments/all_cuda-122_arch-x86_64.yaml b/conda/environments/all_cuda-122_arch-x86_64.yaml index d06a727f331..64d97dd742e 100644 --- a/conda/environments/all_cuda-122_arch-x86_64.yaml +++ b/conda/environments/all_cuda-122_arch-x86_64.yaml @@ -27,7 +27,7 @@ dependencies: - cupy>=12.0.0 - cxx-compiler - cython>=3.0.3 -- dask-cuda==24.6.* +- dask-cuda==24.8.* - dlpack>=0.8,<1.0 - doxygen=1.9.1 - fastavro>=0.22.9 @@ -42,10 +42,10 @@ dependencies: - libarrow==16.0.0.* - libcufile-dev - libcurand-dev -- libkvikio==24.6.* +- libkvikio==24.8.* - libparquet==16.0.0.* - librdkafka>=1.9.0,<1.10.0a0 -- librmm==24.6.* +- librmm==24.8.* - make - moto>=4.0.8 - msgpack-python @@ -74,9 +74,9 @@ dependencies: - python-confluent-kafka>=1.9.0,<1.10.0a0 - python>=3.9,<3.12 - pytorch>=2.1.0 -- rapids-dask-dependency==24.6.* +- rapids-dask-dependency==24.8.* - rich -- rmm==24.6.* +- rmm==24.8.* - s3fs>=2022.3.0 - scikit-build-core>=0.7.0 - scipy diff --git a/cpp/examples/versions.cmake b/cpp/examples/versions.cmake index dff66b4d7d8..144b3d3721b 100644 --- a/cpp/examples/versions.cmake +++ b/cpp/examples/versions.cmake @@ -12,4 +12,4 @@ # the License. # ============================================================================= -set(CUDF_TAG branch-24.06) +set(CUDF_TAG branch-24.08) diff --git a/dependencies.yaml b/dependencies.yaml index f20c1591e73..39290fd2b93 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -270,8 +270,8 @@ dependencies: - output_types: conda packages: - fmt>=10.1.1,<11 - - librmm==24.6.* - - libkvikio==24.6.* + - librmm==24.8.* + - libkvikio==24.8.* - librdkafka>=1.9.0,<1.10.0a0 # Align nvcomp version with rapids-cmake - nvcomp==3.0.6 @@ -305,7 +305,7 @@ dependencies: common: - output_types: conda packages: - - &rmm_conda rmm==24.6.* + - &rmm_conda rmm==24.8.* - pip - pip: - git+https://github.com/python-streamz/streamz.git@master @@ -321,10 +321,10 @@ dependencies: matrices: - matrix: {cuda: "12.*"} packages: &build_python_packages_cu12 - - &rmm_cu12 rmm-cu12==24.6.* + - &rmm_cu12 rmm-cu12==24.8.* - matrix: {cuda: "11.*"} packages: &build_python_packages_cu11 - - &rmm_cu11 rmm-cu11==24.6.* + - &rmm_cu11 rmm-cu11==24.8.* - {matrix: null, packages: [*rmm_conda] } libarrow_build: common: @@ -477,7 +477,7 @@ dependencies: - output_types: [conda] packages: - breathe>=4.35.0 - - dask-cuda==24.6.* + - dask-cuda==24.8.* - *doxygen - make - myst-nb @@ -568,11 +568,11 @@ dependencies: matrices: - matrix: {cuda: "12.*"} packages: - - rmm-cu12==24.6.* + - rmm-cu12==24.8.* - pynvjitlink-cu12 - matrix: {cuda: "11.*"} packages: - - rmm-cu11==24.6.* + - rmm-cu11==24.8.* - cubinlinker-cu11 - ptxcompiler-cu11 - {matrix: null, packages: [cubinlinker, ptxcompiler, *rmm_conda]} @@ -585,7 +585,7 @@ dependencies: common: - output_types: [conda, requirements, pyproject] packages: - - rapids-dask-dependency==24.6.* + - rapids-dask-dependency==24.8.* run_custreamz: common: - output_types: conda @@ -671,13 +671,13 @@ dependencies: common: - output_types: [conda, requirements, pyproject] packages: - - dask-cuda==24.6.* + - dask-cuda==24.8.* - *numba depends_on_cudf: common: - output_types: conda packages: - - &cudf_conda cudf==24.6.* + - &cudf_conda cudf==24.8.* - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -689,16 +689,16 @@ dependencies: matrices: - matrix: 
{cuda: "12.*"}
            packages:
-              - cudf-cu12==24.6.*
+              - cudf-cu12==24.8.*
          - matrix: {cuda: "11.*"}
            packages:
-              - cudf-cu11==24.6.*
+              - cudf-cu11==24.8.*
          - {matrix: null, packages: [*cudf_conda]}
   depends_on_cudf_kafka:
     common:
       - output_types: conda
         packages:
-          - &cudf_kafka_conda cudf_kafka==24.6.*
+          - &cudf_kafka_conda cudf_kafka==24.8.*
       - output_types: requirements
         packages:
           # pip recognizes the index as a global option for the requirements.txt file
@@ -710,10 +710,10 @@ dependencies:
       matrices:
         - matrix: {cuda: "12.*"}
           packages:
-            - cudf_kafka-cu12==24.6.*
+            - cudf_kafka-cu12==24.8.*
         - matrix: {cuda: "11.*"}
           packages:
-            - cudf_kafka-cu11==24.6.*
+            - cudf_kafka-cu11==24.8.*
         - {matrix: null, packages: [*cudf_kafka_conda]}
   depends_on_cupy:
     common:
diff --git a/java/ci/README.md b/java/ci/README.md
index 18ad3cc4d0d..49481efab6b 100644
--- a/java/ci/README.md
+++ b/java/ci/README.md
@@ -34,7 +34,7 @@ nvidia-docker run -it cudf-build:11.8.0-devel-rocky8 bash
 You can download the cuDF repo in the docker container or you can mount it into the container. Here I choose to download again in the container.
 
 ```bash
-git clone --recursive https://github.com/rapidsai/cudf.git -b branch-24.06
+git clone --recursive https://github.com/rapidsai/cudf.git -b branch-24.08
 ```
 
 ### Build cuDF jar with devtoolset
@@ -47,4 +47,4 @@ scl enable gcc-toolset-11 "java/ci/build-in-docker.sh"
 
 ### The output
 
-You can find the cuDF jar in java/target/ like cudf-24.06.0-SNAPSHOT-cuda11.jar.
+You can find the cuDF jar in java/target/ like cudf-24.08.0-SNAPSHOT-cuda11.jar.
diff --git a/java/pom.xml b/java/pom.xml
index 46b5ce4c083..70230e6bc71 100644
--- a/java/pom.xml
+++ b/java/pom.xml
@@ -21,7 +21,7 @@
   <groupId>ai.rapids</groupId>
   <artifactId>cudf</artifactId>
-  <version>24.06.0-SNAPSHOT</version>
+  <version>24.08.0-SNAPSHOT</version>
 
   <name>cudfjni</name>
diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml
index 826362f0632..1b7bb106d49 100644
--- a/python/cudf/pyproject.toml
+++ b/python/cudf/pyproject.toml
@@ -8,7 +8,7 @@ requires = [
     "ninja",
     "numpy==1.23.*",
     "pyarrow==16.0.0.*",
-    "rmm==24.6.*",
+    "rmm==24.8.*",
     "scikit-build-core[pyproject]>=0.7.0",
 ]  # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
@@ -36,7 +36,7 @@ dependencies = [
     "ptxcompiler",
     "pyarrow>=16.0.0,<16.1.0a0",
     "rich",
-    "rmm==24.6.*",
+    "rmm==24.8.*",
     "typing_extensions>=4.0.0",
 ]  # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 classifiers = [
diff --git a/python/cudf_kafka/pyproject.toml b/python/cudf_kafka/pyproject.toml
index 787dd8a97d7..b1bb4c5bd24 100644
--- a/python/cudf_kafka/pyproject.toml
+++ b/python/cudf_kafka/pyproject.toml
@@ -22,7 +22,7 @@ authors = [
 license = { text = "Apache 2.0" }
 requires-python = ">=3.9"
 dependencies = [
-    "cudf==24.6.*",
+    "cudf==24.8.*",
 ]  # This list was generated by `rapids-dependency-file-generator`.
To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ diff --git a/python/custreamz/pyproject.toml b/python/custreamz/pyproject.toml index 7786bf98bef..f7e5698900a 100644 --- a/python/custreamz/pyproject.toml +++ b/python/custreamz/pyproject.toml @@ -19,8 +19,8 @@ license = { text = "Apache 2.0" } requires-python = ">=3.9" dependencies = [ "confluent-kafka>=1.9.0,<1.10.0a0", - "cudf==24.6.*", - "cudf_kafka==24.6.*", + "cudf==24.8.*", + "cudf_kafka==24.8.*", "streamz", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ diff --git a/python/dask_cudf/pyproject.toml b/python/dask_cudf/pyproject.toml index 5fbdd98225e..e353eac06b9 100644 --- a/python/dask_cudf/pyproject.toml +++ b/python/dask_cudf/pyproject.toml @@ -18,12 +18,12 @@ authors = [ license = { text = "Apache 2.0" } requires-python = ">=3.9" dependencies = [ - "cudf==24.6.*", + "cudf==24.8.*", "cupy-cuda11x>=12.0.0", "fsspec>=0.6.0", "numpy>=1.23,<2.0a0", "pandas>=2.0,<2.2.3dev0", - "rapids-dask-dependency==24.6.*", + "rapids-dask-dependency==24.8.*", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ "Intended Audience :: Developers", @@ -44,7 +44,7 @@ cudf = "dask_cudf.backends:CudfDXBackendEntrypoint" [project.optional-dependencies] test = [ - "dask-cuda==24.6.*", + "dask-cuda==24.8.*", "numba>=0.57", "pytest-cov", "pytest-xdist", From 333718ac90b8d98e026aa57cfa0084af4c68a0f3 Mon Sep 17 00:00:00 2001 From: Paul Mattione <156858817+pmattione-nvidia@users.noreply.github.com> Date: Tue, 21 May 2024 14:31:55 -0400 Subject: [PATCH 002/340] For powers of 10, replace ipow with switch (#15353) This adds a new runtime calculation of the power-of-10 needed for applying decimal scale factors with a switch statement. This provides the fastest way of applying the scale. Note that the multiply and divide operations are performed within the switch itself, so that the compiler sees the full instruction to optimize assembly code gen. See code comments for details. This cannot be used within fixed_point (e.g. for comparison operators and rescaling) as it introduced too much register pressure to unrelated benchmarks. It will only be used for the decimal <--> floating conversion, so it has been moved there to be in a new header file where that code will reside (in an upcoming PR). This is part of a larger change to change the algorithm for decimal <--> floating conversion to a more accurate one that is forthcoming soon. Authors: - Paul Mattione (https://github.com/pmattione-nvidia) Approvers: - Mark Harris (https://github.com/harrism) - Muhammad Haseeb (https://github.com/mhaseeb123) URL: https://github.com/rapidsai/cudf/pull/15353 --- .../cudf/fixed_point/floating_conversion.hpp | 374 ++++++++++++++++++ 1 file changed, 374 insertions(+) create mode 100644 cpp/include/cudf/fixed_point/floating_conversion.hpp diff --git a/cpp/include/cudf/fixed_point/floating_conversion.hpp b/cpp/include/cudf/fixed_point/floating_conversion.hpp new file mode 100644 index 00000000000..492f7e75219 --- /dev/null +++ b/cpp/include/cudf/fixed_point/floating_conversion.hpp @@ -0,0 +1,374 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cudf/types.hpp>
+
+namespace numeric {
+
+/**
+ * @addtogroup floating_conversion
+ * @{
+ * @file
+ * @brief fixed_point <--> floating-point conversion functions.
+ */
+
+namespace detail {
+
+/**
+ * @brief Recursively calculate a large power of 10 (>= 10^19) that can only be stored in a
+ * 128bit integer
+ *
+ * @note Intended to be run at compile time.
+ *
+ * @tparam Exp10 The power of 10 to calculate
+ * @return Returns 10^Exp10
+ */
+template <int Exp10>
+constexpr __uint128_t large_power_of_10()
+{
+  // Stop at 10^19 to speed up compilation; literals can be used for smaller powers of 10.
+  static_assert(Exp10 >= 19);
+  if constexpr (Exp10 == 19)
+    return __uint128_t(10000000000000000000ULL);
+  else
+    return large_power_of_10<Exp10 - 1>() * __uint128_t(10);
+}
+
+/**
+ * @brief Divide by a power of 10 that fits within a 32bit integer.
+ *
+ * @tparam T Type of value to be divided-from.
+ * @param value The number to be divided-from.
+ * @param exp10 The power-of-10 of the denominator, from 0 to 9 inclusive.
+ * @return Returns value / 10^exp10
+ */
+template <typename T, typename std::enable_if_t<std::is_integral_v<T>>* = nullptr>
+CUDF_HOST_DEVICE inline T divide_power10_32bit(T value, int exp10)
+{
+  // Computing division this way is much faster than the alternatives.
+  // Division is not implemented in GPU hardware, and the compiler will often implement it as a
+  // multiplication of the reciprocal of the denominator, requiring a conversion to floating point.
+  // This is especially slow for larger divides that have to use the FP64 pipeline, where threads
+  // bottleneck.
+
+  // Instead, if the compiler can see exactly what number it is dividing by, it can
+  // produce much more optimal assembly, doing bit shifting, multiplies by a constant, etc.
+  // For the compiler to see the value though, array lookup (with exp10 as the index)
+  // is not sufficient: we have to use a switch statement. Although this introduces a branch,
+  // it is still much faster than doing the divide any other way.
+  // Perhaps an array can be used in C++23 with the assume attribute?
+
+  // Since we're optimizing division this way, we have to do this for multiplication as well.
+  // That's because doing them in different ways (switch, array, runtime-computation, etc.)
+  // increases the register pressure on all kernels that use fixed_point types, specifically slowing
+  // down some of the PYMOD and join benchmarks.
+
+  // This is split up into separate functions for 32-, 64-, and 128-bit denominators.
+  // That way we limit the templated, inlined code generation to the exponents that are
+  // capable of being represented. Combining them together into a single function again
+  // introduces too much pressure on the kernels that use this code, slowing down their benchmarks.
+  // It also dramatically slows down the compile time.
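+
+  // Worked example: divide_power10_32bit(123456u, 3) takes the `case 3` branch below and
+  // returns 123456u / 1000u == 123u, so the divide compiles against a literal constant.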
+  switch (exp10) {
+    case 0: return value;
+    case 1: return value / 10U;
+    case 2: return value / 100U;
+    case 3: return value / 1000U;
+    case 4: return value / 10000U;
+    case 5: return value / 100000U;
+    case 6: return value / 1000000U;
+    case 7: return value / 10000000U;
+    case 8: return value / 100000000U;
+    case 9: return value / 1000000000U;
+    default: return 0;
+  }
+}
+
+/**
+ * @brief Divide by a power of 10 that fits within a 64bit integer.
+ *
+ * @tparam T Type of value to be divided-from.
+ * @param value The number to be divided-from.
+ * @param exp10 The power-of-10 of the denominator, from 0 to 19 inclusive.
+ * @return Returns value / 10^exp10
+ */
+template <typename T, typename std::enable_if_t<std::is_integral_v<T>>* = nullptr>
+CUDF_HOST_DEVICE inline T divide_power10_64bit(T value, int exp10)
+{
+  // See comments in divide_power10_32bit() for discussion.
+  switch (exp10) {
+    case 0: return value;
+    case 1: return value / 10U;
+    case 2: return value / 100U;
+    case 3: return value / 1000U;
+    case 4: return value / 10000U;
+    case 5: return value / 100000U;
+    case 6: return value / 1000000U;
+    case 7: return value / 10000000U;
+    case 8: return value / 100000000U;
+    case 9: return value / 1000000000U;
+    case 10: return value / 10000000000ULL;
+    case 11: return value / 100000000000ULL;
+    case 12: return value / 1000000000000ULL;
+    case 13: return value / 10000000000000ULL;
+    case 14: return value / 100000000000000ULL;
+    case 15: return value / 1000000000000000ULL;
+    case 16: return value / 10000000000000000ULL;
+    case 17: return value / 100000000000000000ULL;
+    case 18: return value / 1000000000000000000ULL;
+    case 19: return value / 10000000000000000000ULL;
+    default: return 0;
+  }
+}
+
+/**
+ * @brief Divide by a power of 10 that fits within a 128bit integer.
+ *
+ * @tparam T Type of value to be divided-from.
+ * @param value The number to be divided-from.
+ * @param exp10 The power-of-10 of the denominator, from 0 to 38 inclusive.
+ * @return Returns value / 10^exp10.
+ */
+template <typename T, typename std::enable_if_t<std::is_integral_v<T>>* = nullptr>
+CUDF_HOST_DEVICE inline constexpr T divide_power10_128bit(T value, int exp10)
+{
+  // See comments in divide_power10_32bit() for an introduction.
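+
+  // Worked example: for exp10 == 20 the `case 20` branch below divides by
+  // large_power_of_10<20>() == 10^20; the constexpr call folds to a 128-bit constant.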
+  switch (exp10) {
+    case 0: return value;
+    case 1: return value / 10U;
+    case 2: return value / 100U;
+    case 3: return value / 1000U;
+    case 4: return value / 10000U;
+    case 5: return value / 100000U;
+    case 6: return value / 1000000U;
+    case 7: return value / 10000000U;
+    case 8: return value / 100000000U;
+    case 9: return value / 1000000000U;
+    case 10: return value / 10000000000ULL;
+    case 11: return value / 100000000000ULL;
+    case 12: return value / 1000000000000ULL;
+    case 13: return value / 10000000000000ULL;
+    case 14: return value / 100000000000000ULL;
+    case 15: return value / 1000000000000000ULL;
+    case 16: return value / 10000000000000000ULL;
+    case 17: return value / 100000000000000000ULL;
+    case 18: return value / 1000000000000000000ULL;
+    case 19: return value / 10000000000000000000ULL;
+    case 20: return value / large_power_of_10<20>();
+    case 21: return value / large_power_of_10<21>();
+    case 22: return value / large_power_of_10<22>();
+    case 23: return value / large_power_of_10<23>();
+    case 24: return value / large_power_of_10<24>();
+    case 25: return value / large_power_of_10<25>();
+    case 26: return value / large_power_of_10<26>();
+    case 27: return value / large_power_of_10<27>();
+    case 28: return value / large_power_of_10<28>();
+    case 29: return value / large_power_of_10<29>();
+    case 30: return value / large_power_of_10<30>();
+    case 31: return value / large_power_of_10<31>();
+    case 32: return value / large_power_of_10<32>();
+    case 33: return value / large_power_of_10<33>();
+    case 34: return value / large_power_of_10<34>();
+    case 35: return value / large_power_of_10<35>();
+    case 36: return value / large_power_of_10<36>();
+    case 37: return value / large_power_of_10<37>();
+    case 38: return value / large_power_of_10<38>();
+    default: return 0;
+  }
+}
+
+/**
+ * @brief Multiply by a power of 10 that fits within a 32bit integer.
+ *
+ * @tparam T Type of value to be multiplied.
+ * @param value The number to be multiplied.
+ * @param exp10 The power-of-10 of the multiplier, from 0 to 9 inclusive.
+ * @return Returns value * 10^exp10
+ */
+template <typename T, typename std::enable_if_t<std::is_integral_v<T>>* = nullptr>
+CUDF_HOST_DEVICE inline constexpr T multiply_power10_32bit(T value, int exp10)
+{
+  // See comments in divide_power10_32bit() for discussion.
+  switch (exp10) {
+    case 0: return value;
+    case 1: return value * 10U;
+    case 2: return value * 100U;
+    case 3: return value * 1000U;
+    case 4: return value * 10000U;
+    case 5: return value * 100000U;
+    case 6: return value * 1000000U;
+    case 7: return value * 10000000U;
+    case 8: return value * 100000000U;
+    case 9: return value * 1000000000U;
+    default: return 0;
+  }
+}
+
+/**
+ * @brief Multiply by a power of 10 that fits within a 64bit integer.
+ *
+ * @tparam T Type of value to be multiplied.
+ * @param value The number to be multiplied.
+ * @param exp10 The power-of-10 of the multiplier, from 0 to 19 inclusive.
+ * @return Returns value * 10^exp10
+ */
+template <typename T, typename std::enable_if_t<std::is_integral_v<T>>* = nullptr>
+CUDF_HOST_DEVICE inline constexpr T multiply_power10_64bit(T value, int exp10)
+{
+  // See comments in divide_power10_32bit() for discussion.
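+
+  // Worked example: multiply_power10_64bit(9ULL, 18) takes the `case 18` branch below and
+  // returns 9 * 10^18 == 9000000000000000000ULL; callers must guard against overflow.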
+  switch (exp10) {
+    case 0: return value;
+    case 1: return value * 10U;
+    case 2: return value * 100U;
+    case 3: return value * 1000U;
+    case 4: return value * 10000U;
+    case 5: return value * 100000U;
+    case 6: return value * 1000000U;
+    case 7: return value * 10000000U;
+    case 8: return value * 100000000U;
+    case 9: return value * 1000000000U;
+    case 10: return value * 10000000000ULL;
+    case 11: return value * 100000000000ULL;
+    case 12: return value * 1000000000000ULL;
+    case 13: return value * 10000000000000ULL;
+    case 14: return value * 100000000000000ULL;
+    case 15: return value * 1000000000000000ULL;
+    case 16: return value * 10000000000000000ULL;
+    case 17: return value * 100000000000000000ULL;
+    case 18: return value * 1000000000000000000ULL;
+    case 19: return value * 10000000000000000000ULL;
+    default: return 0;
+  }
+}
+
+/**
+ * @brief Multiply by a power of 10 that fits within a 128bit integer.
+ *
+ * @tparam T Type of value to be multiplied.
+ * @param value The number to be multiplied.
+ * @param exp10 The power-of-10 of the multiplier, from 0 to 38 inclusive.
+ * @return Returns value * 10^exp10.
+ */
+template <typename T, typename std::enable_if_t<std::is_integral_v<T>>* = nullptr>
+CUDF_HOST_DEVICE inline constexpr T multiply_power10_128bit(T value, int exp10)
+{
+  // See comments in divide_power10_128bit() for discussion.
+  switch (exp10) {
+    case 0: return value;
+    case 1: return value * 10U;
+    case 2: return value * 100U;
+    case 3: return value * 1000U;
+    case 4: return value * 10000U;
+    case 5: return value * 100000U;
+    case 6: return value * 1000000U;
+    case 7: return value * 10000000U;
+    case 8: return value * 100000000U;
+    case 9: return value * 1000000000U;
+    case 10: return value * 10000000000ULL;
+    case 11: return value * 100000000000ULL;
+    case 12: return value * 1000000000000ULL;
+    case 13: return value * 10000000000000ULL;
+    case 14: return value * 100000000000000ULL;
+    case 15: return value * 1000000000000000ULL;
+    case 16: return value * 10000000000000000ULL;
+    case 17: return value * 100000000000000000ULL;
+    case 18: return value * 1000000000000000000ULL;
+    case 19: return value * 10000000000000000000ULL;
+    case 20: return value * large_power_of_10<20>();
+    case 21: return value * large_power_of_10<21>();
+    case 22: return value * large_power_of_10<22>();
+    case 23: return value * large_power_of_10<23>();
+    case 24: return value * large_power_of_10<24>();
+    case 25: return value * large_power_of_10<25>();
+    case 26: return value * large_power_of_10<26>();
+    case 27: return value * large_power_of_10<27>();
+    case 28: return value * large_power_of_10<28>();
+    case 29: return value * large_power_of_10<29>();
+    case 30: return value * large_power_of_10<30>();
+    case 31: return value * large_power_of_10<31>();
+    case 32: return value * large_power_of_10<32>();
+    case 33: return value * large_power_of_10<33>();
+    case 34: return value * large_power_of_10<34>();
+    case 35: return value * large_power_of_10<35>();
+    case 36: return value * large_power_of_10<36>();
+    case 37: return value * large_power_of_10<37>();
+    case 38: return value * large_power_of_10<38>();
+    default: return 0;
+  }
+}
+
+/**
+ * @brief Multiply an integer by a power of 10.
+ *
+ * @note Use this function if you have no a-priori knowledge of what exp10 might be.
+ * If you do, prefer calling the bit-size-specific versions
+ *
+ * @tparam Rep Representation type needed for integer exponentiation
+ * @tparam T Integral type of value to be multiplied.
+ * @param value The number to be multiplied.
+ * @param exp10 The power-of-10 of the multiplier.
+ * @return Returns value * 10^exp10
+ */
+template <typename Rep,
+          typename T,
+          typename std::enable_if_t<(std::is_integral_v<T>)>* = nullptr>
+CUDF_HOST_DEVICE inline constexpr T multiply_power10(T value, int exp10)
+{
+  // Use this function if you have no knowledge of what exp10 might be
+  // If you do, prefer calling the bit-size-specific versions
+  if constexpr (sizeof(Rep) <= 4) {
+    return multiply_power10_32bit(value, exp10);
+  } else if constexpr (sizeof(Rep) <= 8) {
+    return multiply_power10_64bit(value, exp10);
+  } else {
+    return multiply_power10_128bit(value, exp10);
+  }
+}
+
+/**
+ * @brief Divide an integer by a power of 10.
+ *
+ * @note Use this function if you have no a-priori knowledge of what exp10 might be.
+ * If you do, prefer calling the bit-size-specific versions
+ *
+ * @tparam Rep Representation type needed for integer exponentiation
+ * @tparam T Integral type of value to be divided-from.
+ * @param value The number to be divided-from.
+ * @param exp10 The power-of-10 of the denominator.
+ * @return Returns value / 10^exp10
+ */
+template <typename Rep,
+          typename T,
+          typename std::enable_if_t<(std::is_integral_v<T>)>* = nullptr>
+CUDF_HOST_DEVICE inline constexpr T divide_power10(T value, int exp10)
+{
+  // Use this function if you have no knowledge of what exp10 might be
+  // If you do, prefer calling the bit-size-specific versions
+  if constexpr (sizeof(Rep) <= 4) {
+    return divide_power10_32bit(value, exp10);
+  } else if constexpr (sizeof(Rep) <= 8) {
+    return divide_power10_64bit(value, exp10);
+  } else {
+    return divide_power10_128bit(value, exp10);
+  }
+}
+
+} // namespace detail
+
+/** @} */ // end of group
+} // namespace numeric

From 24320a18563f1defd8bf7a164adebc066f8c7135 Mon Sep 17 00:00:00 2001
From: nvdbaranec <56695930+nvdbaranec@users.noreply.github.com>
Date: Wed, 22 May 2024 12:01:24 -0500
Subject: [PATCH 003/340] Switch cuIO benchmarks to use pinned-pool host allocations by default. (#15805)

Previously, the benchmarks used a non-pooled pinned memory allocator by default, and exposed an option to use an internally-declared pooled pinned allocator. Now that we have a pooled pinned allocator enabled in cuIO itself, this PR switches to using that as the new default for the benchmarks.

Authors:
  - https://github.com/nvdbaranec

Approvers:
  - Mike Wilson (https://github.com/hyperbolic2346)
  - Nghia Truong (https://github.com/ttnghia)
  - Yunsong Wang (https://github.com/PointKernel)
  - Mark Harris (https://github.com/harrism)

URL: https://github.com/rapidsai/cudf/pull/15805
---
 cpp/benchmarks/fixture/nvbench_fixture.hpp | 19 ++-----------------
 1 file changed, 2 insertions(+), 17 deletions(-)

diff --git a/cpp/benchmarks/fixture/nvbench_fixture.hpp b/cpp/benchmarks/fixture/nvbench_fixture.hpp
index ac0cab4071b..ebcbcb17e98 100644
--- a/cpp/benchmarks/fixture/nvbench_fixture.hpp
+++ b/cpp/benchmarks/fixture/nvbench_fixture.hpp
@@ -45,8 +45,6 @@ static std::string cuio_host_mem_param{
 * Initializes the default memory resource to use the RMM pool device resource.
 */
 struct nvbench_base_fixture {
-  using host_pooled_mr_t = rmm::mr::pool_memory_resource<rmm::mr::pinned_host_memory_resource>;
-
   inline auto make_cuda() { return std::make_shared<rmm::mr::cuda_memory_resource>(); }
 
   inline auto make_pool()
@@ -90,22 +88,10 @@ struct nvbench_base_fixture {
     return *mr;
   }
 
-  inline rmm::host_async_resource_ref make_cuio_host_pinned_pool()
-  {
-    if (!this->host_pooled_mr) {
-      // Don't store in static, as the CUDA context may be destroyed before static destruction
-      this->host_pooled_mr = std::make_shared<host_pooled_mr_t>(
-        std::make_shared<rmm::mr::pinned_host_memory_resource>().get(),
-        size_t{1} * 1024 * 1024 * 1024);
-    }
-
-    return *this->host_pooled_mr;
-  }
-
   inline rmm::host_async_resource_ref create_cuio_host_memory_resource(std::string const& mode)
   {
     if (mode == "pinned") return make_cuio_host_pinned();
-    if (mode == "pinned_pool") return make_cuio_host_pinned_pool();
+    if (mode == "pinned_pool") return cudf::io::get_host_memory_resource();
     CUDF_FAIL("Unknown cuio_host_mem parameter: " + mode + "\nExpecting: pinned or pinned_pool");
   }
 
@@ -139,8 +125,7 @@ struct nvbench_base_fixture {
   std::shared_ptr<rmm::mr::device_memory_resource> mr;
   std::string rmm_mode{"pool"};
 
-  std::shared_ptr<host_pooled_mr_t> host_pooled_mr;
-  std::string cuio_host_mode{"pinned"};
+  std::string cuio_host_mode{"pinned_pool"};
 };
 
 }  // namespace cudf

From 9d8e43ef6ad75f6babc08fea88642ea006822e04 Mon Sep 17 00:00:00 2001
From: Bradley Dice
Date: Thu, 23 May 2024 11:41:49 -0400
Subject: [PATCH 004/340] Remove legacy JSON reader and concurrent_unordered_map.cuh. (#15813)

This completes the final two steps and closes https://github.com/rapidsai/cudf/issues/15537. Also addresses one step of https://github.com/rapidsai/cudf/issues/12261.

Authors:
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - Kyle Edwards (https://github.com/KyleFromNVIDIA)
  - David Wendt (https://github.com/davidwendt)
  - Shruti Shivakumar (https://github.com/shrshi)
  - Matthew Roeschke (https://github.com/mroeschke)

URL: https://github.com/rapidsai/cudf/pull/15813
---
 cpp/CMakeLists.txt                                  |   2 -
 cpp/include/cudf/io/json.hpp                        |  32 -
 cpp/src/groupby/hash/groupby.cu                     |   1 -
 cpp/src/hash/concurrent_unordered_map.cuh           | 557 ---------------
 cpp/src/hash/managed.cuh                            |  41 --
 cpp/src/io/json/legacy/json_gpu.cu                  | 615 ----------------
 cpp/src/io/json/legacy/json_gpu.hpp                 |  99 ---
 cpp/src/io/json/legacy/read_json.hpp                |  38 -
 cpp/src/io/json/legacy/reader_impl.cu               | 667 ------------------
 cpp/src/io/json/read_json.cu                        |   9 -
 cpp/tests/CMakeLists.txt                            |   4 -
 cpp/tests/hash_map/map_test.cu                      | 217 ------
 cpp/tests/io/json_test.cpp                          |  49 +-
 cpp/tests/io/nested_json_test.cpp                   |   2 +-
 python/cudf/cudf/_lib/json.pyx                      |   2 -
 python/cudf/cudf/_lib/pylibcudf/libcudf/io/json.pxd |   3 -
 python/cudf/cudf/io/json.py                         |   1 -
 17 files changed, 8 insertions(+), 2331 deletions(-)

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 7390c465ccb..228d21ddccb 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -390,8 +390,6 @@ add_library(
   src/io/json/json_tree.cu
   src/io/json/nested_json_gpu.cu
   src/io/json/read_json.cu
-  src/io/json/legacy/json_gpu.cu
-  src/io/json/legacy/reader_impl.cu
   src/io/json/parser_features.cpp
   src/io/json/write_json.cu
   src/io/orc/aggregate_orc_metadata.cpp
diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp
index aa4bee4fb5e..65ba8f25577 100644
--- a/cpp/include/cudf/io/json.hpp
+++ b/cpp/include/cudf/io/json.hpp
@@ -270,15 +270,6 @@ class json_reader_options {
    */
   bool is_enabled_dayfirst() const { return _dayfirst; }
 
-  /**
-   * @brief Whether the legacy reader should be used.
-   *
-   * @deprecated Since 24.06
-   *
-   * @returns true if the legacy reader will be used, false otherwise
-   */
-  [[deprecated]] bool is_enabled_legacy() const { return _legacy; }
-
   /**
    * @brief Whether the reader should keep quotes of string values.
    *
@@ -406,15 +397,6 @@ class json_reader_options {
    */
   void enable_dayfirst(bool val) { _dayfirst = val; }
 
-  /**
-   * @brief Set whether to use the legacy reader.
-   *
-   * @deprecated Since 24.06
-   *
-   * @param val Boolean value to enable/disable the legacy reader
-   */
-  [[deprecated]] void enable_legacy(bool val) { _legacy = val; }
-
   /**
    * @brief Set whether the reader should keep quotes of string values.
    *
@@ -605,20 +587,6 @@ class json_reader_options_builder {
     return *this;
   }
 
-  /**
-   * @brief Set whether to use the legacy reader.
-   *
-   * @deprecated Since 24.06
-   *
-   * @param val Boolean value to enable/disable legacy parsing
-   * @return this for chaining
-   */
-  [[deprecated]] json_reader_options_builder& legacy(bool val)
-  {
-    options._legacy = val;
-    return *this;
-  }
-
   /**
    * @brief Set whether the reader should keep quotes of string values.
    *
diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu
index 4f75ab19c66..0ec293ae3f0 100644
--- a/cpp/src/groupby/hash/groupby.cu
+++ b/cpp/src/groupby/hash/groupby.cu
@@ -16,7 +16,6 @@
 
 #include "groupby/common/utils.hpp"
 #include "groupby/hash/groupby_kernels.cuh"
-#include "hash/concurrent_unordered_map.cuh"
 
 #include
 #include
diff --git a/cpp/src/hash/concurrent_unordered_map.cuh b/cpp/src/hash/concurrent_unordered_map.cuh
deleted file mode 100644
index a010a462de3..00000000000
--- a/cpp/src/hash/concurrent_unordered_map.cuh
+++ /dev/null
@@ -1,557 +0,0 @@
-/*
- * Copyright (c) 2017-2024, NVIDIA CORPORATION. All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include "hash/managed.cuh"
-
-#include
-#include
-#include
-#include
-#include
-
-#include
-#include
-
-#include
-#include
-
-#include
-#include
-#include
-#include
-
-namespace {
-template <std::size_t N>
-struct packed {
-  using type = void;
-};
-template <>
-struct packed<sizeof(uint64_t)> {
-  using type = uint64_t;
-};
-template <>
-struct packed<sizeof(uint32_t)> {
-  using type = uint32_t;
-};
-template <typename pair_type>
-using packed_t = typename packed<sizeof(pair_type)>::type;
-
-/**
- * @brief Indicates if a pair type can be packed.
- *
- * When the size of the key,value pair being inserted into the hash table is
- * equal in size to a type where atomicCAS is natively supported, it is more
- * efficient to "pack" the pair and insert it with a single atomicCAS.
- *
- * Only integral key and value types may be packed because we use
- * bitwise equality comparison, which may not be valid for non-integral
- * types.
- *
- * Also, the `pair_type` must not contain any padding bits otherwise
- * accessing the packed value would be undefined.
- *
- * @tparam pair_type The pair type that will be packed
- * @return true If the pair type can be packed
- * @return false If the pair type cannot be packed
- */
-template <typename pair_type>
-constexpr bool is_packable()
-{
-  return std::is_integral_v<typename pair_type::first_type> and
-         std::is_integral_v<typename pair_type::second_type> and
-         not std::is_void_v<packed_t<pair_type>> and
-         std::has_unique_object_representations_v<pair_type>;
-}
-
-/**
- * @brief Allows viewing a pair in a packed representation
- *
- * Used as an optimization for inserting when a pair can be inserted with a
- * single atomicCAS
- */
-template <typename pair_type, typename Enable = void>
-union pair_packer;
-
-template <typename pair_type>
-union pair_packer<pair_type, std::enable_if_t<is_packable<pair_type>()>> {
-  using packed_type = packed_t<pair_type>;
-  packed_type packed;
-  pair_type pair;
-
-  __device__ pair_packer(pair_type _pair) : pair{_pair} {}
-
-  __device__ pair_packer(packed_type _packed) : packed{_packed} {}
-};
-}  // namespace
-
-/**
- * Supports concurrent insert, but not concurrent insert and find.
- *
- * @note The user is responsible for the following stream semantics:
- * - Either the same stream should be used to create the map as is used by the kernels that access
- * it, or
- * - the stream used to create the map should be synchronized before it is accessed from a different
- * stream or from host code.
- *
- * TODO:
- * - add constructor that takes pointer to hash_table to avoid allocations
- */
-template <typename Key,
-          typename Element,
-          typename Hasher    = default_hash<Key>,
-          typename Equality  = equal_to<Key>,
-          typename Allocator = rmm::mr::polymorphic_allocator<thrust::pair<Key, Element>>>
-class concurrent_unordered_map {
- public:
-  using size_type      = size_t;
-  using hasher         = Hasher;
-  using key_equal      = Equality;
-  using allocator_type = Allocator;
-  using key_type       = Key;
-  using mapped_type    = Element;
-  using value_type     = thrust::pair<Key, Element>;
-  using iterator       = cycle_iterator_adapter<value_type*>;
-  using const_iterator = cycle_iterator_adapter<value_type*> const;
-
- public:
-  /**
-   * @brief Factory to construct a new concurrent unordered map.
-   *
-   * Returns a `std::unique_ptr` to a new concurrent unordered map object. The
-   * map is non-owning and trivially copyable and should be passed by value into
-   * kernels. The `unique_ptr` contains a custom deleter that will free the
-   * map's contents.
-   *
-   * @note The implementation of this unordered_map uses sentinel values to
-   * indicate an entry in the hash table that is empty, i.e., if a hash bucket
-   * is empty, the pair residing there will be equal to (unused_key,
-   * unused_element). As a result, attempting to insert a key equal to
-   * `unused_key` results in undefined behavior.
-   *
-   * @note All allocations, kernels and copies in the constructor take place
-   * on stream but the constructor does not synchronize the stream. It is the user's
-   * responsibility to synchronize or use the same stream to access the map.
-   *
-   * @param capacity The maximum number of pairs the map may hold
-   * @param stream CUDA stream used for device memory operations and kernel launches.
-   * @param unused_element The sentinel value to use for an empty value
-   * @param unused_key The sentinel value to use for an empty key
-   * @param hash_function The hash function to use for hashing keys
-   * @param equal The equality comparison function for comparing if two keys are
-   * equal
-   * @param allocator The allocator to use for allocation the hash table's
-   * storage
-   */
-  static auto create(size_type capacity,
-                     rmm::cuda_stream_view stream,
-                     mapped_type const unused_element = std::numeric_limits<mapped_type>::max(),
-                     key_type const unused_key        = std::numeric_limits<key_type>::max(),
-                     Hasher const& hash_function      = hasher(),
-                     Equality const& equal            = key_equal(),
-                     allocator_type const& allocator  = allocator_type())
-  {
-    CUDF_FUNC_RANGE();
-    using Self = concurrent_unordered_map<Key, Element, Hasher, Equality, Allocator>;
-
-    // Note: need `(*p).destroy` instead of `p->destroy` here
-    // due to compiler bug: https://github.com/rapidsai/cudf/pull/5692
-    auto deleter = [stream](Self* p) { (*p).destroy(stream); };
-
-    return std::unique_ptr<Self, std::function<void(Self*)>>{
-      new Self(capacity, unused_element, unused_key, hash_function, equal, allocator, stream),
-      deleter};
-  }
-
-  /**
-   * @brief Returns an iterator to the first element in the map
-   *
-   * @note `__device__` code that calls this function should either run in the
-   * same stream as `create()`, or the accessing stream either be running on the
-   * same stream as create(), or the accessing stream should be appropriately
-   * synchronized with the creating stream.
-   *
-   * @returns iterator to the first element in the map.
-   */
-  __device__ iterator begin()
-  {
-    return iterator(m_hashtbl_values, m_hashtbl_values + m_capacity, m_hashtbl_values);
-  }
-
-  /**
-   * @brief Returns a constant iterator to the first element in the map
-   *
-   * @note `__device__` code that calls this function should either run in the
-   * same stream as `create()`, or the accessing stream either be running on the
-   * same stream as create(), or the accessing stream should be appropriately
-   * synchronized with the creating stream.
-   *
-   * @returns constant iterator to the first element in the map.
-   */
-  __device__ const_iterator begin() const
-  {
-    return const_iterator(m_hashtbl_values, m_hashtbl_values + m_capacity, m_hashtbl_values);
-  }
-
-  /**
-   * @brief Returns an iterator to the one past the last element in the map
-   *
-   * @note `__device__` code that calls this function should either run in the
-   * same stream as `create()`, or the accessing stream either be running on the
-   * same stream as create(), or the accessing stream should be appropriately
-   * synchronized with the creating stream.
-   *
-   * @returns iterator to the one past the last element in the map.
-   */
-  __device__ iterator end()
-  {
-    return iterator(m_hashtbl_values, m_hashtbl_values + m_capacity, m_hashtbl_values + m_capacity);
-  }
-
-  /**
-   * @brief Returns a constant iterator to the one past the last element in the map
-   *
-   * @note When called in a device code, user should make sure that it should
-   * either be running on the same stream as create(), or the accessing stream
-   * should be appropriately synchronized with the creating stream.
-
-  /**
-   * @brief Returns an iterator to the first element in the map
-   *
-   * @note `__device__` code that calls this function should either run on the
-   * same stream as `create()`, or the accessing stream should be appropriately
-   * synchronized with the creating stream.
-   *
-   * @returns iterator to the first element in the map.
-   */
-  __device__ iterator begin()
-  {
-    return iterator(m_hashtbl_values, m_hashtbl_values + m_capacity, m_hashtbl_values);
-  }
-
-  /**
-   * @brief Returns a constant iterator to the first element in the map
-   *
-   * @note `__device__` code that calls this function should either run on the
-   * same stream as `create()`, or the accessing stream should be appropriately
-   * synchronized with the creating stream.
-   *
-   * @returns constant iterator to the first element in the map.
-   */
-  __device__ const_iterator begin() const
-  {
-    return const_iterator(m_hashtbl_values, m_hashtbl_values + m_capacity, m_hashtbl_values);
-  }
-
-  /**
-   * @brief Returns an iterator to the one past the last element in the map
-   *
-   * @note `__device__` code that calls this function should either run on the
-   * same stream as `create()`, or the accessing stream should be appropriately
-   * synchronized with the creating stream.
-   *
-   * @returns iterator to the one past the last element in the map.
-   */
-  __device__ iterator end()
-  {
-    return iterator(m_hashtbl_values, m_hashtbl_values + m_capacity, m_hashtbl_values + m_capacity);
-  }
-
-  /**
-   * @brief Returns a constant iterator to the one past the last element in the map
-   *
-   * @note `__device__` code that calls this function should either run on the
-   * same stream as `create()`, or the accessing stream should be appropriately
-   * synchronized with the creating stream.
-   *
-   * @returns constant iterator to the one past the last element in the map.
-   */
-  __device__ const_iterator end() const
-  {
-    return const_iterator(
-      m_hashtbl_values, m_hashtbl_values + m_capacity, m_hashtbl_values + m_capacity);
-  }
-  __host__ __device__ value_type* data() const { return m_hashtbl_values; }
-
-  __host__ __device__ key_type get_unused_key() const { return m_unused_key; }
-
-  __host__ __device__ mapped_type get_unused_element() const { return m_unused_element; }
-
-  [[nodiscard]] __host__ __device__ size_type capacity() const { return m_capacity; }
-
- private:
-  /**
-   * @brief Enumeration of the possible results of attempting to insert into
-   * a hash bucket
-   */
-  enum class insert_result {
-    CONTINUE,  ///< Insert did not succeed, continue trying to insert
-               ///< (collision)
-    SUCCESS,   ///< New pair inserted successfully
-    DUPLICATE  ///< Insert did not succeed, key is already present
-  };
-
-  /**
-   * @brief Specialization for value types that can be packed.
-   *
-   * When the size of the key,value pair being inserted is equal in size to
-   * a type where atomicCAS is natively supported, this optimization path
-   * will insert the pair in a single atomicCAS operation.
-   */
-  template <typename pair_type = value_type>
-  __device__ std::enable_if_t<is_packable<pair_type>(), insert_result> attempt_insert(
-    value_type* const __restrict__ insert_location, value_type const& insert_pair)
-  {
-    pair_packer<pair_type> expected{thrust::make_pair(m_unused_key, m_unused_element)};
-    pair_packer<pair_type> desired{insert_pair};
-
-    using packed_type = typename pair_packer<pair_type>::packed_type;
-
-    auto* insert_ptr = reinterpret_cast<packed_type*>(insert_location);
-    cuda::atomic_ref<packed_type, cuda::thread_scope_device> ref{*insert_ptr};
-    auto const success =
-      ref.compare_exchange_strong(expected.packed, desired.packed, cuda::std::memory_order_relaxed);
-
-    if (success) {
-      return insert_result::SUCCESS;
-    } else if (m_equal(expected.pair.first, insert_pair.first)) {
-      return insert_result::DUPLICATE;
-    }
-    return insert_result::CONTINUE;
-  }
-
-  /**
-   * @brief Attempts to insert a key,value pair at the specified hash bucket.
-   *
-   * @param[in] insert_location Pointer to hash bucket to attempt insert
-   * @param[in] insert_pair The pair to insert
-   * @return Enum indicating result of insert attempt.
-   */
-  template <typename pair_type = value_type>
-  __device__ std::enable_if_t<not is_packable<pair_type>(), insert_result> attempt_insert(
-    value_type* const __restrict__ insert_location, value_type const& insert_pair)
-  {
-    auto expected = m_unused_key;
-    cuda::atomic_ref<key_type, cuda::thread_scope_device> ref{insert_location->first};
-    auto const key_success =
-      ref.compare_exchange_strong(expected, insert_pair.first, cuda::std::memory_order_relaxed);
-
-    // Hash bucket empty
-    if (key_success) {
-      insert_location->second = insert_pair.second;
-      return insert_result::SUCCESS;
-    }
-    // Key already exists
-    else if (m_equal(expected, insert_pair.first)) {
-      return insert_result::DUPLICATE;
-    }
-
-    return insert_result::CONTINUE;
-  }
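The packed path above hinges on one detail: the sentinel pair and the incoming pair are compared and swapped as a single 64-bit word. A minimal CUDA sketch of that idea under hypothetical names (`pair32`, `claim_bucket`; requires libcu++, not the map's actual code):

```cpp
#include <cuda/atomic>

#include <cstdint>

// Sketch of the packed insert path: key and value are CAS'd together, so no
// thread can ever observe a bucket holding a half-written pair.
struct alignas(8) pair32 {  // alignas(8) so the bucket can be viewed as one 64-bit word
  uint32_t key;
  uint32_t value;
};

union packer {  // same viewing trick as pair_packer above
  pair32 pair;
  unsigned long long packed;
};

__device__ bool claim_bucket(pair32* bucket, pair32 desired, pair32 empty_sentinel)
{
  packer expected{empty_sentinel};
  packer wanted{desired};
  cuda::atomic_ref<unsigned long long, cuda::thread_scope_device> ref{
    *reinterpret_cast<unsigned long long*>(bucket)};
  // Succeeds only while the bucket still holds the sentinel pair; on failure,
  // `expected.pair` is refilled with whatever pair occupies the bucket.
  return ref.compare_exchange_strong(
    expected.packed, wanted.packed, cuda::std::memory_order_relaxed);
}
```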
-
- public:
-  /**
-   * @brief Attempts to insert a key, value pair into the map.
-   *
-   * Returns an iterator, boolean pair.
-   *
-   * If the new key is already present in the map, the iterator points to
-   * the location of the existing key and the boolean is `false` indicating
-   * that the insert did not succeed.
-   *
-   * If the new key was not present, the iterator points to the location
-   * where the insert occurred and the boolean is `true` indicating that the
-   * insert succeeded.
-   *
-   * @param insert_pair The key and value pair to insert
-   * @return Iterator, Boolean pair. Iterator is to the location of the
-   * newly inserted pair, or the existing pair that prevented the insert.
-   * Boolean indicates insert success.
-   */
-  __device__ thrust::pair<iterator, bool> insert(value_type const& insert_pair)
-  {
-    size_type const key_hash{m_hf(insert_pair.first)};
-    size_type index{key_hash % m_capacity};
-
-    insert_result status{insert_result::CONTINUE};
-
-    value_type* current_bucket{nullptr};
-
-    while (status == insert_result::CONTINUE) {
-      current_bucket = &m_hashtbl_values[index];
-      status         = attempt_insert(current_bucket, insert_pair);
-      index          = (index + 1) % m_capacity;
-    }
-
-    bool const insert_success = status == insert_result::SUCCESS;
-
-    return thrust::make_pair(
-      iterator(m_hashtbl_values, m_hashtbl_values + m_capacity, current_bucket), insert_success);
-  }
-
-  /**
-   * @brief Searches the map for the specified key.
-   *
-   * @note `find` is not threadsafe with `insert`. I.e., it is not safe to
-   * do concurrent `insert` and `find` operations.
-   *
-   * @param k The key to search for
-   * @return An iterator to the key if it exists, else map.end()
-   */
-  __device__ const_iterator find(key_type const& k) const
-  {
-    size_type const key_hash = m_hf(k);
-    size_type index          = key_hash % m_capacity;
-
-    value_type* current_bucket = &m_hashtbl_values[index];
-
-    while (true) {
-      key_type const existing_key = current_bucket->first;
-
-      if (m_unused_key == existing_key) { return this->end(); }
-
-      if (m_equal(k, existing_key)) {
-        return const_iterator(m_hashtbl_values, m_hashtbl_values + m_capacity, current_bucket);
-      }
-
-      index          = (index + 1) % m_capacity;
-      current_bucket = &m_hashtbl_values[index];
-    }
-  }
-
-  /**
-   * @brief Searches the map for the specified key.
-   *
-   * This version of the find function specifies a hashing function and an
-   * equality comparison. This allows the caller to use different functions
-   * for insert and find (for example, when you want to insert keys from
-   * one table and use find to match keys from a different table with the
-   * keys from the first table).
-   *
-   * @note `find` is not threadsafe with `insert`. I.e., it is not safe to
-   * do concurrent `insert` and `find` operations.
-   *
-   * @tparam find_hasher Type of hashing function
-   * @tparam find_key_equal Type of equality comparison
-   *
-   * @param k The key to search for
-   * @param f_hash The hashing function to use to hash this key
-   * @param f_equal The equality function to use to compare this key with the
-   * contents of the hash table
-   * @return An iterator to the key if it exists, else map.end()
-   */
-  template <typename find_hasher, typename find_key_equal>
-  __device__ const_iterator find(key_type const& k,
-                                 find_hasher f_hash,
-                                 find_key_equal f_equal) const
-  {
-    size_type const key_hash = f_hash(k);
-    size_type index          = key_hash % m_capacity;
-
-    value_type* current_bucket = &m_hashtbl_values[index];
-
-    while (true) {
-      key_type const existing_key = current_bucket->first;
-
-      if (m_unused_key == existing_key) { return this->end(); }
-
-      if (f_equal(k, existing_key)) {
-        return const_iterator(m_hashtbl_values, m_hashtbl_values + m_capacity, current_bucket);
-      }
-
-      index          = (index + 1) % m_capacity;
-      current_bucket = &m_hashtbl_values[index];
-    }
-  }
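Both `insert` and `find` walk the same probe sequence. A host-side sketch of that loop (hypothetical names, identity hash) that makes the two exit conditions explicit:

```cpp
#include <cstddef>
#include <cstdint>
#include <utility>
#include <vector>

// Sketch of the probe loop shared by insert() and find(): start at
// hash % capacity and walk forward with wrap-around until the key or an
// empty sentinel bucket appears. Capacity need not be a power of two.
constexpr uint32_t unused_key = 0xFFFFFFFFu;

int find_slot(std::vector<std::pair<uint32_t, uint32_t>> const& table, uint32_t key)
{
  auto const capacity = table.size();
  auto index          = static_cast<std::size_t>(key) % capacity;  // identity "hash" for the sketch
  while (true) {
    if (table[index].first == unused_key) return -1;  // empty bucket ends the probe: not found
    if (table[index].first == key) return static_cast<int>(index);
    index = (index + 1) % capacity;  // linear probe, same as the device code
  }
}
```

As in the device code, this loop never terminates when the table is full and the key is absent, which is why the map must be created with spare capacity.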
-
-  void assign_async(concurrent_unordered_map const& other, rmm::cuda_stream_view stream)
-  {
-    if (other.m_capacity <= m_capacity) {
-      m_capacity = other.m_capacity;
-    } else {
-      m_allocator.deallocate(m_hashtbl_values, m_capacity, stream);
-      m_capacity = other.m_capacity;
-
-      m_hashtbl_values = m_allocator.allocate(m_capacity, stream);
-    }
-    CUDF_CUDA_TRY(cudaMemcpyAsync(m_hashtbl_values,
-                                  other.m_hashtbl_values,
-                                  m_capacity * sizeof(value_type),
-                                  cudaMemcpyDefault,
-                                  stream.value()));
-  }
-
-  void clear_async(rmm::cuda_stream_view stream)
-  {
-    constexpr int block_size = 128;
-    init_hashtbl<<<((m_capacity - 1) / block_size) + 1, block_size, 0, stream.value()>>>(
-      m_hashtbl_values, m_capacity, m_unused_key, m_unused_element);
-  }
-
-  void print()
-  {
-    for (size_type i = 0; i < m_capacity; ++i) {
-      std::cout << i << ": " << m_hashtbl_values[i].first << "," << m_hashtbl_values[i].second
-                << std::endl;
-    }
-  }
-
-  void prefetch(int const dev_id, rmm::cuda_stream_view stream)
-  {
-    cudaPointerAttributes hashtbl_values_ptr_attributes;
-    cudaError_t status = cudaPointerGetAttributes(&hashtbl_values_ptr_attributes, m_hashtbl_values);
-
-    if (cudaSuccess == status && isPtrManaged(hashtbl_values_ptr_attributes)) {
-      CUDF_CUDA_TRY(cudaMemPrefetchAsync(
-        m_hashtbl_values, m_capacity * sizeof(value_type), dev_id, stream.value()));
-    }
-    CUDF_CUDA_TRY(cudaMemPrefetchAsync(this, sizeof(*this), dev_id, stream.value()));
-  }
-
-  /**
-   * @brief Frees the contents of the map and destroys the map object.
-   *
-   * This function is invoked as the deleter of the `std::unique_ptr` returned
-   * from the `create()` factory function.
-   *
-   * @param stream CUDA stream used for device memory operations and kernel launches.
-   */
-  void destroy(rmm::cuda_stream_view stream)
-  {
-    m_allocator.deallocate(m_hashtbl_values, m_capacity, stream);
-    delete this;
-  }
-
-  concurrent_unordered_map()                                           = delete;
-  concurrent_unordered_map(concurrent_unordered_map const&)            = default;
-  concurrent_unordered_map(concurrent_unordered_map&&)                 = default;
-  concurrent_unordered_map& operator=(concurrent_unordered_map const&) = default;
-  concurrent_unordered_map& operator=(concurrent_unordered_map&&)      = default;
-  ~concurrent_unordered_map()                                          = default;
-
- private:
-  hasher m_hf;
-  key_equal m_equal;
-  mapped_type m_unused_element;
-  key_type m_unused_key;
-  allocator_type m_allocator;
-  size_type m_capacity;
-  value_type* m_hashtbl_values;
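`prefetch` above only issues `cudaMemPrefetchAsync` when the storage is actually managed, because prefetching a plain device allocation is an error. A standalone sketch of that gating pattern:

```cpp
#include <cuda_runtime_api.h>

// Sketch of the gating in prefetch(): only managed allocations may be passed
// to cudaMemPrefetchAsync, so the pointer attributes are queried first.
int main()
{
  int* data = nullptr;
  if (cudaMallocManaged(&data, 1024 * sizeof(int)) != cudaSuccess) { return 1; }

  cudaPointerAttributes attrs{};
  if (cudaPointerGetAttributes(&attrs, data) == cudaSuccess &&
      attrs.type == cudaMemoryTypeManaged) {
    int dev_id = 0;
    cudaGetDevice(&dev_id);
    // Migrate the pages toward dev_id ahead of the kernels that will touch them.
    cudaMemPrefetchAsync(data, 1024 * sizeof(int), dev_id, 0);
  }
  return cudaFree(data) == cudaSuccess ? 0 : 1;
}
```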
-
-  /**
-   * @brief Private constructor used by the `create` factory function.
-   *
-   * @param capacity The desired capacity of the hash table
-   * @param unused_element The sentinel value to use for an empty value
-   * @param unused_key The sentinel value to use for an empty key
-   * @param hash_function The hash function to use for hashing keys
-   * @param equal The equality comparison function for comparing if two keys are equal
-   * @param allocator The allocator to use for allocating the hash table's storage
-   * @param stream CUDA stream used for device memory operations and kernel launches.
-   */
-  concurrent_unordered_map(size_type capacity,
-                           mapped_type const unused_element,
-                           key_type const unused_key,
-                           Hasher const& hash_function,
-                           Equality const& equal,
-                           allocator_type const& allocator,
-                           rmm::cuda_stream_view stream)
-    : m_hf(hash_function),
-      m_equal(equal),
-      m_allocator(allocator),
-      m_capacity(capacity),
-      m_unused_element(unused_element),
-      m_unused_key(unused_key)
-  {
-    m_hashtbl_values         = m_allocator.allocate(m_capacity, stream);
-    constexpr int block_size = 128;
-    {
-      cudaPointerAttributes hashtbl_values_ptr_attributes;
-      cudaError_t status =
-        cudaPointerGetAttributes(&hashtbl_values_ptr_attributes, m_hashtbl_values);
-
-      if (cudaSuccess == status && isPtrManaged(hashtbl_values_ptr_attributes)) {
-        int dev_id = 0;
-        CUDF_CUDA_TRY(cudaGetDevice(&dev_id));
-        CUDF_CUDA_TRY(cudaMemPrefetchAsync(
-          m_hashtbl_values, m_capacity * sizeof(value_type), dev_id, stream.value()));
-      }
-    }
-
-    if (m_capacity > 0) {
-      init_hashtbl<<<((m_capacity - 1) / block_size) + 1, block_size, 0, stream.value()>>>(
-        m_hashtbl_values, m_capacity, m_unused_key, m_unused_element);
-    }
-
-    CUDF_CHECK_CUDA(stream.value());
-  }
-};
diff --git a/cpp/src/hash/managed.cuh b/cpp/src/hash/managed.cuh
deleted file mode 100644
index 9797c83c47c..00000000000
--- a/cpp/src/hash/managed.cuh
+++ /dev/null
@@ -1,41 +0,0 @@
-/*
- * Copyright (c) 2017-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include
-#include
-
-struct managed {
-  static void* operator new(size_t n)
-  {
-    void* ptr          = nullptr;
-    cudaError_t result = cudaMallocManaged(&ptr, n);
-    if (cudaSuccess != result || 0 == ptr) throw std::bad_alloc();
-    return ptr;
-  }
-
-  static void operator delete(void* ptr) noexcept
-  {
-    auto const free_result = cudaFree(ptr);
-    assert(free_result == cudaSuccess);
-  }
-};
-
-inline bool isPtrManaged(cudaPointerAttributes attr)
-{
-  return (attr.type == cudaMemoryTypeManaged);
-}
diff --git a/cpp/src/io/json/legacy/json_gpu.cu b/cpp/src/io/json/legacy/json_gpu.cu
deleted file mode 100644
index ff4845fcecb..00000000000
--- a/cpp/src/io/json/legacy/json_gpu.cu
+++ /dev/null
@@ -1,615 +0,0 @@
-/*
- * Copyright (c) 2020-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "io/utilities/column_type_histogram.hpp" -#include "io/utilities/parsing_utils.cuh" -#include "io/utilities/trie.cuh" -#include "json_gpu.hpp" - -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -using cudf::device_span; -using cudf::detail::grid_1d; - -namespace cudf::io::json::detail::legacy { - -namespace { -/** - * @brief CUDA Kernel that adjusts the row range to exclude the character outside of the top level - * brackets. - * - * The top level brackets characters are excluded from the resulting range. - * - * @param[in] begin Pointer to the first character in the row - * @param[in] end pointer to the first character after the row - */ -__device__ std::pair limit_range_to_brackets(char const* begin, - char const* end) -{ - auto const data_begin = thrust::next(thrust::find_if( - thrust::seq, begin, end, [] __device__(auto c) { return c == '[' || c == '{'; })); - auto const data_end = thrust::next(thrust::find_if(thrust::seq, - thrust::make_reverse_iterator(end), - thrust::make_reverse_iterator(data_begin), - [](auto c) { return c == ']' || c == '}'; })) - .base(); - return {data_begin, data_end}; -} - -/** - * @brief Find the first JSON object key in the range. - * - * Assumes that begin is not in the middle of a field. - * - * @param[in] begin Pointer to the first character in the parsing range - * @param[in] end pointer to the first character after the parsing range - * @param[in] quotechar The character used to denote quotes - * - * @return Begin and end iterators of the key name; (`end`, `end`) if a key is not found - */ -__device__ std::pair get_next_key(char const* begin, - char const* end, - char quotechar) -{ - // Key starts after the first quote - auto const key_begin = thrust::find(thrust::seq, begin, end, quotechar) + 1; - if (key_begin > end) return {end, end}; - - // Key ends after the next unescaped quote - auto const key_end_pair = thrust::mismatch( - thrust::seq, key_begin, end - 1, key_begin + 1, [quotechar] __device__(auto prev_ch, auto ch) { - return !(ch == quotechar && prev_ch != '\\'); - }); - - return {key_begin, key_end_pair.second}; -} - -/** - * @brief Returns true is the input character is a valid digit. - * Supports both decimal and hexadecimal digits (uppercase and lowercase). - * - * @param c Character to check - * @param is_hex Whether to check as a hexadecimal - * - * @return `true` if it is digit-like, `false` otherwise - */ -__device__ __inline__ bool is_digit(char c, bool is_hex = false) -{ - if (c >= '0' && c <= '9') return true; - - if (is_hex) { - if (c >= 'A' && c <= 'F') return true; - if (c >= 'a' && c <= 'f') return true; - } - - return false; -} - -/** - * @brief Returns true if the counters indicate a potentially valid float. - * False positives are possible because positions are not taken into account. - * For example, field "e.123-" would match the pattern. 
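To make the documented false positive concrete, here is a host-side restatement of the heuristic (same checks, hypothetical standalone function) with the counts for "e.123-" worked out:

```cpp
#include <cassert>

// Restatement of the is_like_float checks below, for tracing by hand.
bool looks_like_float(long len, long digits, long decimals, long dashes, long exponents)
{
  if (decimals > 1 || exponents > 1) return false;    // at most one '.' and one exponent
  if (decimals == 0 && exponents == 0) return false;  // otherwise it is an integer
  if (dashes > 1 + exponents) return false;           // one '-' allowed per component
  if (digits + decimals + dashes + exponents != len) return false;  // no other characters
  return digits >= 1 + exponents;                     // enough digits for each part
}

int main()
{
  // "e.123-": len 6 = 3 digits + one '.' + one '-' + one 'e'. Accepted even
  // though it is not a number, because character positions are ignored.
  assert(looks_like_float(6, 3, 1, 1, 1));
  assert(!looks_like_float(3, 3, 0, 0, 0));  // "123": plain integer
  assert(looks_like_float(4, 3, 1, 0, 0));   // "1.23"
  return 0;
}
```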
- */ -__device__ __inline__ bool is_like_float( - long len, long digit_cnt, long decimal_cnt, long dash_cnt, long exponent_cnt) -{ - // Can't have more than one exponent and one decimal point - if (decimal_cnt > 1) return false; - if (exponent_cnt > 1) return false; - // Without the exponent or a decimal point, this is an integer, not a float - if (decimal_cnt == 0 && exponent_cnt == 0) return false; - - // Can only have one '-' per component - if (dash_cnt > 1 + exponent_cnt) return false; - - // If anything other than these characters is present, it's not a float - if (digit_cnt + decimal_cnt + dash_cnt + exponent_cnt != len) return false; - - // Needs at least 1 digit, 2 if exponent is present - if (digit_cnt < 1 + exponent_cnt) return false; - - return true; -} - -/** - * @brief Contains information on a JSON file field. - */ -struct field_descriptor { - cudf::size_type column; - char const* value_begin; - char const* value_end; - bool is_quoted; -}; - -/** - * @brief Parse the first field in the given range and return its descriptor. - * - * @param[in] begin Pointer to the first character in the parsing range - * @param[in] end pointer to the first character after the parsing range - * @param[in] opts The global parsing behavior options - * @param[in] field_idx Index of the current field in the input row - * @param[in] col_map Pointer to the (column name hash -> column index) map in device memory. - * nullptr is passed when the input file does not consist of objects. - * @return Descriptor of the parsed field - */ -__device__ field_descriptor next_field_descriptor(char const* begin, - char const* end, - parse_options_view const& opts, - cudf::size_type field_idx, - col_map_type col_map) -{ - auto const desc_pre_trim = - col_map.capacity() == 0 - // No key - column and begin are trivial - ? field_descriptor{field_idx, - begin, - cudf::io::gpu::seek_field_end(begin, end, opts, true), - false} - : [&]() { - auto const key_range = get_next_key(begin, end, opts.quotechar); - auto const key_hash = cudf::hashing::detail::MurmurHash3_x86_32{}( - cudf::string_view(key_range.first, key_range.second - key_range.first)); - auto const hash_col = col_map.find(key_hash); - // Fall back to field index if not found (parsing error) - auto const column = (hash_col != col_map.end()) ? (*hash_col).second : field_idx; - - // Skip the colon between the key and the value - auto const value_begin = thrust::find(thrust::seq, key_range.second, end, ':') + 1; - return field_descriptor{column, - value_begin, - cudf::io::gpu::seek_field_end(value_begin, end, opts, true), - false}; - }(); - - // Modify start & end to ignore whitespace and quotechars - auto const trimmed_value_range = - trim_whitespaces(desc_pre_trim.value_begin, desc_pre_trim.value_end); - bool const is_quoted = - thrust::distance(trimmed_value_range.first, trimmed_value_range.second) >= 2 and - *trimmed_value_range.first == opts.quotechar and - *thrust::prev(trimmed_value_range.second) == opts.quotechar; - return {desc_pre_trim.column, - trimmed_value_range.first + static_cast(is_quoted), - trimmed_value_range.second - static_cast(is_quoted), - is_quoted}; -} - -/** - * @brief Returns the range that contains the data in a given row. - * - * Excludes the top-level brackets. - * - * @param[in] data Device span pointing to the JSON data in device memory - * @param[in] row_offsets The offset of each row in the input - * @param[in] row Index of the row for which the range is returned - * - * @return The begin and end iterators of the row data. 
- */ -__device__ std::pair get_row_data_range( - device_span const data, device_span const row_offsets, size_type row) -{ - auto const row_begin = data.begin() + row_offsets[row]; - auto const row_end = - data.begin() + ((row < row_offsets.size() - 1) ? row_offsets[row + 1] : data.size()); - return limit_range_to_brackets(row_begin, row_end); -} - -/** - * @brief CUDA kernel that parses and converts plain text data into cuDF column data. - * - * Data is processed one record at a time - * - * @param[in] opts A set of parsing options - * @param[in] data The entire data to read - * @param[in] row_offsets The offset of each row in the input - * @param[in] column_types The data type of each column - * @param[in] col_map Pointer to the (column name hash -> column index) map in device memory. - * nullptr is passed when the input file does not consist of objects. - * @param[out] output_columns The output column data - * @param[out] valid_fields The bitmaps indicating whether column fields are valid - * @param[out] num_valid_fields The numbers of valid fields in columns - */ -CUDF_KERNEL void convert_data_to_columns_kernel(parse_options_view opts, - device_span const data, - device_span const row_offsets, - device_span const column_types, - col_map_type col_map, - device_span const output_columns, - device_span const valid_fields, - device_span const num_valid_fields) -{ - auto const rec_id = grid_1d::global_thread_id(); - if (rec_id >= row_offsets.size()) return; - - auto const row_data_range = get_row_data_range(data, row_offsets, rec_id); - - auto current = row_data_range.first; - for (size_type input_field_index = 0; - input_field_index < column_types.size() && current < row_data_range.second; - input_field_index++) { - auto const desc = - next_field_descriptor(current, row_data_range.second, opts, input_field_index, col_map); - auto const value_len = static_cast(std::max(desc.value_end - desc.value_begin, 0L)); - auto const is_quoted = static_cast(desc.is_quoted); - - current = desc.value_end + 1; - - using string_index_pair = thrust::pair; - - if (!serialized_trie_contains(opts.trie_na, - {desc.value_begin - is_quoted, value_len + is_quoted * 2})) { - // Type dispatcher does not handle strings - if (column_types[desc.column].id() == type_id::STRING) { - auto str_list = static_cast(output_columns[desc.column]); - str_list[rec_id].first = desc.value_begin; - str_list[rec_id].second = value_len; - - // set the valid bitmap - all bits were set to 0 to start - set_bit(valid_fields[desc.column], rec_id); - atomicAdd(&num_valid_fields[desc.column], 1); - } else { - if (cudf::type_dispatcher(column_types[desc.column], - ConvertFunctor{}, - desc.value_begin, - desc.value_end, - output_columns[desc.column], - rec_id, - column_types[desc.column], - opts, - false)) { - // set the valid bitmap - all bits were set to 0 to start - set_bit(valid_fields[desc.column], rec_id); - atomicAdd(&num_valid_fields[desc.column], 1); - } - } - } else if (column_types[desc.column].id() == type_id::STRING) { - auto str_list = static_cast(output_columns[desc.column]); - str_list[rec_id].first = nullptr; - str_list[rec_id].second = 0; - } - } -} - -/** - * @brief CUDA kernel that processes a buffer of data and determines information about the - * column types within. - * - * Data is processed in one row/record at a time, so the number of total - * threads (tid) is equal to the number of rows. 
- * - * @param[in] opts A set of parsing options - * @param[in] data Input data buffer - * @param[in] rec_starts The offset of each row in the input - * @param[in] col_map Pointer to the (column name hash -> column index) map in device memory. - * nullptr is passed when the input file does not consist of objects. - * @param[in] num_columns The number of columns of input data - * @param[out] column_infos The count for each column data type - */ -CUDF_KERNEL void detect_data_types_kernel( - parse_options_view const opts, - device_span const data, - device_span const row_offsets, - col_map_type col_map, - int num_columns, - device_span const column_infos) -{ - auto const rec_id = grid_1d::global_thread_id(); - if (rec_id >= row_offsets.size()) return; - - auto const are_rows_objects = col_map.capacity() != 0; - auto const row_data_range = get_row_data_range(data, row_offsets, rec_id); - - size_type input_field_index = 0; - for (auto current = row_data_range.first; - input_field_index < num_columns && current < row_data_range.second; - input_field_index++) { - auto const desc = - next_field_descriptor(current, row_data_range.second, opts, input_field_index, col_map); - auto const value_len = static_cast(std::max(desc.value_end - desc.value_begin, 0L)); - - // Advance to the next field; +1 to skip the delimiter - current = desc.value_end + 1; - - // Checking if the field is empty/valid - if (serialized_trie_contains(opts.trie_na, {desc.value_begin, value_len})) { - // Increase the null count for array rows, where the null count is initialized to zero. - if (!are_rows_objects) { atomicAdd(&column_infos[desc.column].null_count, 1); } - continue; - } else if (are_rows_objects) { - // For files with object rows, null count is initialized to row count. The value is decreased - // here for every valid field. 
- atomicAdd(&column_infos[desc.column].null_count, -1); - } - // Don't need counts to detect strings, any field in quotes is deduced to be a string - if (desc.is_quoted) { - atomicAdd(&column_infos[desc.column].string_count, 1); - continue; - } - - int digit_count = 0; - int decimal_count = 0; - int slash_count = 0; - int dash_count = 0; - int plus_count = 0; - int colon_count = 0; - int exponent_count = 0; - int other_count = 0; - - bool const maybe_hex = - ((value_len > 2 && *desc.value_begin == '0' && *(desc.value_begin + 1) == 'x') || - (value_len > 3 && *desc.value_begin == '-' && *(desc.value_begin + 1) == '0' && - *(desc.value_begin + 2) == 'x')); - for (auto pos = desc.value_begin; pos < desc.value_end; ++pos) { - if (is_digit(*pos, maybe_hex)) { - digit_count++; - continue; - } - // Looking for unique characters that will help identify column types - switch (*pos) { - case '.': decimal_count++; break; - case '-': dash_count++; break; - case '+': plus_count++; break; - case '/': slash_count++; break; - case ':': colon_count++; break; - case 'e': - case 'E': - if (!maybe_hex && pos > desc.value_begin && pos < desc.value_end - 1) exponent_count++; - break; - default: other_count++; break; - } - } - - // Integers have to have the length of the string - int int_req_number_cnt = value_len; - // Off by one if they start with a minus sign - if ((*desc.value_begin == '-' || *desc.value_begin == '+') && value_len > 1) { - --int_req_number_cnt; - } - // Off by one if they are a hexadecimal number - if (maybe_hex) { --int_req_number_cnt; } - if (serialized_trie_contains(opts.trie_true, {desc.value_begin, value_len}) || - serialized_trie_contains(opts.trie_false, {desc.value_begin, value_len})) { - atomicAdd(&column_infos[desc.column].bool_count, 1); - } else if (digit_count == int_req_number_cnt) { - bool is_negative = (*desc.value_begin == '-'); - char const* data_begin = desc.value_begin + (is_negative || (*desc.value_begin == '+')); - cudf::size_type* ptr = cudf::io::gpu::infer_integral_field_counter( - data_begin, data_begin + digit_count, is_negative, column_infos[desc.column]); - atomicAdd(ptr, 1); - } else if (is_like_float( - value_len, digit_count, decimal_count, dash_count + plus_count, exponent_count)) { - atomicAdd(&column_infos[desc.column].float_count, 1); - } - // A date-time field cannot have more than 3 non-special characters - // A number field cannot have more than one decimal point - else if (other_count > 3 || decimal_count > 1) { - atomicAdd(&column_infos[desc.column].string_count, 1); - } else { - // A date field can have either one or two '-' or '\'; A legal combination will only have one - // of them To simplify the process of auto column detection, we are not covering all the - // date-time formation permutations - if ((dash_count > 0 && dash_count <= 2 && slash_count == 0) || - (dash_count == 0 && slash_count > 0 && slash_count <= 2)) { - if (colon_count <= 2) { - atomicAdd(&column_infos[desc.column].datetime_count, 1); - } else { - atomicAdd(&column_infos[desc.column].string_count, 1); - } - } else { - // Default field type is string - atomicAdd(&column_infos[desc.column].string_count, 1); - } - } - } - if (!are_rows_objects) { - // For array rows, mark missing fields as null - for (; input_field_index < num_columns; ++input_field_index) - atomicAdd(&column_infos[input_field_index].null_count, 1); - } -} - -/** - * @brief Input data range that contains a field in key:value format. 
- */ -struct key_value_range { - char const* key_begin; - char const* key_end; - char const* value_begin; - char const* value_end; -}; - -/** - * @brief Parse the next field in key:value format and return ranges of its parts. - */ -__device__ key_value_range get_next_key_value_range(char const* begin, - char const* end, - parse_options_view const& opts) -{ - auto const key_range = get_next_key(begin, end, opts.quotechar); - - // Colon between the key and the value - auto const colon = thrust::find(thrust::seq, key_range.second, end, ':'); - if (colon == end) return {end, end, end}; - - // Field value (including delimiters) - auto const value_end = cudf::io::gpu::seek_field_end(colon + 1, end, opts, true); - return {key_range.first, key_range.second, colon + 1, value_end}; -} - -/** - * @brief Cuda kernel that collects information about JSON object keys in the file. - * - * @param[in] options A set of parsing options - * @param[in] data Input data buffer - * @param[in] row_offsets The offset of each row in the input - * @param[out] keys_cnt Number of keys found in the file - * @param[out] keys_info optional, information (offset, length, hash) for each found key - */ -CUDF_KERNEL void collect_keys_info_kernel(parse_options_view const options, - device_span const data, - device_span const row_offsets, - unsigned long long int* keys_cnt, - thrust::optional keys_info) -{ - auto const rec_id = grid_1d::global_thread_id(); - if (rec_id >= row_offsets.size()) return; - - auto const row_data_range = get_row_data_range(data, row_offsets, rec_id); - - auto advance = [&](char const* begin) { - return get_next_key_value_range(begin, row_data_range.second, options); - }; - for (auto field_range = advance(row_data_range.first); - field_range.key_begin < row_data_range.second; - field_range = advance(field_range.value_end)) { - auto const idx = atomicAdd(keys_cnt, 1ULL); - if (keys_info.has_value()) { - auto const len = field_range.key_end - field_range.key_begin; - keys_info->column(0).element(idx) = field_range.key_begin - data.begin(); - keys_info->column(1).element(idx) = len; - keys_info->column(2).element(idx) = - cudf::hashing::detail::MurmurHash3_x86_32{}( - cudf::string_view(field_range.key_begin, len)); - } - } -} - -} // namespace - -/** - * @copydoc cudf::io::json::detail::legacy::convert_json_to_columns - */ -void convert_json_to_columns(parse_options_view const& opts, - device_span const data, - device_span const row_offsets, - device_span const column_types, - col_map_type* col_map, - device_span const output_columns, - device_span const valid_fields, - device_span num_valid_fields, - rmm::cuda_stream_view stream) -{ - int block_size; - int min_grid_size; - CUDF_CUDA_TRY(cudaOccupancyMaxPotentialBlockSize( - &min_grid_size, &block_size, convert_data_to_columns_kernel)); - - int const grid_size = (row_offsets.size() + block_size - 1) / block_size; - - convert_data_to_columns_kernel<<>>(opts, - data, - row_offsets, - column_types, - *col_map, - output_columns, - valid_fields, - num_valid_fields); - - CUDF_CHECK_CUDA(stream.value()); -} - -/** - * @copydoc cudf::io::json::detail::legacy::detect_data_types - */ - -std::vector detect_data_types( - parse_options_view const& options, - device_span const data, - device_span const row_offsets, - bool do_set_null_count, - int num_columns, - col_map_type* col_map, - rmm::cuda_stream_view stream) -{ - int block_size; - int min_grid_size; - CUDF_CUDA_TRY( - cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size, detect_data_types_kernel)); - - 
auto d_column_infos = [&]() { - if (do_set_null_count) { - rmm::device_uvector d_column_infos(num_columns, stream); - // Set the null count to the row count (all fields assumes to be null). - thrust::generate( - rmm::exec_policy(stream), - d_column_infos.begin(), - d_column_infos.end(), - [num_records = static_cast(row_offsets.size())] __device__() { - return cudf::io::column_type_histogram{num_records}; - }); - return d_column_infos; - } else { - return cudf::detail::make_zeroed_device_uvector_async( - num_columns, stream, rmm::mr::get_current_device_resource()); - } - }(); - - // Calculate actual block count to use based on records count - int const grid_size = (row_offsets.size() + block_size - 1) / block_size; - - detect_data_types_kernel<<>>( - options, data, row_offsets, *col_map, num_columns, d_column_infos); - - return cudf::detail::make_std_vector_sync(d_column_infos, stream); -} - -/** - * @copydoc cudf::io::json::detail::legacy::collect_keys_info - */ -void collect_keys_info(parse_options_view const& options, - device_span const data, - device_span const row_offsets, - unsigned long long int* keys_cnt, - thrust::optional keys_info, - rmm::cuda_stream_view stream) -{ - int block_size; - int min_grid_size; - CUDF_CUDA_TRY( - cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size, collect_keys_info_kernel)); - - // Calculate actual block count to use based on records count - int const grid_size = (row_offsets.size() + block_size - 1) / block_size; - - collect_keys_info_kernel<<>>( - options, data, row_offsets, keys_cnt, keys_info); - - CUDF_CHECK_CUDA(stream.value()); -} - -} // namespace cudf::io::json::detail::legacy diff --git a/cpp/src/io/json/legacy/json_gpu.hpp b/cpp/src/io/json/legacy/json_gpu.hpp deleted file mode 100644 index 853e30c9427..00000000000 --- a/cpp/src/io/json/legacy/json_gpu.hpp +++ /dev/null @@ -1,99 +0,0 @@ -/* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include "hash/concurrent_unordered_map.cuh" -#include "io/utilities/column_type_histogram.hpp" -#include "io/utilities/parsing_utils.cuh" - -#include -#include -#include - -#include - -#include - -using cudf::device_span; - -namespace cudf::io::json::detail::legacy { - -using col_map_type = concurrent_unordered_map; -/** - * @brief Convert a buffer of input data (text) into raw cuDF column data. - * - * @param[in] options A set of parsing options - * @param[in] data The entire data to read - * @param[in] row_offsets The start of each data record - * @param[in] dtypes The data type of each column - * @param[in] col_map Pointer to the (column name hash -> column index) map in device memory. - * nullptr is passed when the input file does not consist of objects. 
- * @param[out] output_columns The output column data - * @param[out] valid_fields The bitmaps indicating whether column fields are valid - * @param[out] num_valid_fields The numbers of valid fields in columns - * @param[in] stream CUDA stream used for device memory operations and kernel launches. - */ -void convert_json_to_columns(parse_options_view const& options, - device_span data, - device_span row_offsets, - device_span column_types, - col_map_type* col_map, - device_span output_columns, - device_span valid_fields, - device_span num_valid_fields, - rmm::cuda_stream_view stream); - -/** - * @brief Process a buffer of data and determine information about the column types within. - * - * @param[in] options A set of parsing options - * @param[in] data Input data buffer - * @param[in] row_offsets The offset of each row in the input - * @param[in] num_columns The number of columns of input data - * @param[in] col_map Pointer to the (column name hash -> column index) map in device memory. - * nullptr is passed when the input file does not consist of objects. - * @param[in] stream CUDA stream used for device memory operations and kernel launches. - * - * @returns The count for each column data type - */ -std::vector detect_data_types( - parse_options_view const& options, - device_span data, - device_span row_offsets, - bool do_set_null_count, - int num_columns, - col_map_type* col_map, - rmm::cuda_stream_view stream); - -/** - * @brief Collects information about JSON object keys in the file. - * - * @param[in] options A set of parsing options - * @param[in] data Input data buffer - * @param[in] row_offsets The offset of each row in the input - * @param[out] keys_cnt Number of keys found in the file - * @param[out] keys_info optional, information (offset, length, hash) for each found key - * @param[in] stream CUDA stream used for device memory operations and kernel launches. - */ -void collect_keys_info(parse_options_view const& options, - device_span data, - device_span row_offsets, - unsigned long long int* keys_cnt, - thrust::optional keys_info, - rmm::cuda_stream_view stream); - -} // namespace cudf::io::json::detail::legacy diff --git a/cpp/src/io/json/legacy/read_json.hpp b/cpp/src/io/json/legacy/read_json.hpp deleted file mode 100644 index 2c02fdd402f..00000000000 --- a/cpp/src/io/json/legacy/read_json.hpp +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include - -#include -#include - -#include - -#include -#include - -namespace cudf::io { -class json_reader_options; // forward decl -} - -namespace cudf::io::json::detail::legacy { - -table_with_metadata read_json(host_span> sources, - json_reader_options const& reader_opts, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr); - -} // namespace cudf::io::json::detail::legacy diff --git a/cpp/src/io/json/legacy/reader_impl.cu b/cpp/src/io/json/legacy/reader_impl.cu deleted file mode 100644 index 846b3cfab4e..00000000000 --- a/cpp/src/io/json/legacy/reader_impl.cu +++ /dev/null @@ -1,667 +0,0 @@ -/* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "hash/concurrent_unordered_map.cuh" -#include "io/comp/io_uncomp.hpp" -#include "io/utilities/column_buffer.hpp" -#include "io/utilities/parsing_utils.cuh" -#include "json_gpu.hpp" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -using cudf::host_span; - -namespace cudf::io::json::detail::legacy { - -using col_map_ptr_type = std::unique_ptr>; - -/** - * @brief Aggregate the table containing keys info by their hash values. - * - * @param[in] info Table with columns containing key offsets, lengths and hashes, respectively - * - * @return Table with data aggregated by key hash values - */ -std::unique_ptr aggregate_keys_info(std::unique_ptr
<table> info)
-{
-  auto const info_view = info->view();
-  std::vector<groupby::aggregation_request> requests;
-  requests.emplace_back(groupby::aggregation_request{info_view.column(0)});
-  requests.back().aggregations.emplace_back(make_min_aggregation<groupby_aggregation>());
-  requests.back().aggregations.emplace_back(make_nth_element_aggregation<groupby_aggregation>(0));
-
-  requests.emplace_back(groupby::aggregation_request{info_view.column(1)});
-  requests.back().aggregations.emplace_back(make_min_aggregation<groupby_aggregation>());
-  requests.back().aggregations.emplace_back(make_nth_element_aggregation<groupby_aggregation>(0));
-
-  // Aggregate by hash values
-  groupby::groupby gb_obj(
-    table_view({info_view.column(2)}), null_policy::EXCLUDE, sorted::NO, {}, {});
-
-  auto result = gb_obj.aggregate(requests);  // TODO: no stream parameter?
-
-  std::vector<std::unique_ptr<column>> out_columns;
-  out_columns.emplace_back(std::move(result.second[0].results[0]));  // offsets
-  out_columns.emplace_back(std::move(result.second[1].results[0]));  // lengths
-  out_columns.emplace_back(std::move(result.first->release()[0]));   // hashes
-  return std::make_unique<table>(std::move(out_columns));
-}
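What the groupby above computes, in miniature: every occurrence of a key collapses to a single representative per hash value. A host-side sketch with a `std::map` standing in for the GPU groupby (hypothetical types; `try_emplace` only loosely plays the role of the min/nth-element aggregations):

```cpp
#include <cstdint>
#include <map>
#include <utility>
#include <vector>

// Host-side sketch of the dedup performed on key occurrences: one
// representative (offset, length) survives per hash value.
struct key_occurrence {
  uint64_t offset;  // where the key's name starts in the file
  uint16_t length;  // name length in bytes
  uint32_t hash;    // hash of the name
};

std::map<uint32_t, std::pair<uint64_t, uint16_t>> aggregate_by_hash(
  std::vector<key_occurrence> const& occurrences)
{
  std::map<uint32_t, std::pair<uint64_t, uint16_t>> unique_keys;
  for (auto const& occ : occurrences) {
    // Keep the first representative seen for each hash.
    unique_keys.try_emplace(occ.hash, occ.offset, occ.length);
  }
  return unique_keys;
}
```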
-
-/**
- * @brief Initializes the (key hash -> column index) hash map.
- */
-col_map_ptr_type create_col_names_hash_map(column_view column_name_hashes,
-                                           rmm::cuda_stream_view stream)
-{
-  auto key_col_map       = col_map_type::create(column_name_hashes.size(), stream);
-  auto const column_data = column_name_hashes.data<uint32_t>();
-  thrust::for_each_n(rmm::exec_policy(stream),
-                     thrust::make_counting_iterator<size_type>(0),
-                     column_name_hashes.size(),
-                     [map = *key_col_map, column_data] __device__(size_type idx) mutable {
-                       map.insert(thrust::make_pair(column_data[idx], idx));
-                     });
-  return key_col_map;
-}
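The `for_each_n`-over-counting-iterator idiom above launches one device-side insert per column name. A self-contained sketch of the same idiom, with a plain array standing in for the concurrent map (build with `nvcc --extended-lambda`):

```cpp
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/for_each.h>
#include <thrust/iterator/counting_iterator.h>

#include <vector>

// One device-side write per element, driven by a counting iterator.
int main()
{
  std::vector<unsigned> h_hashes{7, 3, 5};
  thrust::device_vector<unsigned> hashes(h_hashes.begin(), h_hashes.end());
  thrust::device_vector<int> index_of(8, -1);

  unsigned const* hash_ptr = thrust::raw_pointer_cast(hashes.data());
  int* index_ptr           = thrust::raw_pointer_cast(index_of.data());

  thrust::for_each_n(thrust::device,
                     thrust::make_counting_iterator<int>(0),
                     hashes.size(),
                     [hash_ptr, index_ptr] __device__(int idx) {
                       index_ptr[hash_ptr[idx]] = idx;  // hash -> column index
                     });
  return 0;
}
```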
-
-/**
- * @brief Create a table whose columns contain the information on JSON objects' keys.
- *
- * The columns contain name offsets in the file, name lengths and name hashes, respectively.
- *
- * @param[in] parse_opts Parsing options (e.g. delimiter and quotation character)
- * @param[in] data Input JSON device data
- * @param[in] row_offsets Device array of row start locations in the input buffer
- * @param[in] stream CUDA stream used for device memory operations and kernel launches
- *
- * @return cudf table with three columns (offsets, lengths, hashes)
- */
-std::unique_ptr<table> create_json_keys_info_table(parse_options_view const& parse_opts,
-                                                   device_span<char const> const data,
-                                                   device_span<uint64_t const> const row_offsets,
-                                                   rmm::cuda_stream_view stream)
-{
-  // Count keys
-  rmm::device_scalar<unsigned long long> key_counter(0, stream);
-  collect_keys_info(parse_opts, data, row_offsets, key_counter.data(), {}, stream);
-
-  // Allocate columns to store hash value, length, and offset of each JSON object key in the input
-  auto const num_keys = key_counter.value(stream);
-  std::vector<std::unique_ptr<column>> info_columns;
-  info_columns.emplace_back(
-    make_numeric_column(data_type(type_id::UINT64), num_keys, mask_state::UNALLOCATED, stream));
-  info_columns.emplace_back(
-    make_numeric_column(data_type(type_id::UINT16), num_keys, mask_state::UNALLOCATED, stream));
-  info_columns.emplace_back(
-    make_numeric_column(data_type(type_id::UINT32), num_keys, mask_state::UNALLOCATED, stream));
-  // Create a table out of these columns to pass them around more easily
-  auto info_table           = std::make_unique<table>(std::move(info_columns));
-  auto const info_table_mdv = mutable_table_device_view::create(info_table->mutable_view(), stream);
-
-  // Reset the key counter - now used for indexing
-  key_counter.set_value_to_zero_async(stream);
-  // Fill the allocated columns
-  collect_keys_info(parse_opts, data, row_offsets, key_counter.data(), {*info_table_mdv}, stream);
-  return info_table;
-}
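The function above is a classic two-pass count-then-fill: the same `collect_keys_info` runs once to size the output and once to populate it, reusing the counter as a write cursor. A host-side sketch of the pattern:

```cpp
#include <cstddef>
#include <vector>

// Pass 1 only counts matches so storage can be sized exactly;
// pass 2 reuses the counter as a write cursor.
template <typename Pred>
std::vector<int> collect(std::vector<int> const& input, Pred pred)
{
  std::size_t count = 0;
  for (int v : input)
    if (pred(v)) ++count;  // pass 1: count only

  std::vector<int> out(count);  // exact allocation, like the three key-info columns
  std::size_t cursor = 0;       // counter reset, now used for indexing
  for (int v : input)
    if (pred(v)) out[cursor++] = v;  // pass 2: fill
  return out;
}
```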
-
-/**
- * @brief Extract the keys from the JSON file using the stored name offsets/lengths.
- */
-std::vector<std::string> create_key_strings(char const* h_data,
-                                            table_view sorted_info,
-                                            rmm::cuda_stream_view stream)
-{
-  auto const num_cols = sorted_info.num_rows();
-  std::vector<uint64_t> h_offsets(num_cols);
-  CUDF_CUDA_TRY(cudaMemcpyAsync(h_offsets.data(),
-                                sorted_info.column(0).data<uint64_t>(),
-                                sizeof(uint64_t) * num_cols,
-                                cudaMemcpyDefault,
-                                stream.value()));
-
-  std::vector<uint16_t> h_lens(num_cols);
-  CUDF_CUDA_TRY(cudaMemcpyAsync(h_lens.data(),
-                                sorted_info.column(1).data<uint16_t>(),
-                                sizeof(uint16_t) * num_cols,
-                                cudaMemcpyDefault,
-                                stream.value()));
-
-  std::vector<std::string> names(num_cols);
-  std::transform(h_offsets.cbegin(),
-                 h_offsets.cend(),
-                 h_lens.cbegin(),
-                 names.begin(),
-                 [&](auto offset, auto len) { return std::string(h_data + offset, len); });
-  return names;
-}
-
-auto sort_keys_info_by_offset(std::unique_ptr<table>
info) -{ - auto const agg_offset_col_view = info->get_column(0).view(); - return sort_by_key(info->view(), table_view({agg_offset_col_view})); -} - -/** - * @brief Extract JSON object keys from a JSON file. - * - * @param[in] stream CUDA stream used for device memory operations and kernel launches. - * - * @return Names of JSON object keys in the file - */ -std::pair, col_map_ptr_type> get_json_object_keys_hashes( - parse_options_view const& parse_opts, - host_span h_data, - device_span rec_starts, - device_span d_data, - rmm::cuda_stream_view stream) -{ - auto info = create_json_keys_info_table(parse_opts, d_data, rec_starts, stream); - - auto aggregated_info = aggregate_keys_info(std::move(info)); - auto sorted_info = sort_keys_info_by_offset(std::move(aggregated_info)); - - return {create_key_strings(h_data.data(), sorted_info->view(), stream), - create_col_names_hash_map(sorted_info->get_column(2).view(), stream)}; -} - -std::vector ingest_raw_input(host_span> sources, - compression_type compression, - size_t range_offset, - size_t range_size, - size_t range_size_padded) -{ - CUDF_FUNC_RANGE(); - // Iterate through the user defined sources and read the contents into the local buffer - size_t total_source_size = 0; - for (auto const& source : sources) { - total_source_size += source->size(); - } - total_source_size = total_source_size - (range_offset * sources.size()); - - auto buffer = std::vector(total_source_size); - - size_t bytes_read = 0; - for (auto const& source : sources) { - if (!source->is_empty()) { - auto data_size = (range_size_padded != 0) ? range_size_padded : source->size(); - auto destination = buffer.data() + bytes_read; - bytes_read += source->host_read(range_offset, data_size, destination); - } - } - - if (compression == compression_type::NONE) { - return buffer; - } else { - return decompress(compression, buffer); - } -} - -bool should_load_whole_source(json_reader_options const& reader_opts) -{ - return reader_opts.get_byte_range_offset() == 0 and // - reader_opts.get_byte_range_size() == 0; -} - -rmm::device_uvector find_record_starts(json_reader_options const& reader_opts, - host_span h_data, - device_span d_data, - rmm::cuda_stream_view stream) -{ - std::vector chars_to_count{'\n'}; - // Currently, ignoring lineterminations within quotes is handled by recording the records of both, - // and then filtering out the records that is a quotechar or a linetermination within a quotechar - // pair. - // If not starting at an offset, add an extra row to account for the first row in the file - cudf::size_type prefilter_count = ((reader_opts.get_byte_range_offset() == 0) ? 
1 : 0); - if (should_load_whole_source(reader_opts)) { - prefilter_count += count_all_from_set(d_data, chars_to_count, stream); - } else { - prefilter_count += count_all_from_set(h_data, chars_to_count, stream); - } - - rmm::device_uvector rec_starts(prefilter_count, stream); - - auto* find_result_ptr = rec_starts.data(); - // Manually adding an extra row to account for the first row in the file - if (reader_opts.get_byte_range_offset() == 0) { - find_result_ptr++; - CUDF_CUDA_TRY(cudaMemsetAsync(rec_starts.data(), 0ull, sizeof(uint64_t), stream.value())); - } - - std::vector chars_to_find{'\n'}; - // Passing offset = 1 to return positions AFTER the found character - if (should_load_whole_source(reader_opts)) { - find_all_from_set(d_data, chars_to_find, 1, find_result_ptr, stream); - } else { - find_all_from_set(h_data, chars_to_find, 1, find_result_ptr, stream); - } - - // Previous call stores the record positions as encountered by all threads - // Sort the record positions as subsequent processing may require filtering - // certain rows or other processing on specific records - thrust::sort(rmm::exec_policy(stream), rec_starts.begin(), rec_starts.end()); - - auto filtered_count = prefilter_count; - - // Exclude the ending newline as it does not precede a record start - if (h_data.back() == '\n') { filtered_count--; } - rec_starts.resize(filtered_count, stream); - - return rec_starts; -} - -/** - * @brief Uploads the relevant segment of the input json data onto the GPU. - * - * Sets the d_data_ data member. - * Only rows that need to be parsed are copied, based on the byte range - * Also updates the array of record starts to match the device data offset. - */ -rmm::device_uvector upload_data_to_device(json_reader_options const& reader_opts, - host_span h_data, - rmm::device_uvector& rec_starts, - rmm::cuda_stream_view stream) -{ - CUDF_FUNC_RANGE(); - size_t end_offset = h_data.size(); - - // Trim lines that are outside range - auto h_rec_starts = cudf::detail::make_std_vector_sync(rec_starts, stream); - - if (reader_opts.get_byte_range_size() != 0) { - auto it = h_rec_starts.end() - 1; - while (it >= h_rec_starts.begin() && *it > reader_opts.get_byte_range_size()) { - end_offset = *it; - --it; - } - h_rec_starts.erase(it + 1, h_rec_starts.end()); - } - - // Resize to exclude rows outside of the range - // Adjust row start positions to account for the data subcopy - size_t start_offset = h_rec_starts.front(); - rec_starts.resize(h_rec_starts.size(), stream); - thrust::transform(rmm::exec_policy(stream), - rec_starts.begin(), - rec_starts.end(), - thrust::make_constant_iterator(start_offset), - rec_starts.begin(), - thrust::minus()); - - size_t const bytes_to_upload = end_offset - start_offset; - CUDF_EXPECTS(bytes_to_upload <= h_data.size(), - "Error finding the record within the specified byte range.\n"); - - // Upload the raw data that is within the rows of interest - return cudf::detail::make_device_uvector_async( - h_data.subspan(start_offset, bytes_to_upload), stream, rmm::mr::get_current_device_resource()); -} - -std::pair, col_map_ptr_type> get_column_names_and_map( - parse_options_view const& parse_opts, - host_span h_data, - device_span rec_starts, - device_span d_data, - rmm::cuda_stream_view stream) -{ - // If file only contains one row, use the file size for the row size - uint64_t first_row_len = d_data.size(); - if (rec_starts.size() > 1) { - // Set first_row_len to the offset of the second row, if it exists - CUDF_CUDA_TRY(cudaMemcpyAsync( - &first_row_len, 
rec_starts.data() + 1, sizeof(uint64_t), cudaMemcpyDefault, stream.value())); - } - std::vector first_row(first_row_len); - CUDF_CUDA_TRY(cudaMemcpyAsync(first_row.data(), - d_data.data(), - first_row_len * sizeof(char), - cudaMemcpyDefault, - stream.value())); - stream.synchronize(); - - // Determine the row format between: - // JSON array - [val1, val2, ...] and - // JSON object - {"col1":val1, "col2":val2, ...} - // based on the top level opening bracket - auto const first_square_bracket = std::find(first_row.begin(), first_row.end(), '['); - auto const first_curly_bracket = std::find(first_row.begin(), first_row.end(), '{'); - CUDF_EXPECTS(first_curly_bracket != first_row.end() || first_square_bracket != first_row.end(), - "Input data is not a valid JSON file."); - // If the first opening bracket is '{', assume object format - if (first_curly_bracket < first_square_bracket) { - // use keys as column names if input rows are objects - return get_json_object_keys_hashes(parse_opts, h_data, rec_starts, d_data, stream); - } else { - int cols_found = 0; - bool quotation = false; - auto column_names = std::vector(); - for (size_t pos = 0; pos < first_row.size(); ++pos) { - // Flip the quotation flag if current character is a quotechar - if (first_row[pos] == parse_opts.quotechar) { - quotation = !quotation; - } - // Check if end of a column/row - else if (pos == first_row.size() - 1 || - (!quotation && first_row[pos] == parse_opts.delimiter)) { - column_names.emplace_back(std::to_string(cols_found++)); - } - } - return {column_names, col_map_type::create(0, stream)}; - } -} - -std::vector get_data_types(json_reader_options const& reader_opts, - parse_options_view const& parse_opts, - std::vector const& column_names, - col_map_type* column_map, - device_span rec_starts, - device_span data, - rmm::cuda_stream_view stream) -{ - bool has_to_infer_column_types = - std::visit([](auto const& dtypes) { return dtypes.empty(); }, reader_opts.get_dtypes()); - - if (!has_to_infer_column_types) { - return std::visit( - cudf::detail::visitor_overload{ - [&](std::vector const& dtypes) { - CUDF_EXPECTS(dtypes.size() == column_names.size(), "Must specify types for all columns"); - return dtypes; - }, - [&](std::map const& dtypes) { - std::vector sorted_dtypes; - std::transform(std::cbegin(column_names), - std::cend(column_names), - std::back_inserter(sorted_dtypes), - [&](auto const& column_name) { - auto const it = dtypes.find(column_name); - CUDF_EXPECTS(it != dtypes.end(), "Must specify types for all columns"); - return it->second; - }); - return sorted_dtypes; - }, - [&](std::map const& dtypes) { - std::vector sorted_dtypes; - std::transform(std::cbegin(column_names), - std::cend(column_names), - std::back_inserter(sorted_dtypes), - [&](auto const& column_name) { - auto const it = dtypes.find(column_name); - CUDF_EXPECTS(it != dtypes.end(), "Must specify types for all columns"); - return it->second.type; - }); - return sorted_dtypes; - }}, - reader_opts.get_dtypes()); - } else { - CUDF_EXPECTS(not rec_starts.empty(), "No data available for data type inference.\n"); - auto const num_columns = column_names.size(); - auto const do_set_null_count = column_map->capacity() > 0; - - auto const h_column_infos = detect_data_types( - parse_opts, data, rec_starts, do_set_null_count, num_columns, column_map, stream); - - auto get_type_id = [&](auto const& cinfo) { - auto int_count_total = - cinfo.big_int_count + cinfo.negative_small_int_count + cinfo.positive_small_int_count; - if (cinfo.null_count == 
static_cast(rec_starts.size())) { - // Entire column is NULL; allocate the smallest amount of memory - return type_id::INT8; - } else if (cinfo.string_count > 0) { - return type_id::STRING; - } else if (cinfo.datetime_count > 0) { - return type_id::TIMESTAMP_MILLISECONDS; - } else if (cinfo.float_count > 0) { - return type_id::FLOAT64; - } else if (cinfo.big_int_count == 0 && int_count_total != 0) { - return type_id::INT64; - } else if (cinfo.big_int_count != 0 && cinfo.negative_small_int_count != 0) { - return type_id::STRING; - } else if (cinfo.big_int_count != 0) { - return type_id::UINT64; - } else if (cinfo.bool_count > 0) { - return type_id::BOOL8; - } else { - CUDF_FAIL("Data type detection failed.\n"); - } - }; - - std::vector dtypes; - - std::transform(std::cbegin(h_column_infos), - std::cend(h_column_infos), - std::back_inserter(dtypes), - [&](auto const& cinfo) { return data_type{get_type_id(cinfo)}; }); - - return dtypes; - } -} - -table_with_metadata convert_data_to_table(parse_options_view const& parse_opts, - std::vector const& dtypes, - std::vector&& column_names, - col_map_type* column_map, - device_span rec_starts, - device_span data, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - auto const num_columns = dtypes.size(); - auto const num_records = rec_starts.size(); - - // alloc output buffers. - std::vector out_buffers; - for (size_t col = 0; col < num_columns; ++col) { - out_buffers.emplace_back(dtypes[col], num_records, true, stream, mr); - } - - thrust::host_vector h_dtypes(num_columns); - thrust::host_vector h_data(num_columns); - thrust::host_vector h_valid(num_columns); - - for (size_t i = 0; i < num_columns; ++i) { - h_dtypes[i] = dtypes[i]; - h_data[i] = out_buffers[i].data(); - h_valid[i] = out_buffers[i].null_mask(); - } - - auto d_dtypes = cudf::detail::make_device_uvector_async( - h_dtypes, stream, rmm::mr::get_current_device_resource()); - auto d_data = cudf::detail::make_device_uvector_async( - h_data, stream, rmm::mr::get_current_device_resource()); - auto d_valid = cudf::detail::make_device_uvector_async( - h_valid, stream, rmm::mr::get_current_device_resource()); - auto d_valid_counts = cudf::detail::make_zeroed_device_uvector_async( - num_columns, stream, rmm::mr::get_current_device_resource()); - - convert_json_to_columns( - parse_opts, data, rec_starts, d_dtypes, column_map, d_data, d_valid, d_valid_counts, stream); - - stream.synchronize(); - - // postprocess columns - auto target_chars = std::vector{'\\', '"', '\\', '\\', '\\', 't', '\\', 'r', '\\', 'b'}; - auto target_offsets = std::vector{0, 2, 4, 6, 8, 10}; - - auto repl_chars = std::vector{'"', '\\', '\t', '\r', '\b'}; - auto repl_offsets = std::vector{0, 1, 2, 3, 4, 5}; - - auto target = - make_strings_column(static_cast(target_offsets.size() - 1), - std::make_unique( - cudf::detail::make_device_uvector_async( - target_offsets, stream, rmm::mr::get_current_device_resource()), - rmm::device_buffer{}, - 0), - cudf::detail::make_device_uvector_async( - target_chars, stream, rmm::mr::get_current_device_resource()) - .release(), - 0, - {}); - auto repl = make_strings_column( - static_cast(repl_offsets.size() - 1), - std::make_unique(cudf::detail::make_device_uvector_async( - repl_offsets, stream, rmm::mr::get_current_device_resource()), - rmm::device_buffer{}, - 0), - cudf::detail::make_device_uvector_async( - repl_chars, stream, rmm::mr::get_current_device_resource()) - .release(), - 0, - {}); - - auto const h_valid_counts = 
cudf::detail::make_std_vector_sync(d_valid_counts, stream); - std::vector<std::unique_ptr<column>> out_columns; - for (size_t i = 0; i < num_columns; ++i) { - out_buffers[i].null_count() = num_records - h_valid_counts[i]; - - auto out_column = make_column(out_buffers[i], nullptr, std::nullopt, stream); - if (out_column->type().id() == type_id::STRING) { - // Need to remove escape character in case of '\"' and '\\' - out_columns.emplace_back(cudf::strings::detail::replace( - out_column->view(), target->view(), repl->view(), stream, mr)); - } else { - out_columns.emplace_back(std::move(out_column)); - } - if (out_columns.back()->null_count() == 0) { - out_columns.back()->set_null_mask(rmm::device_buffer{0, stream, mr}, 0); - } - } - - std::vector<column_name_info> column_infos; - column_infos.reserve(column_names.size()); - std::transform(std::make_move_iterator(column_names.begin()), - std::make_move_iterator(column_names.end()), - std::back_inserter(column_infos), - [](auto const& col_name) { return column_name_info{col_name}; }); - - // This is to ensure the stream-ordered make_stream_column calls above complete before - // the temporary std::vectors are destroyed on exit from this function. - stream.synchronize(); - - CUDF_EXPECTS(!out_columns.empty(), "No columns created from json input"); - - return table_with_metadata{std::make_unique<table>
(std::move(out_columns)), {column_infos}}; -} - -/** - * @brief Read an entire set or a subset of data from the source - * - * @param[in] options reader options with Number of bytes offset from the start, - * Bytes to read; use `0` for all remaining data - * @param[in] stream CUDA stream used for device memory operations and kernel launches. - * - * @return Table and its metadata - */ -table_with_metadata read_json(host_span<std::unique_ptr<datasource>> sources, - json_reader_options const& reader_opts, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - CUDF_EXPECTS(not sources.empty(), "No sources were defined"); - CUDF_EXPECTS(sources.size() == 1 or reader_opts.get_compression() == compression_type::NONE, - "Multiple compressed inputs are not supported"); - CUDF_EXPECTS(reader_opts.is_enabled_lines(), "Only JSON Lines format is currently supported.\n"); - - auto parse_opts = parse_options{',', '\n', '\"', '.'}; - - parse_opts.trie_true = cudf::detail::create_serialized_trie({"true"}, stream); - parse_opts.trie_false = cudf::detail::create_serialized_trie({"false"}, stream); - parse_opts.trie_na = cudf::detail::create_serialized_trie({"", "null"}, stream); - - parse_opts.dayfirst = reader_opts.is_enabled_dayfirst(); - - auto range_offset = reader_opts.get_byte_range_offset(); - auto range_size = reader_opts.get_byte_range_size(); - auto range_size_padded = reader_opts.get_byte_range_size_with_padding(); - - auto const h_raw_data = ingest_raw_input( - sources, reader_opts.get_compression(), range_offset, range_size, range_size_padded); - host_span<char const> h_data{reinterpret_cast<char const*>(h_raw_data.data()), h_raw_data.size()}; - - CUDF_EXPECTS(not h_data.empty(), "Ingest failed: uncompressed input data has zero size.\n"); - - auto d_data = rmm::device_uvector<char>(0, stream); - - if (should_load_whole_source(reader_opts)) { - d_data = cudf::detail::make_device_uvector_async( - h_data, stream, rmm::mr::get_current_device_resource()); - } - - auto rec_starts = find_record_starts(reader_opts, h_data, d_data, stream); - - CUDF_EXPECTS(rec_starts.size() > 0, "Error enumerating records.\n"); - - if (not should_load_whole_source(reader_opts)) { - d_data = upload_data_to_device(reader_opts, h_data, rec_starts, stream); - } - - CUDF_EXPECTS(not d_data.is_empty(), "Error uploading input data to the GPU.\n"); - - auto column_names_and_map = - get_column_names_and_map(parse_opts.view(), h_data, rec_starts, d_data, stream); - - auto column_names = std::get<0>(column_names_and_map); - auto column_map = std::move(std::get<1>(column_names_and_map)); - - CUDF_EXPECTS(not column_names.empty(), "Error determining column names.\n"); - - auto dtypes = get_data_types( - reader_opts, parse_opts.view(), column_names, column_map.get(), rec_starts, d_data, stream); - - CUDF_EXPECTS(not dtypes.empty(), "Error in data type detection.\n"); - - return convert_data_to_table(parse_opts.view(), - dtypes, - std::move(column_names), - column_map.get(), - rec_starts, - d_data, - stream, - mr); -} - -} // namespace cudf::io::json::detail::legacy diff --git a/cpp/src/io/json/read_json.cu b/cpp/src/io/json/read_json.cu index ea52dce020e..df5c7bc21e1 100644 --- a/cpp/src/io/json/read_json.cu +++ b/cpp/src/io/json/read_json.cu @@ -15,7 +15,6 @@ */ #include "io/comp/io_uncomp.hpp" -#include "io/json/legacy/read_json.hpp" #include "io/json/nested_json.hpp" #include "read_json.hpp" @@ -267,14 +266,6 @@ table_with_metadata read_json(host_span<std::unique_ptr<datasource>> sources, { CUDF_FUNC_RANGE(); - - // TODO remove this if-statement once legacy is removed -#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wdeprecated-declarations" - if (reader_opts.is_enabled_legacy()) { - return legacy::read_json(sources, reader_opts, stream, mr); - } -#pragma GCC diagnostic pop - if (reader_opts.get_byte_range_offset() != 0 or reader_opts.get_byte_range_size() != 0) { CUDF_EXPECTS(reader_opts.is_enabled_lines(), "Specifying a byte range is supported only for JSON Lines"); diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index db934818ae7..2b8c1b02b40 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -189,10 +189,6 @@ ConfigureTest( PERCENT 70 ) -# ################################################################################################## -# * hash_map tests -------------------------------------------------------------------------------- -ConfigureTest(HASH_MAP_TEST hash_map/map_test.cu) - # ################################################################################################## # * quantiles tests ------------------------------------------------------------------------------- ConfigureTest( diff --git a/cpp/tests/hash_map/map_test.cu b/cpp/tests/hash_map/map_test.cu deleted file mode 100644 index 4b10716706b..00000000000 --- a/cpp/tests/hash_map/map_test.cu +++ /dev/null @@ -1,217 +0,0 @@ -/* - * Copyright (c) 2018-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "hash/concurrent_unordered_map.cuh" - -#include -#include -#include - -#include -#include - -#include -#include - -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -template <typename K, typename V> -struct key_value_types { - using key_type = K; - using value_type = V; - using pair_type = thrust::pair<K, V>; - using map_type = concurrent_unordered_map<key_type, value_type>; -}; - -template <typename T> -struct InsertTest : public cudf::test::BaseFixture { - using key_type = typename T::key_type; - using value_type = typename T::value_type; - using pair_type = typename T::pair_type; - using map_type = typename T::map_type; - - InsertTest() - { - // prevent overflow of small types - const size_t input_size = - std::min(static_cast<size_t>(size), std::numeric_limits<key_type>::max()); - pairs.resize(input_size, cudf::get_default_stream()); - map = std::move(map_type::create(compute_hash_table_size(size), cudf::get_default_stream())); - cudf::get_default_stream().synchronize(); - } - - const cudf::size_type size{10000}; - rmm::device_uvector<pair_type> pairs{static_cast<size_t>(size), cudf::get_default_stream()}; - std::unique_ptr<map_type, std::function<void(map_type*)>> map; -}; - -using TestTypes = ::testing::Types, - key_value_types, - key_value_types, - key_value_types, - key_value_types>; - -TYPED_TEST_SUITE(InsertTest, TestTypes); - -template <typename map_type, typename pair_type> -struct insert_pair { - insert_pair(map_type _map) : map{_map} {} - - __device__ bool operator()(pair_type const& pair) - { - auto result = map.insert(pair); - if (result.first == map.end()) { return false; } - return result.second; - } - - map_type map; -}; - -template <typename map_type, typename pair_type> -struct find_pair { - find_pair(map_type _map) : map{_map} {} - - __device__ bool operator()(pair_type const& pair) - { - auto result = map.find(pair.first); - if (result == map.end()) { return false; } - return *result == pair; - } - map_type map; -}; - -template <typename pair_type, typename key_type = typename pair_type::first_type, typename value_type = typename pair_type::second_type> -struct unique_pair_generator { - __device__ pair_type operator()(cudf::size_type i) - { - return thrust::make_pair(key_type(i), value_type(i)); - } -}; - -template <typename pair_type, typename key_type = typename pair_type::first_type, typename value_type = typename pair_type::second_type> -struct identical_pair_generator { - identical_pair_generator(key_type k = 42, value_type v = 42) : key{k}, value{v} {} - __device__ pair_type operator()(cudf::size_type i) { return thrust::make_pair(key, value); } - key_type key; - value_type value; -}; - -template <typename pair_type, typename key_type = typename pair_type::first_type, typename value_type = typename pair_type::second_type> -struct identical_key_generator { - identical_key_generator(key_type k = 42) : key{k} {} - __device__ pair_type operator()(cudf::size_type i) - { - return thrust::make_pair(key, value_type(i)); - } - key_type key; -}; - -TYPED_TEST(InsertTest, UniqueKeysUniqueValues) -{ - using map_type = typename TypeParam::map_type; - using pair_type = typename TypeParam::pair_type; - thrust::tabulate(rmm::exec_policy(cudf::get_default_stream()), - this->pairs.begin(), - this->pairs.end(), - unique_pair_generator<pair_type>{}); - // All pairs should be new inserts - EXPECT_TRUE(thrust::all_of(rmm::exec_policy(cudf::get_default_stream()), - this->pairs.begin(), - this->pairs.end(), - insert_pair<map_type, pair_type>{*this->map})); - - // All pairs should be present in the map - EXPECT_TRUE(thrust::all_of(rmm::exec_policy(cudf::get_default_stream()), - this->pairs.begin(), - this->pairs.end(), - find_pair<map_type, pair_type>{*this->map})); -} - -TYPED_TEST(InsertTest, IdenticalKeysIdenticalValues) -{ - using map_type = typename TypeParam::map_type; - using pair_type = typename TypeParam::pair_type; - thrust::tabulate(rmm::exec_policy(cudf::get_default_stream()), - this->pairs.begin(), - this->pairs.end(), - identical_pair_generator<pair_type>{}); - // Insert a single pair - EXPECT_TRUE(thrust::all_of(rmm::exec_policy(cudf::get_default_stream()), -
this->pairs.begin(), - this->pairs.begin() + 1, - insert_pair<map_type, pair_type>{*this->map})); - // Identical inserts should all return false (no new insert) - EXPECT_FALSE(thrust::all_of(rmm::exec_policy(cudf::get_default_stream()), - this->pairs.begin(), - this->pairs.end(), - insert_pair<map_type, pair_type>{*this->map})); - - // All pairs should be present in the map - EXPECT_TRUE(thrust::all_of(rmm::exec_policy(cudf::get_default_stream()), - this->pairs.begin(), - this->pairs.end(), - find_pair<map_type, pair_type>{*this->map})); -} - -TYPED_TEST(InsertTest, IdenticalKeysUniqueValues) -{ - using map_type = typename TypeParam::map_type; - using pair_type = typename TypeParam::pair_type; - thrust::tabulate(rmm::exec_policy(cudf::get_default_stream()), - this->pairs.begin(), - this->pairs.end(), - identical_key_generator<pair_type>{}); - - // Insert a single pair - EXPECT_TRUE(thrust::all_of(rmm::exec_policy(cudf::get_default_stream()), - this->pairs.begin(), - this->pairs.begin() + 1, - insert_pair<map_type, pair_type>{*this->map})); - - // Identical key inserts should all return false (no new insert) - EXPECT_FALSE(thrust::all_of(rmm::exec_policy(cudf::get_default_stream()), - this->pairs.begin() + 1, - this->pairs.end(), - insert_pair<map_type, pair_type>{*this->map})); - - // Only first pair is present in map - EXPECT_TRUE(thrust::all_of(rmm::exec_policy(cudf::get_default_stream()), - this->pairs.begin(), - this->pairs.begin() + 1, - find_pair<map_type, pair_type>{*this->map})); - - EXPECT_FALSE(thrust::all_of(rmm::exec_policy(cudf::get_default_stream()), - this->pairs.begin() + 1, - this->pairs.end(), - find_pair<map_type, pair_type>{*this->map})); -} - -CUDF_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index 35e6adf20e7..9d766e80094 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ -264,13 +264,13 @@ struct JsonValidFixedPointReaderTest : public JsonFixedPointReaderTest<DecimalType>get_column(1), float64_wrapper{{1.1, 2.2, 3.3, 4.4}}); } -// This can be removed once the legacy option has been removed.
-// The read_json only throws with legacy(true) -TEST_F(JsonReaderTest, DISABLED_BadDtypeParams) -{ - std::string buffer = "[1,2,3,4]"; - - cudf::io::json_reader_options options_vec = - cudf::io::json_reader_options::builder(cudf::io::source_info{buffer.c_str(), buffer.size()}) - .lines(true) - .dtypes({dtype()}); - - // should throw because there are four columns and only one dtype - EXPECT_THROW(cudf::io::read_json(options_vec), cudf::logic_error); - - cudf::io::json_reader_options options_map = - cudf::io::json_reader_options::builder(cudf::io::source_info{buffer.c_str(), buffer.size()}) - .lines(true) - .dtypes(std::map{{"0", dtype()}, - {"1", dtype()}, - {"2", dtype()}, - {"wrong_name", dtype()}}); - // should throw because one of the columns is not in the dtype map - EXPECT_THROW(cudf::io::read_json(options_map), cudf::logic_error); -} - TEST_F(JsonReaderTest, JsonBasic) { std::string const fname = temp_env->get_temp_dir() + "JsonBasic.json"; @@ -1372,12 +1345,8 @@ TEST_F(JsonReaderTest, JsonLines) // Read test data via nested JSON reader auto const table = cudf::io::read_json(json_lines_options); - // Read test data via legacy, non-nested JSON lines reader - auto const legacy_reader_table = cudf::io::read_json(json_lines_options); - - // Verify that the data read via non-nested JSON lines reader matches the data read via nested - // JSON reader - CUDF_TEST_EXPECT_TABLES_EQUAL(legacy_reader_table.tbl->view(), table.tbl->view()); + // TODO: Rewrite this test to check against a fixed value + CUDF_TEST_EXPECT_TABLES_EQUAL(table.tbl->view(), table.tbl->view()); } TEST_F(JsonReaderTest, JsonLongString) @@ -1548,12 +1517,8 @@ TEST_F(JsonReaderTest, LinesNoOmissions) // Read test data via nested JSON reader auto const table = cudf::io::read_json(json_lines_options); - // Read test data via legacy, non-nested JSON lines reader - auto const legacy_reader_table = cudf::io::read_json(json_lines_options); - - // Verify that the data read via non-nested JSON lines reader matches the data read via - // nested JSON reader - CUDF_TEST_EXPECT_TABLES_EQUAL(legacy_reader_table.tbl->view(), table.tbl->view()); + // TODO: Rewrite this test to check against a fixed value + CUDF_TEST_EXPECT_TABLES_EQUAL(table.tbl->view(), table.tbl->view()); } } @@ -2440,7 +2405,7 @@ TEST_F(JsonReaderTest, MapTypes) struct JsonDelimiterParamTest : public cudf::test::BaseFixture, public testing::WithParamInterface {}; -// Parametrize qualifying JSON tests for executing both nested reader and legacy JSON lines reader +// Parametrize qualifying JSON tests for multiple delimiters INSTANTIATE_TEST_SUITE_P(JsonDelimiterParamTest, JsonDelimiterParamTest, ::testing::Values('\n', '\b', '\v', '\f', 'h')); diff --git a/cpp/tests/io/nested_json_test.cpp b/cpp/tests/io/nested_json_test.cpp index d6f800cce8b..5dc25133719 100644 --- a/cpp/tests/io/nested_json_test.cpp +++ b/cpp/tests/io/nested_json_test.cpp @@ -248,7 +248,7 @@ TEST_F(JsonTest, StackContextUtf8) struct JsonDelimiterParamTest : public cudf::test::BaseFixture, public testing::WithParamInterface {}; -// Parametrize qualifying JSON tests for executing both nested reader and legacy JSON lines reader +// Parametrize qualifying JSON tests for multiple delimiters INSTANTIATE_TEST_SUITE_P(JsonDelimiterParamTest, JsonDelimiterParamTest, ::testing::Values('\n', '\b', '\v', '\f', 'h')); diff --git a/python/cudf/cudf/_lib/json.pyx b/python/cudf/cudf/_lib/json.pyx index 283a451dd4a..242727163ee 100644 --- a/python/cudf/cudf/_lib/json.pyx +++ b/python/cudf/cudf/_lib/json.pyx @@ -47,7 
+47,6 @@ cpdef read_json(object filepaths_or_buffers, bool lines, object compression, object byte_range, - bool legacy, bool keep_quotes, bool mixed_types_as_string, bool prune_columns): @@ -119,7 +118,6 @@ cpdef read_json(object filepaths_or_buffers, .lines(c_lines) .byte_range_offset(c_range_offset) .byte_range_size(c_range_size) - .legacy(legacy) .build() ) if is_list_like_dtypes: diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/json.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/json.pxd index 7e64a4cae29..10e43467d57 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/json.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/json.pxd @@ -87,9 +87,6 @@ cdef extern from "cudf/io/json.hpp" \ json_reader_options_builder& dayfirst( bool val ) except + - json_reader_options_builder& legacy( - bool val - ) except + json_reader_options_builder& keep_quotes( bool val ) except + diff --git a/python/cudf/cudf/io/json.py b/python/cudf/cudf/io/json.py index 03d07fc3a50..7de9705e4cb 100644 --- a/python/cudf/cudf/io/json.py +++ b/python/cudf/cudf/io/json.py @@ -99,7 +99,6 @@ def read_json( lines, compression, byte_range, - False, keep_quotes, mixed_types_as_string, prune_columns, From 72aa271a6ad8cfdcd4373ceadd777b4800fd26c4 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 24 May 2024 06:24:37 -1000 Subject: [PATCH 005/340] Ensure cudf.Series(cudf.Series(...)) creates a reference to the same index (#15845) Aligns these behaviors ```python In [1]: import pandas as pd In [3]: ser1 = pd.Series(range(3), index=list("Abc")) In [4]: ser2 = pd.Series(ser1) In [5]: ser1.index is ser2.index Out[5]: True In [6]: import cudf In [7]: ser1 = cudf.Series(range(3), index=list("Abc")) In [8]: ser2 = cudf.Series(ser1) In [9]: ser1.index is ser2.index Out[9]: False ``` Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/15845 --- python/cudf/cudf/core/series.py | 4 +++- python/cudf/cudf/tests/test_series.py | 6 ++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 41fbf269699..908347e389b 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -595,8 +595,10 @@ def __init__( data = data.copy(deep=True) name_from_data = data.name column = as_column(data, nan_as_null=nan_as_null, dtype=dtype) - if isinstance(data, (pd.Series, Series)): + if isinstance(data, pd.Series): index_from_data = as_index(data.index) + elif isinstance(data, Series): + index_from_data = data.index elif isinstance(data, ColumnAccessor): raise TypeError( "Use cudf.Series._from_data for constructing a Series from " diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 9aeae566730..323716d5fc3 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -2835,3 +2835,9 @@ def test_timedelta_series_init(data): actual = cudf.Series(scalar) assert_eq(expected, actual) + + +def test_series_from_series_index_no_shallow_copy(): + ser1 = cudf.Series(range(3), index=list("abc")) + ser2 = cudf.Series(ser1) + assert ser1.index is ser2.index From 78a0314d809a24e26b86abecf8f935a4d4340550 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Fri, 24 May 2024 12:40:28 -0400 Subject: [PATCH 
006/340] Avoid unnecessary `Index` cast in `IndexedFrame.index` setter (#15843) Triaging recent dask-cuda [breakage](https://github.com/rapidsai/dask-cuda/actions/runs/9202583065/attempts/1) led me to https://github.com/rapidsai/cudf/pull/15781, where it seems like the passing of an index object directly to the `IndexedFrame.index` setter (and therefore, wrapping of this index in an `Index()` constructor) has caused proxifying issues on dask-cuda's end. cc @rjzamora @mroeschke Authors: - Charles Blackmon-Luca (https://github.com/charlesbluca) - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/15843 --- python/cudf/cudf/core/indexed_frame.py | 6 +++++- python/cudf/cudf/tests/test_index.py | 14 ++++++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 394904c5855..b4a689804c7 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -644,7 +644,11 @@ def index(self, value): f"Length mismatch: Expected axis has {old_length} elements, " f"new values have {len(value)} elements" ) - self._index = Index(value) + # avoid unnecessary cast to Index + if not isinstance(value, BaseIndex): + value = Index(value) + + self._index = value @_cudf_nvtx_annotate def replace( diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index 8e7532d044d..b92ae1b3364 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -3266,3 +3266,17 @@ def test_index_datetime_repeat(): actual = gidx.to_frame().repeat(5) assert_eq(actual.index, expected) + + +@pytest.mark.parametrize( + "index", + [ + cudf.Index([1]), + cudf.RangeIndex(1), + cudf.MultiIndex(levels=[[0]], codes=[[0]]), + ], +) +def test_index_assignment_no_shallow_copy(index): + df = cudf.DataFrame(range(1)) + df.index = index + assert df.index is index From 4a3315b55a89b2c92908eac8a6fd255a33843ba9 Mon Sep 17 00:00:00 2001 From: nvdbaranec <56695930+nvdbaranec@users.noreply.github.com> Date: Fri, 24 May 2024 13:46:27 -0500 Subject: [PATCH 007/340] Remove benchmark-specific use of pinned-pooled memory in Parquet multithreaded benchmark. (#15838) The benchmark was manually creating and using a pinned-pool rmm allocator which is now redundant, since cuIO itself does this by default. This PR removes it. 
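For reference, the deleted helper (visible in the diff below) amounted to one pattern: build a pool of pinned host memory and register it as cuIO's host resource. A minimal sketch of that pattern follows; the header paths and the helper name are assumptions for illustration, while the 256 MiB pool size comes from the removed code:

```cpp
// Sketch only: cuIO now installs an equivalent pinned pool by default,
// so the benchmark no longer needs to do this by hand.
#include <cudf/io/memory_resource.hpp>  // assumed location of set_host_memory_resource

#include <rmm/mr/device/pool_memory_resource.hpp>
#include <rmm/mr/pinned_host_memory_resource.hpp>

void install_pinned_pool()  // hypothetical name
{
  using host_pooled_mr = rmm::mr::pool_memory_resource<rmm::mr::pinned_host_memory_resource>;
  // Keep the upstream resource in static storage for the program's lifetime.
  static rmm::mr::pinned_host_memory_resource upstream{};
  static host_pooled_mr mr{&upstream, 256ul * 1024 * 1024};
  cudf::io::set_host_memory_resource(mr);
}
```

Holding the upstream in static storage also sidesteps the dangling pointer the removed helper risked by calling `.get()` on a temporary `shared_ptr`.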
Authors: - https://github.com/nvdbaranec - Nghia Truong (https://github.com/ttnghia) - Muhammad Haseeb (https://github.com/mhaseeb123) Approvers: - Paul Mattione (https://github.com/pmattione-nvidia) - Nghia Truong (https://github.com/ttnghia) - Muhammad Haseeb (https://github.com/mhaseeb123) - Vukasin Milovanovic (https://github.com/vuule) URL: https://github.com/rapidsai/cudf/pull/15838 --- .../io/parquet/parquet_reader_multithread.cpp | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp b/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp index fbdcfb0ade9..bd80c4e0e88 100644 --- a/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp +++ b/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp @@ -25,25 +25,12 @@ #include #include -#include -#include -#include - #include #include #include -// TODO: remove this once pinned/pooled is enabled by default in cuIO -void set_cuio_host_pinned_pool() -{ - using host_pooled_mr = rmm::mr::pool_memory_resource<rmm::mr::pinned_host_memory_resource>; - static std::shared_ptr<host_pooled_mr> mr = std::make_shared<host_pooled_mr>( - std::make_shared<rmm::mr::pinned_host_memory_resource>().get(), 256ul * 1024 * 1024); - cudf::io::set_host_memory_resource(*mr); -} - size_t get_num_reads(nvbench::state const& state) { return state.get_int64("num_threads"); } size_t get_read_size(nvbench::state const& state) @@ -105,8 +92,6 @@ void BM_parquet_multithreaded_read_common(nvbench::state& state, size_t const data_size = state.get_int64("total_data_size"); auto const num_threads = state.get_int64("num_threads"); - set_cuio_host_pinned_pool(); - auto streams = cudf::detail::fork_streams(cudf::get_default_stream(), num_threads); cudf::detail::thread_pool threads(num_threads); @@ -186,8 +171,6 @@ void BM_parquet_multithreaded_read_chunked_common(nvbench::state& state, size_t const input_limit = state.get_int64("input_limit"); size_t const output_limit = state.get_int64("output_limit"); - set_cuio_host_pinned_pool(); - auto streams = cudf::detail::fork_streams(cudf::get_default_stream(), num_threads); cudf::detail::thread_pool threads(num_threads); auto [source_sink_vector, total_file_size, num_files] = write_file_data(state, d_types); From 81cadb60b9cb8840e1700ecc223f651c97618e34 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 24 May 2024 10:20:21 -1000 Subject: [PATCH 008/340] Use ColumnAccessor row and column length attributes more consistently (#15857) Also ensures any calls to `_num_rows` uses the cached version Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/15857 --- python/cudf/cudf/core/dataframe.py | 29 +++++++++++++------------- python/cudf/cudf/core/frame.py | 2 +- python/cudf/cudf/core/indexed_frame.py | 8 ++++--- python/cudf/cudf/core/multiindex.py | 2 +- 4 files changed, 21 insertions(+), 20 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 1f530aa3108..acfc2d781a7 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1429,7 +1429,7 @@ def __setitem__(self, arg, value): else: # disc.
with pandas here # pandas raises key error here - self.insert(len(self._data), arg, value) + self.insert(self._num_columns, arg, value) elif can_convert_to_column(arg): mask = arg @@ -1846,7 +1846,7 @@ def _clean_renderable_dataframe(self, output): if lines[-1].startswith("["): lines = lines[:-1] lines.append( - "[%d rows x %d columns]" % (len(self), len(self._data.names)) + "[%d rows x %d columns]" % (len(self), self._num_columns) ) return "\n".join(lines) @@ -1901,7 +1901,7 @@ def _get_renderable_dataframe(self): else pd.options.display.width / 2 ) - if len(self) <= nrows and len(self._data.names) <= ncols: + if len(self) <= nrows and self._num_columns <= ncols: output = self.copy(deep=False) elif self.empty and len(self.index) > 0: max_seq_items = pd.options.display.max_seq_items @@ -1922,15 +1922,15 @@ def _get_renderable_dataframe(self): else: output = self.copy(deep=False) else: - left_cols = len(self._data.names) + left_cols = self._num_columns right_cols = 0 upper_rows = len(self) lower_rows = 0 if len(self) > nrows and nrows > 0: upper_rows = int(nrows / 2.0) + 1 lower_rows = upper_rows + (nrows % 2) - if len(self._data.names) > ncols: - right_cols = len(self._data.names) - int(ncols / 2.0) + if left_cols > ncols: + right_cols = left_cols - int(ncols / 2.0) # adjust right columns for output if multiindex. right_cols = ( right_cols - 1 @@ -1945,11 +1945,11 @@ def _get_renderable_dataframe(self): else: # If right_cols is 0 or negative, it means # self has lesser number of columns than ncols. - # Hence assign len(self._data.names) which + # Hence assign self._num_columns which # will result in empty `*_right` quadrants. # This is because `*_left` quadrants will # contain all columns. - right_cols = len(self._data.names) + right_cols = self._num_columns upper_left = self.head(upper_rows).iloc[:, :left_cols] upper_right = self.head(upper_rows).iloc[:, right_cols:] @@ -1983,8 +1983,7 @@ def _repr_html_(self): if lines[-2].startswith("
<p>"): lines = lines[:-2] lines.append( - "<p>%d rows × %d columns</p>" - % (len(self), len(self._data.names)) + "<p>%d rows × %d columns</p>" % (len(self), self._num_columns) ) lines.append("</div>") return "\n".join(lines) @@ -2660,9 +2659,9 @@ def columns(self, columns): level_names = (pd_columns.name,) label_dtype = pd_columns.dtype - if len(pd_columns) != len(self._data.names): + if len(pd_columns) != self._num_columns: raise ValueError( - f"Length mismatch: expected {len(self._data.names)} elements, " + f"Length mismatch: expected {self._num_columns} elements, " f"got {len(pd_columns)} elements" ) @@ -2683,7 +2682,7 @@ def _set_columns_like(self, other: ColumnAccessor) -> None: * The possible .columns.dtype * The .columns.names/name (depending on if it's a MultiIndex) """ - if len(self._data.names) != len(other.names): + if self._num_columns != len(other.names): raise ValueError( f"Length mismatch: expected {len(other)} elements, " f"got {len(self)} elements" ) @@ -3207,7 +3206,7 @@ def _insert(self, loc, name, value, nan_as_null=None, ignore_index=True): if name in self._data: raise NameError(f"duplicated column name {name}") - num_cols = len(self._data) + num_cols = self._num_columns if loc < 0: loc += num_cols + 1 @@ -5032,7 +5031,7 @@ def info( ) lines.append(index_summary) - if len(self._data) == 0: + if self._num_columns == 0: lines.append(f"Empty {type(self).__name__}") cudf.utils.ioutils.buffer_write_lines(buf, lines) return diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 92ca76d6ceb..7b561906afb 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -76,7 +76,7 @@ def _num_columns(self) -> int: @property def _num_rows(self) -> int: - return 0 if self._num_columns == 0 else len(self._data.columns[0]) + return self._data.nrows @property def _column_names(self) -> Tuple[Any, ...]: # TODO: Tuple[str]? diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index b4a689804c7..a31430e1571 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -289,6 +289,7 @@ def __init__(self, data=None, index=None): @property def _num_rows(self) -> int: # Important to use the index because the data may be empty. + # TODO: Remove once DataFrame.__init__ is cleaned up return len(self.index) @property @@ -448,6 +449,7 @@ def _scan(self, op, axis=None, skipna=True): def _check_data_index_length_match(self) -> None: # Validate that the number of rows in the data matches the index if the # data is not empty. This is a helper for the constructor. + # TODO: Use self._num_rows once DataFrame.__init__ is cleaned up if self._data.nrows > 0 and self._data.nrows != len(self.index): raise ValueError( f"Length of values ({self._data.nrows}) does not " @@ -639,7 +641,7 @@ def index(self, value): new_length = len(value) # A DataFrame with 0 columns can have an index of arbitrary length.
- if self._num_columns > 0 and new_length != old_length: raise ValueError( f"Length mismatch: Expected axis has {old_length} elements, " f"new values have {len(value)} elements" ) @@ -1129,7 +1131,7 @@ def dot(self, other, reflect=False): common = self._data.to_pandas_index().union( other.index.to_pandas() ) - if len(common) > len(self._data.names) or len(common) > len( + if len(common) > self._num_columns or len(common) > len( other.index ): raise ValueError("matrices are not aligned") @@ -2757,7 +2759,7 @@ def sort_index( out = self[labels] if ignore_index: out._data.rangeindex = True - out._data.names = list(range(len(self._data.names))) + out._data.names = list(range(self._num_columns)) return self._mimic_inplace(out, inplace=inplace) diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index c149a1028a0..049fac45ba8 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -527,7 +527,7 @@ def get_slice_bound(self, label, side, kind=None): @_cudf_nvtx_annotate def nlevels(self): """Integer number of levels in this MultiIndex.""" - return len(self._data) + return self._num_columns @property # type: ignore @_cudf_nvtx_annotate From d756c37ef3a9625862df849e03b503d990dc411b Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Fri, 24 May 2024 15:35:31 -0500 Subject: [PATCH 009/340] Implement `on_bad_lines` in json reader (#15834) Fixes: #15559 This PR implements `on_bad_lines` in json reader. When `on_bad_lines="recover"`, bad lines are replaced by `<NA>` values. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/15834 --- python/cudf/cudf/_lib/json.pyx | 15 ++++++++- .../cudf/_lib/pylibcudf/libcudf/io/json.pxd | 7 +++++ python/cudf/cudf/io/json.py | 18 ++++++----- python/cudf/cudf/tests/test_json.py | 31 +++++++++++++++++++ python/cudf/cudf/utils/ioutils.py | 5 +++ 5 files changed, 67 insertions(+), 9 deletions(-) diff --git a/python/cudf/cudf/_lib/json.pyx b/python/cudf/cudf/_lib/json.pyx index 283a451dd4a..a8fef907bad 100644 --- a/python/cudf/cudf/_lib/json.pyx +++ b/python/cudf/cudf/_lib/json.pyx @@ -24,6 +24,7 @@ from cudf._lib.io.utils cimport ( from cudf._lib.pylibcudf.libcudf.io.data_sink cimport data_sink from cudf._lib.pylibcudf.libcudf.io.json cimport ( json_reader_options, + json_recovery_mode_t, json_writer_options, read_json as libcudf_read_json, schema_element, @@ -42,6 +43,15 @@ from cudf._lib.types cimport dtype_to_data_type from cudf._lib.utils cimport data_from_unique_ptr, table_view_from_table +cdef json_recovery_mode_t _get_json_recovery_mode(object on_bad_lines): + if on_bad_lines.lower() == "error": + return json_recovery_mode_t.FAIL + elif on_bad_lines.lower() == "recover": + return json_recovery_mode_t.RECOVER_WITH_NULL + else: + raise TypeError(f"Invalid parameter for {on_bad_lines=}") + + cpdef read_json(object filepaths_or_buffers, object dtype, bool lines, @@ -49,7 +59,8 @@ cpdef read_json(object filepaths_or_buffers, object byte_range, bool keep_quotes, bool mixed_types_as_string, - bool prune_columns): + bool prune_columns, + object on_bad_lines): """ Cython function to call into libcudf API, see `read_json`.
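The user-facing behavior this recovery mode maps to looks like the following; a sketch mirroring the new test added further below, not additional patch content:

```python
import io
import cudf

# One malformed record ("abc") among otherwise valid JSON Lines rows.
buf = io.StringIO('{"a":1,"b":10}\n{"a":2,"b":11}\nabc\n{"a":3,"b":12}\n')

# on_bad_lines="error" (the default) raises on the bad line;
# on_bad_lines="recover" keeps the row and fills it with nulls.
df = cudf.read_json(buf, lines=True, orient="records", on_bad_lines="recover")
# df["a"] -> [1, 2, <NA>, 3]; df["b"] -> [10, 11, <NA>, 12]
```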
@@ -118,6 +129,7 @@ cpdef read_json(object filepaths_or_buffers, .lines(c_lines) .byte_range_offset(c_range_offset) .byte_range_size(c_range_size) + .recovery_mode(_get_json_recovery_mode(on_bad_lines)) .build() ) if is_list_like_dtypes: @@ -128,6 +140,7 @@ cpdef read_json(object filepaths_or_buffers, opts.enable_keep_quotes(keep_quotes) opts.enable_mixed_types_as_string(mixed_types_as_string) opts.enable_prune_columns(prune_columns) + # Read JSON cdef cudf_io_types.table_with_metadata c_result diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/json.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/json.pxd index 10e43467d57..2e50cccd132 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/json.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/json.pxd @@ -19,6 +19,10 @@ cdef extern from "cudf/io/json.hpp" \ data_type type map[string, schema_element] child_types + cdef enum json_recovery_mode_t: + FAIL "cudf::io::json_recovery_mode_t::FAIL" + RECOVER_WITH_NULL "cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL" + cdef cppclass json_reader_options: json_reader_options() except + cudf_io_types.source_info get_source() except + @@ -90,6 +94,9 @@ cdef extern from "cudf/io/json.hpp" \ json_reader_options_builder& keep_quotes( bool val ) except + + json_reader_options_builder& recovery_mode( + json_recovery_mode_t val + ) except + json_reader_options build() except + diff --git a/python/cudf/cudf/io/json.py b/python/cudf/cudf/io/json.py index 7de9705e4cb..dd4a0d9eb07 100644 --- a/python/cudf/cudf/io/json.py +++ b/python/cudf/cudf/io/json.py @@ -27,6 +27,7 @@ def read_json( storage_options=None, mixed_types_as_string=False, prune_columns=False, + on_bad_lines="error", *args, **kwargs, ): @@ -94,14 +95,15 @@ def read_json( filepaths_or_buffers.append(tmp_source) df = libjson.read_json( - filepaths_or_buffers, - dtype, - lines, - compression, - byte_range, - keep_quotes, - mixed_types_as_string, - prune_columns, + filepaths_or_buffers=filepaths_or_buffers, + dtype=dtype, + lines=lines, + compression=compression, + byte_range=byte_range, + keep_quotes=keep_quotes, + mixed_types_as_string=mixed_types_as_string, + prune_columns=prune_columns, + on_bad_lines=on_bad_lines, ) else: warnings.warn( diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py index 51287fe26a0..ba6a8f94719 100644 --- a/python/cudf/cudf/tests/test_json.py +++ b/python/cudf/cudf/tests/test_json.py @@ -1392,3 +1392,34 @@ def test_json_nested_mixed_types_error(jsonl_string): orient="records", lines=True, ) + + +@pytest.mark.parametrize("on_bad_lines", ["error", "recover", "abc"]) +def test_json_reader_on_bad_lines(on_bad_lines): + json_input = StringIO( + '{"a":1,"b":10}\n{"a":2,"b":11}\nabc\n{"a":3,"b":12}\n' + ) + if on_bad_lines == "error": + with pytest.raises(RuntimeError): + cudf.read_json( + json_input, + lines=True, + orient="records", + on_bad_lines=on_bad_lines, + ) + elif on_bad_lines == "recover": + actual = cudf.read_json( + json_input, lines=True, orient="records", on_bad_lines=on_bad_lines + ) + expected = cudf.DataFrame( + {"a": [1, 2, None, 3], "b": [10, 11, None, 12]} + ) + assert_eq(actual, expected) + else: + with pytest.raises(TypeError): + cudf.read_json( + json_input, + lines=True, + orient="records", + on_bad_lines=on_bad_lines, + ) diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index 1366a0b8e84..0209c692935 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -739,6 +739,11 @@ If True, only 
return those columns mentioned in the dtype argument. If `False` dtype argument is used a type inference suggestion. +on_bad_lines : {'error', 'recover'}, default 'error' + Specifies what to do upon encountering a bad line. Allowed values are : + + - ``'error'``, raise an Exception when a bad line is encountered. + - ``'recover'``, fills the row with `<NA>` when a bad line is encountered. Returns ------- result : Series or DataFrame, depending on the value of `typ`. From 8458306ecbc17d3977a98e2e33752b678394f588 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Fri, 24 May 2024 15:04:08 -0700 Subject: [PATCH 010/340] Migrate reshape.pxd to pylibcudf (#15827) xref #15162 Authors: - Thomas Li (https://github.com/lithomas1) Approvers: - Matthew Roeschke (https://github.com/mroeschke) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/15827 --- .../user_guide/api_docs/pylibcudf/index.rst | 1 + .../user_guide/api_docs/pylibcudf/reshape.rst | 6 ++ .../cudf/cudf/_lib/pylibcudf/CMakeLists.txt | 1 + python/cudf/cudf/_lib/pylibcudf/__init__.pxd | 1 + python/cudf/cudf/_lib/pylibcudf/__init__.py | 1 + python/cudf/cudf/_lib/pylibcudf/reshape.pxd | 11 ++++ python/cudf/cudf/_lib/pylibcudf/reshape.pyx | 65 +++++++++++++++++++ python/cudf/cudf/_lib/reshape.pyx | 42 +++++------- .../cudf/cudf/pylibcudf_tests/test_reshape.py | 43 ++++++++++++ 9 files changed, 147 insertions(+), 24 deletions(-) create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/reshape.rst create mode 100644 python/cudf/cudf/_lib/pylibcudf/reshape.pxd create mode 100644 python/cudf/cudf/_lib/pylibcudf/reshape.pyx create mode 100644 python/cudf/cudf/pylibcudf_tests/test_reshape.py diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst index 8cad95f61ae..1c1b37e2c37 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst @@ -20,6 +20,7 @@ This page provides API documentation for pylibcudf. lists merge reduce + reshape rolling scalar search diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/reshape.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/reshape.rst new file mode 100644 index 00000000000..964cef04923 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/reshape.rst @@ -0,0 +1,6 @@ +======= +reshape +======= + +.. automodule:: cudf._lib.pylibcudf.reshape + :members: diff --git a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt index efc978fc6d0..7d01671e84f 100644 --- a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt +++ b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt @@ -27,6 +27,7 @@ set(cython_sources merge.pyx reduce.pyx replace.pyx + reshape.pyx rolling.pyx scalar.pyx search.pyx diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd index 5adefa5fd93..91c3fdf5602 100644 --- a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd @@ -13,6 +13,7 @@ from . 
cimport ( merge, reduce, replace, + reshape, rolling, search, sorting, diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.py b/python/cudf/cudf/_lib/pylibcudf/__init__.py index 89f874f5fa5..fcdc4992f00 100644 --- a/python/cudf/cudf/_lib/pylibcudf/__init__.py +++ b/python/cudf/cudf/_lib/pylibcudf/__init__.py @@ -13,6 +13,7 @@ merge, reduce, replace, + reshape, rolling, search, sorting, diff --git a/python/cudf/cudf/_lib/pylibcudf/reshape.pxd b/python/cudf/cudf/_lib/pylibcudf/reshape.pxd new file mode 100644 index 00000000000..a7cc45d7a08 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/reshape.pxd @@ -0,0 +1,11 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from cudf._lib.pylibcudf.libcudf.types cimport size_type + +from .column cimport Column +from .scalar cimport Scalar +from .table cimport Table + + +cpdef Column interleave_columns(Table source_table) +cpdef Table tile(Table source_table, size_type count) diff --git a/python/cudf/cudf/_lib/pylibcudf/reshape.pyx b/python/cudf/cudf/_lib/pylibcudf/reshape.pyx new file mode 100644 index 00000000000..b68eba48cd6 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/reshape.pyx @@ -0,0 +1,65 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move + +from cudf._lib.pylibcudf.libcudf.column.column cimport column +from cudf._lib.pylibcudf.libcudf.reshape cimport ( + interleave_columns as cpp_interleave_columns, + tile as cpp_tile, +) +from cudf._lib.pylibcudf.libcudf.table.table cimport table +from cudf._lib.pylibcudf.libcudf.types cimport size_type + +from .column cimport Column +from .table cimport Table + + +cpdef Column interleave_columns(Table source_table): + """Interleave columns of a table into a single column. + + Converts the column major table `input` into a row major column. + + Example: + in = [[A1, A2, A3], [B1, B2, B3]] + return = [A1, B1, A2, B2, A3, B3] + + Parameters + ---------- + source_table: Table + The input table to interleave + + Returns + ------- + Column + A new column which is the result of interleaving the input columns + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move(cpp_interleave_columns(source_table.view())) + + return Column.from_libcudf(move(c_result)) + + +cpdef Table tile(Table source_table, size_type count): + """Repeats the rows from input table count times to form a new table. + + Parameters + ---------- + source_table: Table + The input table containing rows to be repeated + count: size_type + The number of times to tile "rows". 
Must be non-negative + + Returns + ------- + Table + The table containing the tiled "rows" + """ + cdef unique_ptr[table] c_result + + with nogil: + c_result = move(cpp_tile(source_table.view(), count)) + + return Table.from_libcudf(move(c_result)) diff --git a/python/cudf/cudf/_lib/reshape.pyx b/python/cudf/cudf/_lib/reshape.pyx index 48e386bcf02..6bba8f0df35 100644 --- a/python/cudf/cudf/_lib/reshape.pyx +++ b/python/cudf/cudf/_lib/reshape.pyx @@ -2,39 +2,33 @@ from cudf.core.buffer import acquire_spill_lock -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - from cudf._lib.column cimport Column -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.reshape cimport ( - interleave_columns as cpp_interleave_columns, - tile as cpp_tile, -) -from cudf._lib.pylibcudf.libcudf.table.table cimport table -from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view from cudf._lib.pylibcudf.libcudf.types cimport size_type -from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns +from cudf._lib.utils cimport columns_from_pylibcudf_table + +import cudf._lib.pylibcudf as plc @acquire_spill_lock() def interleave_columns(list source_columns): - cdef table_view c_view = table_view_from_columns(source_columns) - cdef unique_ptr[column] c_result - - with nogil: - c_result = move(cpp_interleave_columns(c_view)) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf( + plc.reshape.interleave_columns( + plc.Table([ + c.to_pylibcudf(mode="read") for c in source_columns + ]) + ) + ) @acquire_spill_lock() def tile(list source_columns, size_type count): cdef size_type c_count = count - cdef table_view c_view = table_view_from_columns(source_columns) - cdef unique_ptr[table] c_result - - with nogil: - c_result = move(cpp_tile(c_view, c_count)) - return columns_from_unique_ptr(move(c_result)) + return columns_from_pylibcudf_table( + plc.reshape.tile( + plc.Table([ + c.to_pylibcudf(mode="read") for c in source_columns + ]), + c_count + ) + ) diff --git a/python/cudf/cudf/pylibcudf_tests/test_reshape.py b/python/cudf/cudf/pylibcudf_tests/test_reshape.py new file mode 100644 index 00000000000..b8b914f3f09 --- /dev/null +++ b/python/cudf/cudf/pylibcudf_tests/test_reshape.py @@ -0,0 +1,43 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +import pyarrow as pa +import pytest +from utils import assert_column_eq, assert_table_eq + +from cudf._lib import pylibcudf as plc + + +@pytest.fixture(scope="module") +def reshape_data(): + data = [[1, 2, 3], [4, 5, 6]] + return data + + +@pytest.fixture(scope="module") +def reshape_plc_tbl(reshape_data): + arrow_tbl = pa.Table.from_arrays(reshape_data, names=["a", "b"]) + plc_tbl = plc.interop.from_arrow(arrow_tbl) + return plc_tbl + + +def test_interleave_columns(reshape_data, reshape_plc_tbl): + res = plc.reshape.interleave_columns(reshape_plc_tbl) + + interleaved_data = [pa.array(pair) for pair in zip(*reshape_data)] + + expect = pa.concat_arrays(interleaved_data) + + assert_column_eq(res, expect) + + +@pytest.mark.parametrize("cnt", [0, 1, 3]) +def test_tile(reshape_data, reshape_plc_tbl, cnt): + res = plc.reshape.tile(reshape_plc_tbl, cnt) + + tiled_data = [pa.array(col * cnt) for col in reshape_data] + + expect = pa.Table.from_arrays( + tiled_data, schema=plc.interop.to_arrow(reshape_plc_tbl).schema + ) + + assert_table_eq(res, expect) From bdafa738cb7c0b4354efb22783ffd5d6edefebd6 Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Tue, 28 May 2024 22:50:03 -0500 Subject: [PATCH 011/340] Migrate string `capitalize` APIs to `pylibcudf` (#15503) This PR creates the `pylibcudf.strings.capitalize` namespace and migrates the cuDF cython to use it. Depends on https://github.com/rapidsai/cudf/pull/15489 Part of https://github.com/rapidsai/cudf/issues/15162 Authors: - https://github.com/brandon-b-miller Approvers: - Kyle Edwards (https://github.com/KyleFromNVIDIA) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/15503 --- .../_lib/pylibcudf/libcudf/CMakeLists.txt | 2 +- .../libcudf/scalar/scalar_factories.pxd | 10 +++ .../pylibcudf/libcudf/strings/CMakeLists.txt | 23 +++++++ .../pylibcudf/libcudf/strings/capitalize.pxd | 12 +++- .../_lib/pylibcudf/libcudf/strings/case.pxd | 6 ++ .../pylibcudf/libcudf/strings/char_types.pxd | 23 +++---- .../pylibcudf/libcudf/strings/char_types.pyx | 0 .../_lib/pylibcudf/strings/CMakeLists.txt | 3 +- .../cudf/_lib/pylibcudf/strings/__init__.pxd | 2 +- .../cudf/_lib/pylibcudf/strings/__init__.py | 2 +- .../_lib/pylibcudf/strings/capitalize.pxd | 9 +++ .../_lib/pylibcudf/strings/capitalize.pyx | 62 +++++++++++++++++++ .../_lib/pylibcudf/strings/char_types.pxd | 5 ++ .../_lib/pylibcudf/strings/char_types.pyx | 4 ++ python/cudf/cudf/_lib/strings/capitalize.pyx | 48 +++++--------- .../cudf/cudf/pylibcudf_tests/common/utils.py | 1 - .../pylibcudf_tests/test_string_capitalize.py | 54 ++++++++++++++++ 17 files changed, 217 insertions(+), 49 deletions(-) create mode 100644 python/cudf/cudf/_lib/pylibcudf/libcudf/scalar/scalar_factories.pxd create mode 100644 python/cudf/cudf/_lib/pylibcudf/libcudf/strings/CMakeLists.txt create mode 100644 python/cudf/cudf/_lib/pylibcudf/libcudf/strings/char_types.pyx create mode 100644 python/cudf/cudf/_lib/pylibcudf/strings/capitalize.pxd create mode 100644 python/cudf/cudf/_lib/pylibcudf/strings/capitalize.pyx create mode 100644 python/cudf/cudf/_lib/pylibcudf/strings/char_types.pxd create mode 100644 python/cudf/cudf/_lib/pylibcudf/strings/char_types.pyx create mode 100644 python/cudf/cudf/pylibcudf_tests/test_string_capitalize.py diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/libcudf/CMakeLists.txt index 89d3dc66f00..8a6ce6a5187 100644 --- 
a/python/cudf/cudf/_lib/pylibcudf/libcudf/CMakeLists.txt +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/CMakeLists.txt @@ -17,9 +17,9 @@ set(cython_sources aggregation.pyx binaryop.pyx copying.pyx replace.pyx reduce.p ) set(linked_libraries cudf::cudf) - rapids_cython_create_modules( CXX SOURCE_FILES "${cython_sources}" LINKED_LIBRARIES "${linked_libraries}" ASSOCIATED_TARGETS cudf MODULE_PREFIX cpp ) +add_subdirectory(strings) diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/scalar/scalar_factories.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/scalar/scalar_factories.pxd new file mode 100644 index 00000000000..5c4e5bf346f --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/scalar/scalar_factories.pxd @@ -0,0 +1,10 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.string cimport string + +from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar + + +cdef extern from "cudf/scalar/scalar_factories.hpp" namespace "cudf" nogil: + cdef unique_ptr[scalar] make_string_scalar(const string & _string) except + diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/CMakeLists.txt new file mode 100644 index 00000000000..930c22781d0 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/CMakeLists.txt @@ -0,0 +1,23 @@ +# ============================================================================= +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. 
+# ============================================================================= + +set(cython_sources char_types.pyx) + +set(linked_libraries cudf::cudf) + +rapids_cython_create_modules( + CXX + SOURCE_FILES "${cython_sources}" + LINKED_LIBRARIES "${linked_libraries}" ASSOCIATED_TARGETS cudf MODULE_PREFIX cpp_strings +) diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/capitalize.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/capitalize.pxd index f95d4f35566..b0771e16680 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/capitalize.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/capitalize.pxd @@ -3,14 +3,22 @@ from libcpp.memory cimport unique_ptr from cudf._lib.pylibcudf.libcudf.column.column cimport column from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view +from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar +from cudf._lib.pylibcudf.libcudf.strings.char_types cimport ( + string_character_types, +) cdef extern from "cudf/strings/capitalize.hpp" namespace "cudf::strings" nogil: cdef unique_ptr[column] capitalize( - const column_view & strings) except + + const column_view & strings, + const string_scalar & delimiters + ) except + cdef unique_ptr[column] title( - const column_view & strings) except + + const column_view & strings, + string_character_types sequence_type + ) except + cdef unique_ptr[column] is_title( const column_view & strings) except + diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/case.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/case.pxd index 9ccd2737afe..82c146b0023 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/case.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/case.pxd @@ -6,6 +6,12 @@ from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view cdef extern from "cudf/strings/case.hpp" namespace "cudf::strings" nogil: + cdef unique_ptr[column] capitalize( + const column_view & input) except + + + cdef unique_ptr[column] is_title( + const column_view & input) except + + cdef unique_ptr[column] to_lower( const column_view & strings) except + diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/char_types.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/char_types.pxd index 408b3687c4a..f63e1a93f91 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/char_types.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/char_types.pxd @@ -1,5 +1,6 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION. 
+from libc.stdint cimport uint32_t from libcpp.memory cimport unique_ptr from cudf._lib.pylibcudf.libcudf.column.column cimport column @@ -10,17 +11,17 @@ from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar cdef extern from "cudf/strings/char_types/char_types.hpp" \ namespace "cudf::strings" nogil: - ctypedef enum string_character_types: - DECIMAL 'cudf::strings::string_character_types::DECIMAL' - NUMERIC 'cudf::strings::string_character_types::NUMERIC' - DIGIT 'cudf::strings::string_character_types::DIGIT' - ALPHA 'cudf::strings::string_character_types::ALPHA' - SPACE 'cudf::strings::string_character_types::SPACE' - UPPER 'cudf::strings::string_character_types::UPPER' - LOWER 'cudf::strings::string_character_types::LOWER' - ALPHANUM 'cudf::strings::string_character_types::ALPHANUM' - CASE_TYPES 'cudf::strings::string_character_types::CASE_TYPES' - ALL_TYPES 'cudf::strings::string_character_types::ALL_TYPES' + cpdef enum class string_character_types(uint32_t): + DECIMAL + NUMERIC + DIGIT + ALPHA + SPACE + UPPER + LOWER + ALPHANUM + CASE_TYPES + ALL_TYPES cdef extern from "cudf/strings/char_types/char_types.hpp" \ namespace "cudf::strings" nogil: diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/char_types.pyx b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/char_types.pyx new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt index c42b57ece63..0e9c1c916f0 100644 --- a/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt +++ b/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt @@ -12,7 +12,8 @@ # the License. # ============================================================================= -set(cython_sources case.pyx find.pyx) +set(cython_sources capitalize.pyx case.pyx char_types.pyx find.pyx) + set(linked_libraries cudf::cudf) rapids_cython_create_modules( CXX diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd index 33e2d56c087..ec3dbc150b5 100644 --- a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd @@ -1,3 +1,3 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from . cimport case, find +from . cimport capitalize, case, char_types, find diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py index 9220f6bd045..3793bda0aa4 100644 --- a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py +++ b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py @@ -1,3 +1,3 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from . import case, find +from . import capitalize, case, char_types, find diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/capitalize.pxd b/python/cudf/cudf/_lib/pylibcudf/strings/capitalize.pxd new file mode 100644 index 00000000000..9acf189fc23 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/strings/capitalize.pxd @@ -0,0 +1,9 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +from cudf._lib.pylibcudf.column cimport Column +from cudf._lib.pylibcudf.scalar cimport Scalar + + +cpdef Column capitalize(Column input, Scalar delimiters=*) +cpdef Column title(Column input) +cpdef Column is_title(Column input) diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/capitalize.pyx b/python/cudf/cudf/_lib/pylibcudf/strings/capitalize.pyx new file mode 100644 index 00000000000..d3f79088018 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/strings/capitalize.pyx @@ -0,0 +1,62 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move + +from cudf._lib.pylibcudf.column cimport Column +from cudf._lib.pylibcudf.libcudf.column.column cimport column +from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar +from cudf._lib.pylibcudf.libcudf.scalar.scalar_factories cimport ( + make_string_scalar as cpp_make_string_scalar, +) +from cudf._lib.pylibcudf.libcudf.strings cimport capitalize as cpp_capitalize +from cudf._lib.pylibcudf.scalar cimport Scalar +from cudf._lib.pylibcudf.strings.char_types cimport string_character_types + +from cython.operator import dereference + + +cpdef Column capitalize( + Column input, + Scalar delimiters=None + # TODO: default scalar values + # https://github.com/rapidsai/cudf/issues/15505 +): + + cdef unique_ptr[column] c_result + + if delimiters is None: + delimiters = Scalar.from_libcudf( + cpp_make_string_scalar("".encode()) + ) + + cdef const string_scalar* cpp_delimiters = <const string_scalar*>( + delimiters.c_obj.get() + ) + + with nogil: + c_result = cpp_capitalize.capitalize( + input.view(), + dereference(cpp_delimiters) + ) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column title( + Column input, + string_character_types sequence_type=string_character_types.ALPHA +): + cdef unique_ptr[column] c_result + with nogil: + c_result = cpp_capitalize.title(input.view(), sequence_type) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column is_title(Column input): + cdef unique_ptr[column] c_result + with nogil: + c_result = cpp_capitalize.is_title(input.view()) + + return Column.from_libcudf(move(c_result)) diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/char_types.pxd b/python/cudf/cudf/_lib/pylibcudf/strings/char_types.pxd new file mode 100644 index 00000000000..a80e02f520c --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/strings/char_types.pxd @@ -0,0 +1,5 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from cudf._lib.pylibcudf.libcudf.strings.char_types cimport ( + string_character_types, +) diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/char_types.pyx b/python/cudf/cudf/_lib/pylibcudf/strings/char_types.pyx new file mode 100644 index 00000000000..d96161951c6 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/strings/char_types.pyx @@ -0,0 +1,4 @@ +# Copyright (c) 2024, NVIDIA CORPORATION.
+ +from cudf._lib.pylibcudf.libcudf.strings.char_types import \ + string_character_types as StringCharacterTypes # no-cython-lint diff --git a/python/cudf/cudf/_lib/strings/capitalize.pyx b/python/cudf/cudf/_lib/strings/capitalize.pyx index 1420a2bbaf2..b3ca6a5ac8f 100644 --- a/python/cudf/cudf/_lib/strings/capitalize.pyx +++ b/python/cudf/cudf/_lib/strings/capitalize.pyx @@ -2,47 +2,33 @@ from cudf.core.buffer import acquire_spill_lock -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - from cudf._lib.column cimport Column -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view -from cudf._lib.pylibcudf.libcudf.strings.capitalize cimport ( - capitalize as cpp_capitalize, - is_title as cpp_is_title, - title as cpp_title, -) + +import cudf._lib.pylibcudf as plc @acquire_spill_lock() def capitalize(Column source_strings): - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_capitalize(source_view)) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf( + plc.strings.capitalize.capitalize( + source_strings.to_pylibcudf(mode="read") + ) + ) @acquire_spill_lock() def title(Column source_strings): - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_title(source_view)) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf( + plc.strings.capitalize.title( + source_strings.to_pylibcudf(mode="read") + ) + ) @acquire_spill_lock() def is_title(Column source_strings): - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_is_title(source_view)) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf( + plc.strings.capitalize.is_title( + source_strings.to_pylibcudf(mode="read") + ) + ) diff --git a/python/cudf/cudf/pylibcudf_tests/common/utils.py b/python/cudf/cudf/pylibcudf_tests/common/utils.py index 6636ab9e5f8..596cd2c92ae 100644 --- a/python/cudf/cudf/pylibcudf_tests/common/utils.py +++ b/python/cudf/cudf/pylibcudf_tests/common/utils.py @@ -35,7 +35,6 @@ def assert_column_eq(plc_column: plc.Column, pa_array: pa.Array) -> None: plc_pa = plc_pa.combine_chunks() if isinstance(pa_array, pa.ChunkedArray): pa_array = pa_array.combine_chunks() - assert plc_pa.equals(pa_array) diff --git a/python/cudf/cudf/pylibcudf_tests/test_string_capitalize.py b/python/cudf/cudf/pylibcudf_tests/test_string_capitalize.py new file mode 100644 index 00000000000..dd7e96e871b --- /dev/null +++ b/python/cudf/cudf/pylibcudf_tests/test_string_capitalize.py @@ -0,0 +1,54 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
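+#
+# These tests exercise the new pylibcudf capitalize/title/is_title bindings
+# by comparing their results against the equivalent pyarrow compute kernels
+# on the same host data.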
+ +import pyarrow as pa +import pytest +from utils import assert_column_eq + +import cudf._lib.pylibcudf as plc + + +@pytest.fixture(scope="module") +def pa_data(): + data = [ + "leopard", + "Golden Eagle", + "SNAKE", + "", + "!A", + "hello World", + "A B C", + "#", + "AƻB", + "Ⓑⓖ", + "Art of War", + "The quick bRoWn fox juMps over the laze DOG", + '123nr98nv9rev!$#INF4390v03n1243<>?}{:-"', + "accénted", + None, + ] + return pa.array(data) + + +@pytest.fixture(scope="module") +def plc_data(pa_data): + return plc.interop.from_arrow(pa_data) + + +def test_capitalize(plc_data, pa_data): + got = plc.strings.capitalize.capitalize(plc_data) + expected = pa.compute.utf8_capitalize(pa_data) + assert_column_eq(got, expected) + + +def test_title(plc_data, pa_data): + got = plc.strings.capitalize.title( + plc_data, plc.strings.char_types.StringCharacterTypes.CASE_TYPES + ) + expected = pa.compute.utf8_title(pa_data) + assert_column_eq(got, expected) + + +def test_is_title(plc_data, pa_data): + got = plc.strings.capitalize.is_title(plc_data) + expected = pa.compute.utf8_is_title(pa_data) + assert_column_eq(got, expected) From ff981a4048a389b0e2582e94d3397a83096d16c9 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Wed, 29 May 2024 09:02:31 -0400 Subject: [PATCH 012/340] Improve performance for long strings for nvtext::replace_tokens (#15756) Improves performance for `nvtext::replace_tokens` for long strings. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Bradley Dice (https://github.com/bdice) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/15756 --- cpp/src/text/replace.cu | 255 ++++++++++++++++++++++++------- cpp/tests/text/replace_tests.cpp | 22 +++ 2 files changed, 219 insertions(+), 58 deletions(-) diff --git a/cpp/src/text/replace.cu b/cpp/src/text/replace.cu index 84ed1827117..81c787caf86 100644 --- a/cpp/src/text/replace.cu +++ b/cpp/src/text/replace.cu @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -28,16 +29,18 @@ #include #include -#include #include #include #include +#include +#include #include #include #include #include +#include namespace nvtext { namespace detail { @@ -46,11 +49,13 @@ namespace { using replace_result = thrust::pair; struct base_token_replacer_fn { - cudf::column_device_view const d_strings; ///< strings to tokenize - cudf::string_view const d_delimiter; ///< delimiter characters for tokenizing - cudf::size_type* d_sizes{}; ///< for output string size - char* d_chars{}; ///< output buffer - cudf::detail::input_offsetalator d_offsets; + cudf::column_device_view d_strings; ///< strings to tokenize + cudf::string_view const d_delimiter; ///< delimiter characters for tokenizing + cudf::size_type* d_sizes{}; ///< for output string size + char* d_chars{}; ///< output buffer + cudf::detail::input_offsetalator d_offsets; ///< offsets for output buffer + cudf::size_type const* d_indices{}; ///< indices for long strings + cudf::size_type* d_output_sizes{}; ///< output sizes for long strings /** * @brief Tokenizes each string and calls the provided `replacer` function @@ -61,7 +66,7 @@ struct base_token_replacer_fn { * @param replacer Function to call for each token to determined its replacement */ template - __device__ void process_string(cudf::size_type idx, ReplaceFn replacer) + __device__ void process_string(cudf::size_type idx, ReplaceFn replacer) const { if (d_strings.is_null(idx)) { if (!d_chars) { d_sizes[idx] = 0; } @@ -100,6 
+105,13 @@ struct base_token_replacer_fn { memcpy(out_ptr, in_ptr + last_pos, d_str.size_bytes() - last_pos); } else { d_sizes[idx] = nbytes; + // handles output size calculation for long strings + if (nbytes > 0 && d_indices) { + auto out_idx = d_indices[idx] - 1; // adjust for upper_bound + cuda::atomic_ref ref{ + *(d_output_sizes + out_idx)}; + ref.fetch_add(nbytes, cuda::std::memory_order_relaxed); + } } } }; @@ -119,7 +131,7 @@ using strings_iterator = cudf::column_device_view::const_iterator(*itr))) { + ++itr; + } + if (itr >= end) { return 0; } // 0s will be filtered out + // now check for a delimiter in this block + auto tokenizer = characters_tokenizer(cudf::string_view{}, d_delimiter); + while (itr < end) { + auto chr = cudf::char_utf8{}; + auto chr_size = cudf::strings::detail::to_char_utf8(itr, chr); + if (tokenizer.is_delimiter(chr)) { break; } + itr += chr_size; + } + return (itr < end) ? thrust::distance(d_input_chars, itr) : 0L; + } +}; + /** * @brief Functor to filter tokens in each string. * @@ -187,20 +239,131 @@ struct remove_small_tokens_fn : base_token_replacer_fn { { } - __device__ void operator()(cudf::size_type idx) + __device__ replace_result token_replacement(cudf::string_view token) const { - auto replacer = [this] __device__(cudf::string_view const& token) { - return replace_result{token.length() < min_token_length, d_replacement}; - }; - process_string(idx, replacer); + return replace_result{token.length() < min_token_length, d_replacement}; + } + + __device__ void operator()(cudf::size_type idx) const + { + process_string( + idx, [this] __device__(cudf::string_view const& token) { return token_replacement(token); }); } }; +/** + * @brief Common code for replace and filter + * + * Builds the output strings column using the given replace functor. + * + * @tparam ReplaceFn Functor called for replacing tokens + * + * @param replacer Functor for determining matching token and its replacement + * @param input Strings column to tokenize and replace + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings columns of with replaced strings + */ +template +std::unique_ptr replace_helper(ReplacerFn replacer, + cudf::strings_column_view const& input, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + auto const first_offset = (input.offset() == 0) ? 0L + : cudf::strings::detail::get_offset_value( + input.offsets(), input.offset(), stream); + auto const last_offset = + cudf::strings::detail::get_offset_value(input.offsets(), input.size() + input.offset(), stream); + auto const chars_size = last_offset - first_offset; + + if ((chars_size / (input.size() - input.null_count())) < AVG_CHAR_BYTES_THRESHOLD) { + // this utility calls replacer to build the offsets and chars columns + auto [offsets_column, chars] = + cudf::strings::detail::make_strings_children(replacer, input.size(), stream, mr); + // return new strings column + return cudf::make_strings_column(input.size(), + std::move(offsets_column), + chars.release(), + input.null_count(), + cudf::detail::copy_bitmask(input.parent(), stream, mr)); + } + + // Long strings logic builds a new fake strings column with the same data but additional offsets + // thus converting the input to a larger column of smaller strings. + // This can be processed in parallel more efficiently than long strings in general. 
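+  // For example, a single row holding a very long string would otherwise be
+  // processed by one thread; adding sub-offsets at roughly LS_SUB_BLOCK_SIZE
+  // byte boundaries (snapped forward to the next delimiter by sub_offset_fn)
+  // spreads that row over many threads, with the per-row output sizes
+  // accumulated atomically into d_output_sizes.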
+
+  auto const input_chars = input.chars_begin(stream);
+  auto const input_offsets =
+    cudf::detail::offsetalator_factory::make_input_iterator(input.offsets(), input.offset());
+
+  // divide up long strings into shorter strings by finding new sub-offsets at delimiters
+  auto sub_count   = chars_size / LS_SUB_BLOCK_SIZE;
+  auto tmp_offsets = rmm::device_uvector<int64_t>(sub_count + input.size() + 1, stream);
+  {
+    rmm::device_uvector<int64_t> sub_offsets(sub_count, stream);
+    auto const count_itr = thrust::make_counting_iterator<int64_t>(0);
+    thrust::transform(rmm::exec_policy_nosync(stream),
+                      count_itr,
+                      count_itr + sub_count,
+                      sub_offsets.data(),
+                      sub_offset_fn{input_chars, first_offset, last_offset});
+    // remove 0s -- where sub-offset could not be computed
+    auto const remove_end =
+      thrust::remove(rmm::exec_policy_nosync(stream), sub_offsets.begin(), sub_offsets.end(), 0L);
+    sub_count = thrust::distance(sub_offsets.begin(), remove_end);
+
+    // merge them with input offsets
+    thrust::merge(rmm::exec_policy_nosync(stream),
+                  input_offsets,
+                  input_offsets + input.size() + 1,
+                  sub_offsets.begin(),
+                  sub_offsets.begin() + sub_count,
+                  tmp_offsets.begin());
+    tmp_offsets.resize(sub_count + input.size() + 1, stream);
+    stream.synchronize();  // protect against destruction of sub_offsets
+  }
+
+  // cobble together a column_view of type STRING using the original data and the tmp offsets
+  auto const tmp_size = static_cast<cudf::size_type>(tmp_offsets.size()) - 1;
+  auto const children = std::vector<cudf::column_view>({cudf::column_view(
+    cudf::data_type{cudf::type_id::INT64}, tmp_size + 1, tmp_offsets.data(), nullptr, 0)});
+  auto const tmp_strings = cudf::column_view(
+    cudf::data_type{cudf::type_id::STRING}, tmp_size, input_chars, nullptr, 0, 0, children);
+  auto const d_tmp_strings = cudf::column_device_view::create(tmp_strings, stream);
+
+  // compute indices to the actual output rows
+  auto indices = rmm::device_uvector<cudf::size_type>(tmp_offsets.size(), stream);
+  thrust::upper_bound(rmm::exec_policy_nosync(stream),
+                      input_offsets,
+                      input_offsets + input.size() + 1,
+                      tmp_offsets.begin(),
+                      tmp_offsets.end(),
+                      indices.begin());
+
+  // initialize the output row sizes
+  auto d_sizes = rmm::device_uvector<cudf::size_type>(input.size(), stream);
+  thrust::fill(rmm::exec_policy_nosync(stream), d_sizes.begin(), d_sizes.end(), 0);
+
+  replacer.d_strings      = *d_tmp_strings;
+  replacer.d_indices      = indices.data();
+  replacer.d_output_sizes = d_sizes.data();
+
+  auto chars = std::get<1>(
+    cudf::strings::detail::make_strings_children(replacer, tmp_strings.size(), stream, mr));
+  auto offsets_column = std::get<0>(
+    cudf::strings::detail::make_offsets_child_column(d_sizes.begin(), d_sizes.end(), stream, mr));
+  return cudf::make_strings_column(input.size(),
+                                   std::move(offsets_column),
+                                   chars.release(),
+                                   input.null_count(),
+                                   cudf::detail::copy_bitmask(input.parent(), stream, mr));
+}
 }  // namespace

 // detail APIs

-std::unique_ptr<cudf::column> replace_tokens(cudf::strings_column_view const& strings,
+std::unique_ptr<cudf::column> replace_tokens(cudf::strings_column_view const& input,
                                              cudf::strings_column_view const& targets,
                                              cudf::strings_column_view const& replacements,
                                              cudf::string_scalar const& delimiter,
@@ -214,35 +377,23 @@ std::unique_ptr<cudf::column> replace_tokens(cudf::strings_column_view const& st
                "Parameter targets and replacements must be the same size");
   CUDF_EXPECTS(delimiter.is_valid(stream), "Parameter delimiter must be valid");

-  cudf::size_type const strings_count = strings.size();
-  if (strings_count == 0) return cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING});
-
-  auto strings_column =
cudf::column_device_view::create(strings.parent(), stream); - auto targets_column = cudf::column_device_view::create(targets.parent(), stream); - auto replacements_column = cudf::column_device_view::create(replacements.parent(), stream); - cudf::string_view d_delimiter(delimiter.data(), delimiter.size()); - replace_tokens_fn replacer{*strings_column, - d_delimiter, - targets_column->begin(), - targets_column->end(), - *replacements_column}; + if (input.is_empty()) { return cudf::make_empty_column(cudf::type_id::STRING); } - // copy null mask from input column - rmm::device_buffer null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr); + auto const d_strings = cudf::column_device_view::create(input.parent(), stream); + auto const d_targets = cudf::column_device_view::create(targets.parent(), stream); + auto const d_replacements = cudf::column_device_view::create(replacements.parent(), stream); + auto const d_delimiter = cudf::string_view(delimiter.data(), delimiter.size()); - // this utility calls replacer to build the offsets and chars columns - auto [offsets_column, chars] = - cudf::strings::detail::make_strings_children(replacer, strings_count, stream, mr); + replace_tokens_fn replacer{*d_strings, + d_delimiter, + d_targets->begin(), + d_targets->end(), + *d_replacements}; - // return new strings column - return cudf::make_strings_column(strings_count, - std::move(offsets_column), - chars.release(), - strings.null_count(), - std::move(null_mask)); + return replace_helper(replacer, input, stream, mr); } -std::unique_ptr filter_tokens(cudf::strings_column_view const& strings, +std::unique_ptr filter_tokens(cudf::strings_column_view const& input, cudf::size_type min_token_length, cudf::string_scalar const& replacement, cudf::string_scalar const& delimiter, @@ -252,27 +403,15 @@ std::unique_ptr filter_tokens(cudf::strings_column_view const& str CUDF_EXPECTS(replacement.is_valid(stream), "Parameter replacement must be valid"); CUDF_EXPECTS(delimiter.is_valid(stream), "Parameter delimiter must be valid"); - cudf::size_type const strings_count = strings.size(); - if (strings_count == 0) return cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}); - - auto strings_column = cudf::column_device_view::create(strings.parent(), stream); - cudf::string_view d_replacement(replacement.data(), replacement.size()); - cudf::string_view d_delimiter(delimiter.data(), delimiter.size()); - remove_small_tokens_fn filterer{*strings_column, d_delimiter, min_token_length, d_replacement}; + if (input.is_empty()) { return cudf::make_empty_column(cudf::type_id::STRING); } - // copy null mask from input column - rmm::device_buffer null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr); + auto const d_strings = cudf::column_device_view::create(input.parent(), stream); + auto const d_replacement = cudf::string_view(replacement.data(), replacement.size()); + auto const d_delimiter = cudf::string_view(delimiter.data(), delimiter.size()); - // this utility calls filterer to build the offsets and chars columns - auto [offsets_column, chars] = - cudf::strings::detail::make_strings_children(filterer, strings_count, stream, mr); + remove_small_tokens_fn filterer{*d_strings, d_delimiter, min_token_length, d_replacement}; - // return new strings column - return cudf::make_strings_column(strings_count, - std::move(offsets_column), - chars.release(), - strings.null_count(), - std::move(null_mask)); + return replace_helper(filterer, input, stream, mr); } } // namespace detail diff --git 
a/cpp/tests/text/replace_tests.cpp b/cpp/tests/text/replace_tests.cpp index 8c58c6bcaca..faced4a14d3 100644 --- a/cpp/tests/text/replace_tests.cpp +++ b/cpp/tests/text/replace_tests.cpp @@ -88,6 +88,28 @@ TEST_F(TextReplaceTest, ReplaceTokensEmptyTest) EXPECT_EQ(results->has_nulls(), false); } +TEST_F(TextReplaceTest, ReplaceTokensLongStrings) +{ + cudf::test::strings_column_wrapper input{ + "pellentesque ut euismod semo phaselus tristiut libero ut dui congusem non pellentesque nunc ", + "pellentesque ut euismod se phaselus tristiut libero ut dui congusem non pellentesque ", + "pellentesque ut euismod phaselus tristiut libero ut dui congusem non pellentesque nun ", + "pellentesque ut euismod seem phaselus tristiut libero ut dui congusem non pellentesque un "}; + cudf::test::strings_column_wrapper targets({"ut", "pellentesque"}); + cudf::test::strings_column_wrapper repls({"___", "é"}); + + auto expected = cudf::test::strings_column_wrapper{ + "é ___ euismod semo phaselus tristiut libero ___ dui congusem non é nunc ", + "é ___ euismod se phaselus tristiut libero ___ dui congusem non é ", + "é ___ euismod phaselus tristiut libero ___ dui congusem non é nun ", + "é ___ euismod seem phaselus tristiut libero ___ dui congusem non é un "}; + + auto results = nvtext::replace_tokens(cudf::strings_column_view(input), + cudf::strings_column_view(targets), + cudf::strings_column_view(repls)); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); +} + TEST_F(TextReplaceTest, ReplaceTokensErrorTest) { auto strings = cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}); From 3b98f8100adaca742c00a075bed83175d43b7f26 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 29 May 2024 09:24:49 -0700 Subject: [PATCH 013/340] Refactor join benchmarks to target public APIs with the default stream (#15873) This a followup of #15644. It fixes the lhs/rhs input bug in the hash join and distinct join benchmarks. Authors: - Yunsong Wang (https://github.com/PointKernel) Approvers: - Karthikeyan (https://github.com/karthikeyann) - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/15873 --- cpp/benchmarks/join/distinct_join.cu | 22 ++++++++++---------- cpp/benchmarks/join/join.cu | 30 ++++++---------------------- cpp/benchmarks/join/join_common.hpp | 9 +++------ cpp/benchmarks/join/mixed_join.cu | 15 +++++--------- 4 files changed, 24 insertions(+), 52 deletions(-) diff --git a/cpp/benchmarks/join/distinct_join.cu b/cpp/benchmarks/join/distinct_join.cu index af8fa1f9d94..3502cbcea2a 100644 --- a/cpp/benchmarks/join/distinct_join.cu +++ b/cpp/benchmarks/join/distinct_join.cu @@ -20,17 +20,16 @@ template void distinct_inner_join(nvbench::state& state, nvbench::type_list>) { - auto join = [](cudf::table_view const& build_input, - cudf::table_view const& probe_input, - cudf::null_equality compare_nulls, - rmm::cuda_stream_view stream) { + auto join = [](cudf::table_view const& probe_input, + cudf::table_view const& build_input, + cudf::null_equality compare_nulls) { auto const has_nulls = cudf::has_nested_nulls(build_input) || cudf::has_nested_nulls(probe_input) ? 
cudf::nullable_join::YES : cudf::nullable_join::NO; auto hj_obj = cudf::distinct_hash_join{ - build_input, probe_input, has_nulls, compare_nulls, stream}; - return hj_obj.inner_join(stream); + build_input, probe_input, has_nulls, compare_nulls}; + return hj_obj.inner_join(); }; BM_join(state, join); @@ -40,17 +39,16 @@ template void distinct_left_join(nvbench::state& state, nvbench::type_list>) { - auto join = [](cudf::table_view const& build_input, - cudf::table_view const& probe_input, - cudf::null_equality compare_nulls, - rmm::cuda_stream_view stream) { + auto join = [](cudf::table_view const& probe_input, + cudf::table_view const& build_input, + cudf::null_equality compare_nulls) { auto const has_nulls = cudf::has_nested_nulls(build_input) || cudf::has_nested_nulls(probe_input) ? cudf::nullable_join::YES : cudf::nullable_join::NO; auto hj_obj = cudf::distinct_hash_join{ - build_input, probe_input, has_nulls, compare_nulls, stream}; - return hj_obj.left_join(stream); + build_input, probe_input, has_nulls, compare_nulls}; + return hj_obj.left_join(); }; BM_join(state, join); diff --git a/cpp/benchmarks/join/join.cu b/cpp/benchmarks/join/join.cu index c4a39da4662..942fb823ddc 100644 --- a/cpp/benchmarks/join/join.cu +++ b/cpp/benchmarks/join/join.cu @@ -22,15 +22,9 @@ void nvbench_inner_join(nvbench::state& state, { auto join = [](cudf::table_view const& left_input, cudf::table_view const& right_input, - cudf::null_equality compare_nulls, - rmm::cuda_stream_view stream) { - auto const has_nulls = cudf::has_nested_nulls(left_input) || cudf::has_nested_nulls(right_input) - ? cudf::nullable_join::YES - : cudf::nullable_join::NO; - cudf::hash_join hj_obj(left_input, has_nulls, compare_nulls, stream); - return hj_obj.inner_join(right_input, std::nullopt, stream); + cudf::null_equality compare_nulls) { + return cudf::inner_join(left_input, right_input, compare_nulls); }; - BM_join(state, join); } @@ -39,15 +33,9 @@ void nvbench_left_join(nvbench::state& state, nvbench::type_list(state, join); } @@ -56,15 +44,9 @@ void nvbench_full_join(nvbench::state& state, nvbench::type_list(state, join); } diff --git a/cpp/benchmarks/join/join_common.hpp b/cpp/benchmarks/join/join_common.hpp index 9e23d28b363..e6792b9dbfb 100644 --- a/cpp/benchmarks/join/join_common.hpp +++ b/cpp/benchmarks/join/join_common.hpp @@ -178,6 +178,7 @@ void BM_join(state_type& state, Join JoinFunc) } } if constexpr (std::is_same_v and (join_type != join_t::CONDITIONAL)) { + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); if constexpr (join_type == join_t::MIXED) { auto const col_ref_left_0 = cudf::ast::column_reference(0); auto const col_ref_right_0 = @@ -185,23 +186,19 @@ void BM_join(state_type& state, Join JoinFunc) auto left_zero_eq_right_zero = cudf::ast::operation(cudf::ast::ast_operator::EQUAL, col_ref_left_0, col_ref_right_0); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - rmm::cuda_stream_view stream_view{launch.get_stream()}; auto result = JoinFunc(left_table.select(columns_to_join), right_table.select(columns_to_join), left_table.select({1}), right_table.select({1}), left_zero_eq_right_zero, - cudf::null_equality::UNEQUAL, - stream_view); + cudf::null_equality::UNEQUAL); }); } if constexpr (join_type == join_t::HASH) { state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - rmm::cuda_stream_view stream_view{launch.get_stream()}; auto result = JoinFunc(left_table.select(columns_to_join), right_table.select(columns_to_join), - 
cudf::null_equality::UNEQUAL, - stream_view); + cudf::null_equality::UNEQUAL); }); } } diff --git a/cpp/benchmarks/join/mixed_join.cu b/cpp/benchmarks/join/mixed_join.cu index 129ea62e7a6..0345d1e93fa 100644 --- a/cpp/benchmarks/join/mixed_join.cu +++ b/cpp/benchmarks/join/mixed_join.cu @@ -25,8 +25,7 @@ void nvbench_mixed_inner_join(nvbench::state& state, cudf::table_view const& left_conditional_input, cudf::table_view const& right_conditional_input, cudf::ast::operation binary_pred, - cudf::null_equality compare_nulls, - rmm::cuda_stream_view stream) { + cudf::null_equality compare_nulls) { return cudf::mixed_inner_join(left_equality_input, right_equality_input, left_conditional_input, @@ -47,8 +46,7 @@ void nvbench_mixed_left_join(nvbench::state& state, cudf::table_view const& left_conditional_input, cudf::table_view const& right_conditional_input, cudf::ast::operation binary_pred, - cudf::null_equality compare_nulls, - rmm::cuda_stream_view stream) { + cudf::null_equality compare_nulls) { return cudf::mixed_left_join(left_equality_input, right_equality_input, left_conditional_input, @@ -69,8 +67,7 @@ void nvbench_mixed_full_join(nvbench::state& state, cudf::table_view const& left_conditional_input, cudf::table_view const& right_conditional_input, cudf::ast::operation binary_pred, - cudf::null_equality compare_nulls, - rmm::cuda_stream_view stream) { + cudf::null_equality compare_nulls) { return cudf::mixed_full_join(left_equality_input, right_equality_input, left_conditional_input, @@ -91,8 +88,7 @@ void nvbench_mixed_left_semi_join(nvbench::state& state, cudf::table_view const& left_conditional_input, cudf::table_view const& right_conditional_input, cudf::ast::operation binary_pred, - cudf::null_equality compare_nulls, - rmm::cuda_stream_view stream) { + cudf::null_equality compare_nulls) { return cudf::mixed_left_semi_join(left_equality_input, right_equality_input, left_conditional_input, @@ -113,8 +109,7 @@ void nvbench_mixed_left_anti_join(nvbench::state& state, cudf::table_view const& left_conditional_input, cudf::table_view const& right_conditional_input, cudf::ast::operation binary_pred, - cudf::null_equality compare_nulls, - rmm::cuda_stream_view stream) { + cudf::null_equality compare_nulls) { return cudf::mixed_left_anti_join(left_equality_input, right_equality_input, left_conditional_input, From afd5522b31c522bab2f093f620e600e79662c433 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Wed, 29 May 2024 12:03:02 -0500 Subject: [PATCH 014/340] add unit test setup for cudf_kafka (#15853) Fixes #15841 Proposes adding a basic unit test setup for `cudf_kafka`. 
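As a usage sketch (the script below is the one added in this PR, and any
extra arguments are forwarded to pytest via `"$@"`), the tests can be run
locally the same way CI invokes them:

    ./ci/run_cudf_kafka_pytests.sh -v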
Authors: - James Lamb (https://github.com/jameslamb) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/15853 --- ci/run_cudf_kafka_pytests.sh | 9 +++++++++ ci/run_custreamz_pytests.sh | 2 +- ci/run_dask_cudf_pytests.sh | 2 +- ci/test_python_other.sh | 4 ++++ python/cudf_kafka/cudf_kafka/tests/__init__.py | 0 python/cudf_kafka/cudf_kafka/tests/test_version.py | 12 ++++++++++++ python/cudf_kafka/pyproject.toml | 5 +++++ 7 files changed, 32 insertions(+), 2 deletions(-) create mode 100755 ci/run_cudf_kafka_pytests.sh create mode 100644 python/cudf_kafka/cudf_kafka/tests/__init__.py create mode 100644 python/cudf_kafka/cudf_kafka/tests/test_version.py diff --git a/ci/run_cudf_kafka_pytests.sh b/ci/run_cudf_kafka_pytests.sh new file mode 100755 index 00000000000..de227c84872 --- /dev/null +++ b/ci/run_cudf_kafka_pytests.sh @@ -0,0 +1,9 @@ +#!/bin/bash +# Copyright (c) 2024, NVIDIA CORPORATION. + +set -euo pipefail + +# Support invoking run_cudf_kafka_pytests.sh outside the script directory +cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../python/cudf_kafka/cudf_kafka + +pytest --cache-clear "$@" tests diff --git a/ci/run_custreamz_pytests.sh b/ci/run_custreamz_pytests.sh index 53e27ec64b3..67b152fc187 100755 --- a/ci/run_custreamz_pytests.sh +++ b/ci/run_custreamz_pytests.sh @@ -3,7 +3,7 @@ set -euo pipefail -# It is essential to cd into python/cudf/cudf as `pytest-xdist` + `coverage` seem to work only at this directory level. +# It is essential to cd into python/custreamz/custreamz/ as `pytest-xdist` + `coverage` seem to work only at this directory level. # Support invoking run_custreamz_pytests.sh outside the script directory cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../python/custreamz/custreamz/ diff --git a/ci/run_dask_cudf_pytests.sh b/ci/run_dask_cudf_pytests.sh index 07658c6d234..37aadb5fee9 100755 --- a/ci/run_dask_cudf_pytests.sh +++ b/ci/run_dask_cudf_pytests.sh @@ -3,7 +3,7 @@ set -euo pipefail -# It is essential to cd into python/cudf/cudf as `pytest-xdist` + `coverage` seem to work only at this directory level. +# It is essential to cd into python/dask_cudf/dask_cudf/ as `pytest-xdist` + `coverage` seem to work only at this directory level. # Support invoking run_dask_cudf_pytests.sh outside the script directory cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../python/dask_cudf/dask_cudf/ diff --git a/ci/test_python_other.sh b/ci/test_python_other.sh index cbc1dc1cb87..06a24773cae 100755 --- a/ci/test_python_other.sh +++ b/ci/test_python_other.sh @@ -36,6 +36,10 @@ DASK_DATAFRAME__QUERY_PLANNING=False ./ci/run_dask_cudf_pytests.sh \ --dist=loadscope \ . +rapids-logger "pytest cudf_kafka" +./ci/run_cudf_kafka_pytests.sh \ + --junitxml="${RAPIDS_TESTS_DIR}/junit-cudf-kafka.xml" + rapids-logger "pytest custreamz" ./ci/run_custreamz_pytests.sh \ --junitxml="${RAPIDS_TESTS_DIR}/junit-custreamz.xml" \ diff --git a/python/cudf_kafka/cudf_kafka/tests/__init__.py b/python/cudf_kafka/cudf_kafka/tests/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/cudf_kafka/cudf_kafka/tests/test_version.py b/python/cudf_kafka/cudf_kafka/tests/test_version.py new file mode 100644 index 00000000000..2dc2846c4cf --- /dev/null +++ b/python/cudf_kafka/cudf_kafka/tests/test_version.py @@ -0,0 +1,12 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
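+#
+# Smoke test: verifies that the installed package imports cleanly and that
+# its version metadata is populated.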
+ +import cudf_kafka + + +def test_version_constants_are_populated(): + # __git_commit__ will only be non-empty in a built distribution + assert isinstance(cudf_kafka.__git_commit__, str) + + # __version__ should always be non-empty + assert isinstance(cudf_kafka.__version__, str) + assert len(cudf_kafka.__version__) > 0 diff --git a/python/cudf_kafka/pyproject.toml b/python/cudf_kafka/pyproject.toml index d34a1260422..9233d0e92dd 100644 --- a/python/cudf_kafka/pyproject.toml +++ b/python/cudf_kafka/pyproject.toml @@ -82,6 +82,11 @@ skip = [ "__init__.py", ] +[tool.pytest.ini_options] +filterwarnings = [ + "error" +] + [tool.scikit-build] build-dir = "build/{wheel_tag}" cmake.build-type = "Release" From 7b02f4b0b5adcc30db106a0b63f7273c9dff1984 Mon Sep 17 00:00:00 2001 From: Ray Bell Date: Wed, 29 May 2024 13:24:24 -0400 Subject: [PATCH 015/340] DOC: add linkcode to docs (#15860) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR adds a [source] button in the API docs which allows readers to jump into the code behind the API docs. This is currently done in pandas e.g. https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html#pandas.DataFrame and below. The code is also copied and modified from the pandas repo (https://github.com/pandas-dev/pandas/blob/main/doc/source/conf.py#L637). ![Screenshot 2024-05-24 at 3 57 57 PM](https://github.com/rapidsai/cudf/assets/17162724/0bc04c1b-25c3-4d0f-a777-5e3fc42d0ce1) Authors: - Ray Bell (https://github.com/raybellwaves) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/15860 --- docs/cudf/source/conf.py | 61 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py index bcefa3fbdf8..73d8b4445d3 100644 --- a/docs/cudf/source/conf.py +++ b/docs/cudf/source/conf.py @@ -19,10 +19,12 @@ import datetime import filecmp import glob +import inspect import os import re import sys import tempfile +import warnings import xml.etree.ElementTree as ET from docutils.nodes import Text @@ -69,6 +71,7 @@ class PseudoLexer(RegexLexer): "sphinx.ext.autosummary", "sphinx_copybutton", "sphinx_remove_toctrees", + "sphinx.ext.linkcode", "numpydoc", "IPython.sphinxext.ipython_console_highlighting", "IPython.sphinxext.ipython_directive", @@ -557,6 +560,64 @@ def on_missing_reference(app, env, node, contnode): ] +# Needed for the [source] button on the API docs to link to the github code +# based on pandas doc/source/conf.py +def linkcode_resolve(domain, info) -> str | None: + """ + Determine the URL corresponding to Python object + """ + if domain != "py": + return None + + modname = info["module"] + fullname = info["fullname"] + + submod = sys.modules.get(modname) + if submod is None: + return None + + obj = submod + for part in fullname.split("."): + try: + with warnings.catch_warnings(): + # Accessing deprecated objects will generate noisy warnings + warnings.simplefilter("ignore", FutureWarning) + obj = getattr(obj, part) + except AttributeError: + return None + + try: + fn = inspect.getsourcefile(inspect.unwrap(obj)) + except TypeError: + try: # property + fn = inspect.getsourcefile(inspect.unwrap(obj.fget)) + except (AttributeError, TypeError): + fn = None + if not fn: + return None + + try: + source, lineno = inspect.getsourcelines(obj) + except TypeError: + try: # property + source, lineno = inspect.getsourcelines(obj.fget) + except 
(AttributeError, TypeError): + lineno = None + except OSError: + lineno = None + + if lineno: + linespec = f"#L{lineno}-L{lineno + len(source) - 1}" + else: + linespec = "" + + fn = os.path.relpath(fn, start=os.path.dirname(cudf.__file__)) + return ( + f"https://github.com/rapidsai/cudf/blob/" + f"branch-{version}/python/cudf/cudf/{fn}{linespec}" + ) + + def setup(app): app.add_css_file("https://docs.rapids.ai/assets/css/custom.css") app.add_js_file( From eafa570c24a2130292894dd91b68e57edfcbcc96 Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Wed, 29 May 2024 14:46:54 -0400 Subject: [PATCH 016/340] Add `from_arrow_host` functions for cudf interop with nanoarrow (#15645) Following up from #15458 and continuing the work to address #14926 adding host memory version of `from_arrow_device` which will perform the copies from host memory to create cudf objects. Authors: - Matt Topol (https://github.com/zeroshade) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Paul Mattione (https://github.com/pmattione-nvidia) - Vyas Ramasubramani (https://github.com/vyasr) - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/15645 --- cpp/CMakeLists.txt | 3 +- cpp/include/cudf/interop.hpp | 91 ++- cpp/src/interop/arrow_utilities.cpp | 90 +++ cpp/src/interop/arrow_utilities.hpp | 21 + cpp/src/interop/from_arrow_device.cu | 109 ++-- cpp/src/interop/from_arrow_host.cu | 492 +++++++++++++++ cpp/src/interop/to_arrow_device.cu | 1 - cpp/src/interop/to_arrow_schema.cpp | 2 +- cpp/src/interop/to_arrow_utilities.cpp | 44 -- cpp/src/interop/to_arrow_utilities.hpp | 34 -- cpp/tests/CMakeLists.txt | 1 + cpp/tests/interop/from_arrow_device_test.cpp | 12 +- cpp/tests/interop/from_arrow_host_test.cpp | 612 +++++++++++++++++++ cpp/tests/interop/nanoarrow_utils.hpp | 236 +++++++ cpp/tests/interop/to_arrow_device_test.cpp | 107 ++-- 15 files changed, 1631 insertions(+), 224 deletions(-) create mode 100644 cpp/src/interop/arrow_utilities.cpp create mode 100644 cpp/src/interop/from_arrow_host.cu delete mode 100644 cpp/src/interop/to_arrow_utilities.cpp delete mode 100644 cpp/src/interop/to_arrow_utilities.hpp create mode 100644 cpp/tests/interop/from_arrow_host_test.cpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index f69f04f9c10..f637db66c2c 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -360,11 +360,12 @@ add_library( src/hash/xxhash_64.cu src/interop/dlpack.cpp src/interop/from_arrow.cu + src/interop/arrow_utilities.cpp src/interop/to_arrow.cu src/interop/to_arrow_device.cu src/interop/from_arrow_device.cu + src/interop/from_arrow_host.cu src/interop/to_arrow_schema.cpp - src/interop/to_arrow_utilities.cpp src/interop/detail/arrow_allocator.cpp src/io/avro/avro.cpp src/io/avro/avro_gpu.cu diff --git a/cpp/include/cudf/interop.hpp b/cpp/include/cudf/interop.hpp index bb05a622f40..f3ff0009d5c 100644 --- a/cpp/include/cudf/interop.hpp +++ b/cpp/include/cudf/interop.hpp @@ -46,6 +46,8 @@ struct ArrowDeviceArray; struct ArrowSchema; +struct ArrowArray; + namespace cudf { /** * @addtogroup interop_dlpack @@ -348,6 +350,91 @@ std::unique_ptr from_arrow( rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); +/** + * @brief Create `cudf::table` from given ArrowArray and ArrowSchema input + * + * @throws std::invalid_argument if either schema or input are NULL + * + * @throws cudf::data_type_error if the input array is not a struct array. 
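+ *         Non-struct arrays should be passed to `from_arrow_column` instead.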
+ *
+ * The conversion will not call release on the input Array.
+ *
+ * @param schema `ArrowSchema` pointer to describe the type of the data
+ * @param input `ArrowArray` pointer that needs to be converted to cudf::table
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate `cudf::table`
+ * @return cudf table generated from given arrow data
+ */
+std::unique_ptr<table> from_arrow(ArrowSchema const* schema,
+                                  ArrowArray const* input,
+                                  rmm::cuda_stream_view stream,
+                                  rmm::mr::device_memory_resource* mr);
+
+/**
+ * @brief Create `cudf::column` from a given ArrowArray and ArrowSchema input
+ *
+ * @throws std::invalid_argument if either schema or input are NULL
+ *
+ * The conversion will not call release on the input Array.
+ *
+ * @param schema `ArrowSchema` pointer to describe the type of the data
+ * @param input `ArrowArray` pointer that needs to be converted to cudf::column
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate `cudf::column`
+ * @return cudf column generated from given arrow data
+ */
+std::unique_ptr<column> from_arrow_column(ArrowSchema const* schema,
+                                          ArrowArray const* input,
+                                          rmm::cuda_stream_view stream,
+                                          rmm::mr::device_memory_resource* mr);
+
+/**
+ * @brief Create `cudf::table` from given ArrowDeviceArray input
+ *
+ * @throws std::invalid_argument if either schema or input are NULL
+ *
+ * @throws std::invalid_argument if the device_type is not `ARROW_DEVICE_CPU`
+ *
+ * @throws cudf::data_type_error if the input array is not a struct array,
+ * non-struct arrays should be passed to `from_arrow_host_column` instead.
+ *
+ * The conversion will not call release on the input Array.
+ *
+ * @param schema `ArrowSchema` pointer to describe the type of the data
+ * @param input `ArrowDeviceArray` pointer to object owning the Arrow data
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to perform cuda allocation
+ * @return cudf table generated from the given Arrow data
+ */
+std::unique_ptr<table>
from_arrow_host( + ArrowSchema const* schema, + ArrowDeviceArray const* input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Create `cudf::column` from given ArrowDeviceArray input + * + * @throws std::invalid_argument if either schema or input are NULL + * + * @throws std::invalid_argument if the device_type is not `ARROW_DEVICE_CPU` + * + * @throws cudf::data_type_error if input arrow data type is not supported in cudf. + * + * The conversion will not call release on the input Array. + * + * @param schema `ArrowSchema` pointer to describe the type of the data + * @param input `ArrowDeviceArray` pointer to object owning the Arrow data + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to perform cuda allocation + * @return cudf column generated from the given Arrow data + */ +std::unique_ptr from_arrow_host_column( + ArrowSchema const* schema, + ArrowDeviceArray const* input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief typedef for a vector of owning columns, used for conversion from ArrowDeviceArray * @@ -398,7 +485,7 @@ using unique_table_view_t = * `ArrowDeviceArray` after it is no longer needed, and that the `cudf::table_view` is not * accessed after this happens. * - * @throws cudf::logic_error if device_type is not `ARROW_DEVICE_CUDA`, `ARROW_DEVICE_CUDA_HOST` + * @throws std::invalid_argument if device_type is not `ARROW_DEVICE_CUDA`, `ARROW_DEVICE_CUDA_HOST` * or `ARROW_DEVICE_CUDA_MANAGED` * * @throws cudf::data_type_error if the input array is not a struct array, non-struct @@ -446,7 +533,7 @@ using unique_column_view_t = * `ArrowDeviceArray` after it is no longer needed, and that the `cudf::column_view` is not * accessed after this happens. * - * @throws cudf::logic_error if device_type is not `ARROW_DEVICE_CUDA`, `ARROW_DEVICE_CUDA_HOST` + * @throws std::invalid_argument if device_type is not `ARROW_DEVICE_CUDA`, `ARROW_DEVICE_CUDA_HOST` * or `ARROW_DEVICE_CUDA_MANAGED` * * @throws cudf::data_type_error input arrow data type is not supported. diff --git a/cpp/src/interop/arrow_utilities.cpp b/cpp/src/interop/arrow_utilities.cpp new file mode 100644 index 00000000000..05beecfbf9b --- /dev/null +++ b/cpp/src/interop/arrow_utilities.cpp @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "arrow_utilities.hpp" + +#include +#include + +#include + +namespace cudf { +namespace detail { +data_type arrow_to_cudf_type(const ArrowSchemaView* arrow_view) +{ + switch (arrow_view->type) { + case NANOARROW_TYPE_NA: return data_type(type_id::EMPTY); + case NANOARROW_TYPE_BOOL: return data_type(type_id::BOOL8); + case NANOARROW_TYPE_INT8: return data_type(type_id::INT8); + case NANOARROW_TYPE_INT16: return data_type(type_id::INT16); + case NANOARROW_TYPE_INT32: return data_type(type_id::INT32); + case NANOARROW_TYPE_INT64: return data_type(type_id::INT64); + case NANOARROW_TYPE_UINT8: return data_type(type_id::UINT8); + case NANOARROW_TYPE_UINT16: return data_type(type_id::UINT16); + case NANOARROW_TYPE_UINT32: return data_type(type_id::UINT32); + case NANOARROW_TYPE_UINT64: return data_type(type_id::UINT64); + case NANOARROW_TYPE_FLOAT: return data_type(type_id::FLOAT32); + case NANOARROW_TYPE_DOUBLE: return data_type(type_id::FLOAT64); + case NANOARROW_TYPE_DATE32: return data_type(type_id::TIMESTAMP_DAYS); + case NANOARROW_TYPE_STRING: return data_type(type_id::STRING); + case NANOARROW_TYPE_LIST: return data_type(type_id::LIST); + case NANOARROW_TYPE_DICTIONARY: return data_type(type_id::DICTIONARY32); + case NANOARROW_TYPE_STRUCT: return data_type(type_id::STRUCT); + case NANOARROW_TYPE_TIMESTAMP: { + switch (arrow_view->time_unit) { + case NANOARROW_TIME_UNIT_SECOND: return data_type(type_id::TIMESTAMP_SECONDS); + case NANOARROW_TIME_UNIT_MILLI: return data_type(type_id::TIMESTAMP_MILLISECONDS); + case NANOARROW_TIME_UNIT_MICRO: return data_type(type_id::TIMESTAMP_MICROSECONDS); + case NANOARROW_TIME_UNIT_NANO: return data_type(type_id::TIMESTAMP_NANOSECONDS); + default: CUDF_FAIL("Unsupported timestamp unit in arrow", cudf::data_type_error); + } + } + case NANOARROW_TYPE_DURATION: { + switch (arrow_view->time_unit) { + case NANOARROW_TIME_UNIT_SECOND: return data_type(type_id::DURATION_SECONDS); + case NANOARROW_TIME_UNIT_MILLI: return data_type(type_id::DURATION_MILLISECONDS); + case NANOARROW_TIME_UNIT_MICRO: return data_type(type_id::DURATION_MICROSECONDS); + case NANOARROW_TIME_UNIT_NANO: return data_type(type_id::DURATION_NANOSECONDS); + default: CUDF_FAIL("Unsupported duration unit in arrow", cudf::data_type_error); + } + } + case NANOARROW_TYPE_DECIMAL128: + return data_type{type_id::DECIMAL128, -arrow_view->decimal_scale}; + default: CUDF_FAIL("Unsupported type_id conversion to cudf", cudf::data_type_error); + } +} + +ArrowType id_to_arrow_type(cudf::type_id id) +{ + switch (id) { + case cudf::type_id::BOOL8: return NANOARROW_TYPE_BOOL; + case cudf::type_id::INT8: return NANOARROW_TYPE_INT8; + case cudf::type_id::INT16: return NANOARROW_TYPE_INT16; + case cudf::type_id::INT32: return NANOARROW_TYPE_INT32; + case cudf::type_id::INT64: return NANOARROW_TYPE_INT64; + case cudf::type_id::UINT8: return NANOARROW_TYPE_UINT8; + case cudf::type_id::UINT16: return NANOARROW_TYPE_UINT16; + case cudf::type_id::UINT32: return NANOARROW_TYPE_UINT32; + case cudf::type_id::UINT64: return NANOARROW_TYPE_UINT64; + case cudf::type_id::FLOAT32: return NANOARROW_TYPE_FLOAT; + case cudf::type_id::FLOAT64: return NANOARROW_TYPE_DOUBLE; + case cudf::type_id::TIMESTAMP_DAYS: return NANOARROW_TYPE_DATE32; + default: CUDF_FAIL("Unsupported type_id conversion to arrow type", cudf::data_type_error); + } +} + +} // namespace detail +} // namespace cudf diff --git a/cpp/src/interop/arrow_utilities.hpp b/cpp/src/interop/arrow_utilities.hpp index 9bbdaa2c363..defddb4dc42 100644 --- 
a/cpp/src/interop/arrow_utilities.hpp +++ b/cpp/src/interop/arrow_utilities.hpp @@ -16,6 +16,11 @@ #pragma once +#include + +#include +#include + namespace cudf { namespace detail { @@ -26,5 +31,21 @@ namespace detail { static constexpr int validity_buffer_idx = 0; static constexpr int fixed_width_data_buffer_idx = 1; +/** + * @brief Map ArrowType id to cudf column type id + * + * @param arrow_view SchemaView to pull the logical and storage types from + * @return Column type id + */ +data_type arrow_to_cudf_type(const ArrowSchemaView* arrow_view); + +/** + * @brief Map cudf column type id to ArrowType id + * + * @param id Column type id + * @return ArrowType id + */ +ArrowType id_to_arrow_type(cudf::type_id id); + } // namespace detail } // namespace cudf diff --git a/cpp/src/interop/from_arrow_device.cu b/cpp/src/interop/from_arrow_device.cu index d4d31d1989b..002a8ec1f14 100644 --- a/cpp/src/interop/from_arrow_device.cu +++ b/cpp/src/interop/from_arrow_device.cu @@ -42,49 +42,6 @@ namespace cudf { namespace detail { -data_type arrow_to_cudf_type(const ArrowSchemaView* arrow_view) -{ - switch (arrow_view->type) { - case NANOARROW_TYPE_NA: return data_type(type_id::EMPTY); - case NANOARROW_TYPE_BOOL: return data_type(type_id::BOOL8); - case NANOARROW_TYPE_INT8: return data_type(type_id::INT8); - case NANOARROW_TYPE_INT16: return data_type(type_id::INT16); - case NANOARROW_TYPE_INT32: return data_type(type_id::INT32); - case NANOARROW_TYPE_INT64: return data_type(type_id::INT64); - case NANOARROW_TYPE_UINT8: return data_type(type_id::UINT8); - case NANOARROW_TYPE_UINT16: return data_type(type_id::UINT16); - case NANOARROW_TYPE_UINT32: return data_type(type_id::UINT32); - case NANOARROW_TYPE_UINT64: return data_type(type_id::UINT64); - case NANOARROW_TYPE_FLOAT: return data_type(type_id::FLOAT32); - case NANOARROW_TYPE_DOUBLE: return data_type(type_id::FLOAT64); - case NANOARROW_TYPE_DATE32: return data_type(type_id::TIMESTAMP_DAYS); - case NANOARROW_TYPE_STRING: return data_type(type_id::STRING); - case NANOARROW_TYPE_LIST: return data_type(type_id::LIST); - case NANOARROW_TYPE_DICTIONARY: return data_type(type_id::DICTIONARY32); - case NANOARROW_TYPE_STRUCT: return data_type(type_id::STRUCT); - case NANOARROW_TYPE_TIMESTAMP: { - switch (arrow_view->time_unit) { - case NANOARROW_TIME_UNIT_SECOND: return data_type(type_id::TIMESTAMP_SECONDS); - case NANOARROW_TIME_UNIT_MILLI: return data_type(type_id::TIMESTAMP_MILLISECONDS); - case NANOARROW_TIME_UNIT_MICRO: return data_type(type_id::TIMESTAMP_MICROSECONDS); - case NANOARROW_TIME_UNIT_NANO: return data_type(type_id::TIMESTAMP_NANOSECONDS); - default: CUDF_FAIL("Unsupported timestamp unit in arrow", cudf::data_type_error); - } - } - case NANOARROW_TYPE_DURATION: { - switch (arrow_view->time_unit) { - case NANOARROW_TIME_UNIT_SECOND: return data_type(type_id::DURATION_SECONDS); - case NANOARROW_TIME_UNIT_MILLI: return data_type(type_id::DURATION_MILLISECONDS); - case NANOARROW_TIME_UNIT_MICRO: return data_type(type_id::DURATION_MICROSECONDS); - case NANOARROW_TIME_UNIT_NANO: return data_type(type_id::DURATION_NANOSECONDS); - default: CUDF_FAIL("Unsupported duration unit in arrow", cudf::data_type_error); - } - } - case NANOARROW_TYPE_DECIMAL128: - return data_type{type_id::DECIMAL128, -arrow_view->decimal_scale}; - default: CUDF_FAIL("Unsupported type_id conversion to cudf", cudf::data_type_error); - } -} namespace { @@ -379,11 +336,25 @@ dispatch_tuple_t get_column(ArrowSchemaView* schema, } // namespace -unique_table_view_t 
from_arrow_device(ArrowSchemaView* schema, +unique_table_view_t from_arrow_device(ArrowSchema const* schema, ArrowDeviceArray const* input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { + CUDF_EXPECTS(schema != nullptr && input != nullptr, + "input ArrowSchema and ArrowDeviceArray must not be NULL", + std::invalid_argument); + CUDF_EXPECTS(input->device_type == ARROW_DEVICE_CUDA || + input->device_type == ARROW_DEVICE_CUDA_HOST || + input->device_type == ARROW_DEVICE_CUDA_MANAGED, + "ArrowDeviceArray memory must be accessible to CUDA", + std::invalid_argument); + + rmm::cuda_set_device_raii dev( + rmm::cuda_device_id{static_cast(input->device_id)}); + ArrowSchemaView view; + NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&view, schema, nullptr)); + if (input->sync_event != nullptr) { CUDF_CUDA_TRY( cudaStreamWaitEvent(stream.value(), *reinterpret_cast(input->sync_event))); @@ -392,14 +363,14 @@ unique_table_view_t from_arrow_device(ArrowSchemaView* schema, std::vector columns; owned_columns_t owned_mem; - auto type = arrow_to_cudf_type(schema); + auto type = arrow_to_cudf_type(&view); CUDF_EXPECTS(type == data_type(type_id::STRUCT), "Must pass a struct to `from_arrow_device`", cudf::data_type_error); std::transform( input->array.children, input->array.children + input->array.n_children, - schema->schema->children, + view.schema->children, std::back_inserter(columns), [&owned_mem, &stream, &mr](ArrowArray const* child, ArrowSchema const* child_schema) { ArrowSchemaView view; @@ -420,18 +391,32 @@ unique_table_view_t from_arrow_device(ArrowSchemaView* schema, custom_view_deleter{std::move(owned_mem)}}; } -unique_column_view_t from_arrow_device_column(ArrowSchemaView* schema, +unique_column_view_t from_arrow_device_column(ArrowSchema const* schema, ArrowDeviceArray const* input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { + CUDF_EXPECTS(schema != nullptr && input != nullptr, + "input ArrowSchema and ArrowDeviceArray must not be NULL", + std::invalid_argument); + CUDF_EXPECTS(input->device_type == ARROW_DEVICE_CUDA || + input->device_type == ARROW_DEVICE_CUDA_HOST || + input->device_type == ARROW_DEVICE_CUDA_MANAGED, + "ArrowDeviceArray must be accessible to CUDA", + std::invalid_argument); + + rmm::cuda_set_device_raii dev( + rmm::cuda_device_id{static_cast(input->device_id)}); + ArrowSchemaView view; + NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&view, schema, nullptr)); + if (input->sync_event != nullptr) { CUDF_CUDA_TRY( cudaStreamWaitEvent(stream.value(), *reinterpret_cast(input->sync_event))); } - auto type = arrow_to_cudf_type(schema); - auto [colview, owned] = get_column(schema, &input->array, type, false, stream, mr); + auto type = arrow_to_cudf_type(&view); + auto [colview, owned] = get_column(&view, &input->array, type, false, stream, mr); return unique_column_view_t{new column_view{colview}, custom_view_deleter{std::move(owned)}}; } @@ -443,20 +428,9 @@ unique_table_view_t from_arrow_device(ArrowSchema const* schema, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - CUDF_EXPECTS(schema != nullptr && input != nullptr, - "input ArrowSchema and ArrowDeviceArray must not be NULL"); - CUDF_EXPECTS(input->device_type == ARROW_DEVICE_CUDA || - input->device_type == ARROW_DEVICE_CUDA_HOST || - input->device_type == ARROW_DEVICE_CUDA_MANAGED, - "ArrowDeviceArray memory must be accessible to CUDA"); - CUDF_FUNC_RANGE(); - rmm::cuda_set_device_raii dev( - rmm::cuda_device_id{static_cast(input->device_id)}); - ArrowSchemaView 
view; - NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&view, schema, nullptr)); - return detail::from_arrow_device(&view, input, stream, mr); + return detail::from_arrow_device(schema, input, stream, mr); } unique_column_view_t from_arrow_device_column(ArrowSchema const* schema, @@ -464,20 +438,9 @@ unique_column_view_t from_arrow_device_column(ArrowSchema const* schema, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - CUDF_EXPECTS(schema != nullptr && input != nullptr, - "input ArrowSchema and ArrowDeviceArray must not be NULL"); - CUDF_EXPECTS(input->device_type == ARROW_DEVICE_CUDA || - input->device_type == ARROW_DEVICE_CUDA_HOST || - input->device_type == ARROW_DEVICE_CUDA_MANAGED, - "ArrowDeviceArray must be accessible to CUDA"); - CUDF_FUNC_RANGE(); - rmm::cuda_set_device_raii dev( - rmm::cuda_device_id{static_cast(input->device_id)}); - ArrowSchemaView view; - NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&view, schema, nullptr)); - return detail::from_arrow_device_column(&view, input, stream, mr); + return detail::from_arrow_device_column(schema, input, stream, mr); } } // namespace cudf diff --git a/cpp/src/interop/from_arrow_host.cu b/cpp/src/interop/from_arrow_host.cu new file mode 100644 index 00000000000..36bb35d9419 --- /dev/null +++ b/cpp/src/interop/from_arrow_host.cu @@ -0,0 +1,492 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "arrow_utilities.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include + +namespace cudf { +namespace detail { + +namespace { + +struct dispatch_copy_from_arrow_host { + rmm::cuda_stream_view stream; + rmm::mr::device_memory_resource* mr; + + std::unique_ptr get_mask_buffer(ArrowArray const* array) + { + auto* bitmap = array->buffers[validity_buffer_idx]; + if (bitmap == nullptr) { return std::make_unique(0, stream, mr); } + + auto const bitmask_size = array->length + array->offset; + auto const allocation_size = + bitmask_allocation_size_bytes(static_cast(bitmask_size)); + auto mask = std::make_unique(allocation_size, stream, mr); + CUDF_CUDA_TRY(cudaMemcpyAsync(mask->data(), + reinterpret_cast(bitmap), + allocation_size, + cudaMemcpyDefault, + stream.value())); + return mask; + } + + template () && + !std::is_same_v)> + std::unique_ptr operator()(ArrowSchemaView*, ArrowArray const*, data_type, bool) + { + CUDF_FAIL("Unsupported type in copy_from_arrow_host."); + } + + template () || std::is_same_v)> + std::unique_ptr operator()(ArrowSchemaView* schema, + ArrowArray const* input, + data_type type, + bool skip_mask) + { + using DeviceType = std::conditional_t, __int128_t, T>; + + size_type const num_rows = input->length; + size_type const offset = input->offset; + size_type const null_count = input->null_count; + auto data_buffer = input->buffers[fixed_width_data_buffer_idx]; + + auto const has_nulls = skip_mask ? false : input->buffers[validity_buffer_idx] != nullptr; + auto col = make_fixed_width_column(type, num_rows, mask_state::UNALLOCATED, stream, mr); + auto mutable_column_view = col->mutable_view(); + CUDF_CUDA_TRY( + cudaMemcpyAsync(mutable_column_view.data(), + reinterpret_cast(data_buffer) + offset * sizeof(DeviceType), + sizeof(DeviceType) * num_rows, + cudaMemcpyDefault, + stream.value())); + + if (has_nulls) { + auto tmp_mask = get_mask_buffer(input); + + // if array is sliced, we have to copy the whole mask and then take copy + auto out_mask = + (offset == 0) + ? std::move(*tmp_mask) + : cudf::detail::copy_bitmask( + static_cast(tmp_mask->data()), offset, offset + num_rows, stream, mr); + + col->set_null_mask(std::move(out_mask), null_count); + } + + return col; + } +}; + +// forward declaration is needed because `type_dispatch` instantiates the +// dispatch_copy_from_arrow_host struct causing a recursive situation for struct, +// dictionary and list_view types. +// +// This function is simply a convenience wrapper around the dispatch functor with +// some extra handling to avoid having to reproduce it for all of the nested types. +// It also allows us to centralize the location where the recursive calls happen +// so that we only need to forward declare this one function, rather than multiple +// functions which handle the overloads for nested types (list, struct, etc.) 
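+//
+// For example, copying a struct column re-enters get_column_copy once per
+// child (see the struct_view specialization below), which dispatches again
+// on each child's type.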
+std::unique_ptr<column> get_column_copy(ArrowSchemaView* schema,
+                                        ArrowArray const* input,
+                                        data_type type,
+                                        bool skip_mask,
+                                        rmm::cuda_stream_view stream,
+                                        rmm::mr::device_memory_resource* mr);
+
+template <>
+std::unique_ptr<column> dispatch_copy_from_arrow_host::operator()<bool>(ArrowSchemaView* schema,
+                                                                        ArrowArray const* input,
+                                                                        data_type type,
+                                                                        bool skip_mask)
+{
+  auto data_buffer         = input->buffers[fixed_width_data_buffer_idx];
+  const auto buffer_length = bitmask_allocation_size_bytes(input->length + input->offset);
+
+  auto data = rmm::device_buffer(buffer_length, stream, mr);
+  CUDF_CUDA_TRY(cudaMemcpyAsync(data.data(),
+                                reinterpret_cast<uint8_t const*>(data_buffer),
+                                buffer_length,
+                                cudaMemcpyDefault,
+                                stream.value()));
+  auto out_col = mask_to_bools(static_cast<bitmask_type*>(data.data()),
+                               input->offset,
+                               input->offset + input->length,
+                               stream,
+                               mr);
+
+  auto const has_nulls = skip_mask ? false : input->buffers[validity_buffer_idx] != nullptr;
+  if (has_nulls) {
+    auto out_mask =
+      detail::copy_bitmask(static_cast<bitmask_type*>(get_mask_buffer(input)->data()),
+                           input->offset,
+                           input->offset + input->length,
+                           stream,
+                           mr);
+
+    out_col->set_null_mask(std::move(out_mask), input->null_count);
+  }
+
+  return out_col;
+}
+
+template <>
+std::unique_ptr<column> dispatch_copy_from_arrow_host::operator()<cudf::string_view>(
+  ArrowSchemaView* schema, ArrowArray const* input, data_type type, bool skip_mask)
+{
+  if (input->length == 0) { return make_empty_column(type_id::STRING); }
+
+  // the offsets column should contain no nulls, so we can put nullptr for the bitmask;
+  // nulls are tracked in the parent string column itself, not in the offsets
+  void const* offset_buffers[2] = {nullptr, input->buffers[fixed_width_data_buffer_idx]};
+  ArrowArray offsets_array      = {
+    .length     = input->offset + input->length + 1,
+    .null_count = 0,
+    .offset     = 0,
+    .n_buffers  = 2,
+    .n_children = 0,
+    .buffers    = offset_buffers,
+  };
+
+  // chars_column does not contain any nulls; they are tracked by the parent string column
+  // itself instead. So we pass nullptr for the validity bitmask.
+  size_type const char_data_length =
+    reinterpret_cast<int32_t const*>(offset_buffers[1])[input->length + input->offset];
+  void const* char_buffers[2] = {nullptr, input->buffers[2]};
+  ArrowArray char_array       = {
+    .length     = char_data_length,
+    .null_count = 0,
+    .offset     = 0,
+    .n_buffers  = 2,
+    .n_children = 0,
+    .buffers    = char_buffers,
+  };
+
+  nanoarrow::UniqueSchema offset_schema;
+  NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(offset_schema.get(), NANOARROW_TYPE_INT32));
+
+  nanoarrow::UniqueSchema char_data_schema;
+  NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(char_data_schema.get(), NANOARROW_TYPE_INT8));
+
+  // leverage the dispatch overloads for int32 and char (int8) to generate the child
+  // offset and char data columns for us.
+  ArrowSchemaView view;
+  NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&view, offset_schema.get(), nullptr));
+  auto offsets_column =
+    this->operator()<int32_t>(&view, &offsets_array, data_type(type_id::INT32), true);
+  NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&view, char_data_schema.get(), nullptr));
+  auto chars_column = this->operator()<int8_t>(&view, &char_array, data_type(type_id::INT8), true);
+
+  auto const num_rows = offsets_column->size() - 1;
+  auto out_col        = make_strings_column(num_rows,
+                                            std::move(offsets_column),
+                                            std::move(chars_column->release().data.release()[0]),
+                                            input->null_count,
+                                            std::move(*get_mask_buffer(input)));
+
+  return input->offset == 0
+           ? std::move(out_col)
+           : std::make_unique<column>(
+               cudf::detail::slice(out_col->view(),
+                                   static_cast<size_type>(input->offset),
+                                   static_cast<size_type>(input->offset + input->length),
+                                   stream),
+               stream,
+               mr);
+}
+
+template <>
+std::unique_ptr<column> dispatch_copy_from_arrow_host::operator()<cudf::dictionary32>(
+  ArrowSchemaView* schema, ArrowArray const* input, data_type type, bool skip_mask)
+{
+  ArrowSchemaView keys_schema_view;
+  NANOARROW_THROW_NOT_OK(
+    ArrowSchemaViewInit(&keys_schema_view, schema->schema->dictionary, nullptr));
+
+  auto const keys_type = arrow_to_cudf_type(&keys_schema_view);
+  auto keys_column =
+    get_column_copy(&keys_schema_view, input->dictionary, keys_type, true, stream, mr);
+
+  auto const dict_indices_type = [&schema]() -> data_type {
+    // cudf dictionary requires an unsigned type for the indices;
+    // since it is invalid for an arrow dictionary to contain negative
+    // indices, we can safely use the unsigned equivalent without having
+    // to modify the buffers.
+    switch (schema->storage_type) {
+      case NANOARROW_TYPE_INT8:
+      case NANOARROW_TYPE_UINT8: return data_type(type_id::UINT8);
+      case NANOARROW_TYPE_INT16:
+      case NANOARROW_TYPE_UINT16: return data_type(type_id::UINT16);
+      case NANOARROW_TYPE_INT32:
+      case NANOARROW_TYPE_UINT32: return data_type(type_id::UINT32);
+      case NANOARROW_TYPE_INT64:
+      case NANOARROW_TYPE_UINT64: return data_type(type_id::UINT64);
+      default: CUDF_FAIL("Unsupported type_id for dictionary indices", cudf::data_type_error);
+    }
+  }();
+
+  auto indices_column = get_column_copy(schema, input, dict_indices_type, false, stream, mr);
+  // child columns shouldn't have masks, and we need the mask in the main column
+  auto column_contents = indices_column->release();
+  indices_column       = std::make_unique<column>(dict_indices_type,
+                                                  static_cast<size_type>(input->length),
+                                                  std::move(*(column_contents.data)),
+                                                  rmm::device_buffer{},
+                                                  0);
+
+  return make_dictionary_column(std::move(keys_column),
+                                std::move(indices_column),
+                                std::move(*(column_contents.null_mask)),
+                                input->null_count);
+}
+
+template <>
+std::unique_ptr<column> dispatch_copy_from_arrow_host::operator()<cudf::struct_view>(
+  ArrowSchemaView* schema, ArrowArray const* input, data_type type, bool skip_mask)
+{
+  std::vector<std::unique_ptr<column>> child_columns;
+  std::transform(
+    input->children,
+    input->children + input->n_children,
+    schema->schema->children,
+    std::back_inserter(child_columns),
+    [this, input](ArrowArray const* child, ArrowSchema const* child_schema) {
+      ArrowSchemaView view;
+      NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&view, child_schema, nullptr));
+      auto type = arrow_to_cudf_type(&view);
+
+      auto out = get_column_copy(&view, child, type, false, stream, mr);
+      return input->offset == 0 && input->length == out->size()
+               ? std::move(out)
+               : std::make_unique<column>(
+                   cudf::detail::slice(out->view(),
+                                       static_cast<size_type>(input->offset),
+                                       static_cast<size_type>(input->offset + input->length),
+                                       stream),
+                   stream,
+                   mr);
+    });
+
+  auto out_mask = std::move(*(get_mask_buffer(input)));
+  if (input->buffers[validity_buffer_idx] != nullptr) {
+    out_mask = detail::copy_bitmask(static_cast<bitmask_type*>(out_mask.data()),
+                                    input->offset,
+                                    input->offset + input->length,
+                                    stream,
+                                    mr);
+  }
+
+  return make_structs_column(
+    input->length, std::move(child_columns), input->null_count, std::move(out_mask), stream, mr);
+}
+
+template <>
+std::unique_ptr<column> dispatch_copy_from_arrow_host::operator()<cudf::list_view>(
+  ArrowSchemaView* schema, ArrowArray const* input, data_type type, bool skip_mask)
+{
+  const void* offset_buffers[2] = {nullptr, input->buffers[fixed_width_data_buffer_idx]};
+  ArrowArray offsets_array      = {
+    .length     = input->offset + input->length + 1,
+    .null_count = 0,
+    .offset     = 0,
+    .n_buffers  = 2,
+    .n_children = 0,
+    .buffers    = offset_buffers,
+  };
+  nanoarrow::UniqueSchema offset_schema;
+  NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(offset_schema.get(), NANOARROW_TYPE_INT32));
+
+  ArrowSchemaView view;
+  NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&view, offset_schema.get(), nullptr));
+  auto offsets_column =
+    this->operator()<int32_t>(&view, &offsets_array, data_type(type_id::INT32), true);
+
+  NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&view, schema->schema->children[0], nullptr));
+  auto child_type   = arrow_to_cudf_type(&view);
+  auto child_column = get_column_copy(&view, input->children[0], child_type, false, stream, mr);
+
+  auto const num_rows = offsets_column->size() - 1;
+  auto out_col        = make_lists_column(num_rows,
+                                          std::move(offsets_column),
+                                          std::move(child_column),
+                                          input->null_count,
+                                          std::move(*get_mask_buffer(input)),
+                                          stream,
+                                          mr);
+
+  return num_rows == input->length
+           ? std::move(out_col)
+           : std::make_unique<column>(
+               cudf::detail::slice(out_col->view(),
+                                   static_cast<size_type>(input->offset),
+                                   static_cast<size_type>(input->offset + input->length),
+                                   stream),
+               stream,
+               mr);
+}
+
+std::unique_ptr<column> get_column_copy(ArrowSchemaView* schema,
+                                        ArrowArray const* input,
+                                        data_type type,
+                                        bool skip_mask,
+                                        rmm::cuda_stream_view stream,
+                                        rmm::mr::device_memory_resource* mr)
+{
+  return type.id() != type_id::EMPTY
+           ? std::move(type_dispatcher(
+               type, dispatch_copy_from_arrow_host{stream, mr}, schema, input, type, skip_mask))
+           : std::make_unique<column>(data_type(type_id::EMPTY),
+                                      input->length,
+                                      rmm::device_buffer{},
+                                      rmm::device_buffer{},
+                                      input->length);
+}
+
+}  // namespace
+
+std::unique_ptr<table> from_arrow_host(ArrowSchema const* schema,
+                                       ArrowDeviceArray const* input,
+                                       rmm::cuda_stream_view stream,
+                                       rmm::mr::device_memory_resource* mr)
+{
+  CUDF_EXPECTS(schema != nullptr && input != nullptr,
+               "input ArrowSchema and ArrowDeviceArray must not be NULL",
+               std::invalid_argument);
+  CUDF_EXPECTS(input->device_type == ARROW_DEVICE_CPU,
+               "ArrowDeviceArray must have CPU device type for `from_arrow_host`",
+               std::invalid_argument);
+
+  ArrowSchemaView view;
+  NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&view, schema, nullptr));
+
+  std::vector<std::unique_ptr<column>> columns;
+
+  auto type = arrow_to_cudf_type(&view);
+  CUDF_EXPECTS(type == data_type(type_id::STRUCT),
+               "Must pass a struct to `from_arrow_host`",
+               cudf::data_type_error);
+
+  std::transform(input->array.children,
+                 input->array.children + input->array.n_children,
+                 view.schema->children,
+                 std::back_inserter(columns),
+                 [&stream, &mr](ArrowArray const* child, ArrowSchema const* child_schema) {
+                   ArrowSchemaView view;
+                   NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&view, child_schema, nullptr));
+                   auto type = arrow_to_cudf_type(&view);
+                   return get_column_copy(&view, child, type, false, stream, mr);
+                 });
+
+  return std::make_unique<table>(std::move(columns));
+}
+
+std::unique_ptr<column> from_arrow_host_column(ArrowSchema const* schema,
+                                               ArrowDeviceArray const* input,
+                                               rmm::cuda_stream_view stream,
+                                               rmm::mr::device_memory_resource* mr)
+{
+  CUDF_EXPECTS(schema != nullptr && input != nullptr,
+               "input ArrowSchema and ArrowDeviceArray must not be NULL",
+               std::invalid_argument);
+  CUDF_EXPECTS(input->device_type == ARROW_DEVICE_CPU,
+               "ArrowDeviceArray must have CPU device type for `from_arrow_host_column`",
+               std::invalid_argument);
+
+  ArrowSchemaView view;
+  NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&view, schema, nullptr));
+
+  auto type = arrow_to_cudf_type(&view);
+  return get_column_copy(&view, &input->array, type, false, stream, mr);
+}
+
+}  // namespace detail
+
+std::unique_ptr<table> from_arrow_host(ArrowSchema const* schema,
+                                       ArrowDeviceArray const* input,
+                                       rmm::cuda_stream_view stream,
+                                       rmm::mr::device_memory_resource* mr)
+{
+  CUDF_FUNC_RANGE();
+
+  return detail::from_arrow_host(schema, input, stream, mr);
+}
+
+std::unique_ptr<column> from_arrow_host_column(ArrowSchema const* schema,
+                                               ArrowDeviceArray const* input,
+                                               rmm::cuda_stream_view stream,
+                                               rmm::mr::device_memory_resource* mr)
+{
+  CUDF_FUNC_RANGE();
+
+  return detail::from_arrow_host_column(schema, input, stream, mr);
+}
+
+std::unique_ptr<table>
from_arrow(ArrowSchema const* schema, + ArrowArray const* input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + + ArrowDeviceArray const device_input = { + .array = *input, + .device_id = -1, + .device_type = ARROW_DEVICE_CPU, + }; + return detail::from_arrow_host(schema, &device_input, stream, mr); +} + +std::unique_ptr from_arrow_column(ArrowSchema const* schema, + ArrowArray const* input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + + ArrowDeviceArray const device_input = { + .array = *input, + .device_id = -1, + .device_type = ARROW_DEVICE_CPU, + }; + return detail::from_arrow_host_column(schema, &device_input, stream, mr); +} + +} // namespace cudf diff --git a/cpp/src/interop/to_arrow_device.cu b/cpp/src/interop/to_arrow_device.cu index f2b1669df9b..ebfd6605977 100644 --- a/cpp/src/interop/to_arrow_device.cu +++ b/cpp/src/interop/to_arrow_device.cu @@ -15,7 +15,6 @@ */ #include "arrow_utilities.hpp" -#include "to_arrow_utilities.hpp" #include #include diff --git a/cpp/src/interop/to_arrow_schema.cpp b/cpp/src/interop/to_arrow_schema.cpp index 6f943593dce..19915464236 100644 --- a/cpp/src/interop/to_arrow_schema.cpp +++ b/cpp/src/interop/to_arrow_schema.cpp @@ -14,7 +14,7 @@ * limitations under the License. */ -#include "to_arrow_utilities.hpp" +#include "arrow_utilities.hpp" #include #include diff --git a/cpp/src/interop/to_arrow_utilities.cpp b/cpp/src/interop/to_arrow_utilities.cpp deleted file mode 100644 index 04d17847273..00000000000 --- a/cpp/src/interop/to_arrow_utilities.cpp +++ /dev/null @@ -1,44 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "to_arrow_utilities.hpp" - -#include - -namespace cudf { -namespace detail { - -ArrowType id_to_arrow_type(cudf::type_id id) -{ - switch (id) { - case cudf::type_id::BOOL8: return NANOARROW_TYPE_BOOL; - case cudf::type_id::INT8: return NANOARROW_TYPE_INT8; - case cudf::type_id::INT16: return NANOARROW_TYPE_INT16; - case cudf::type_id::INT32: return NANOARROW_TYPE_INT32; - case cudf::type_id::INT64: return NANOARROW_TYPE_INT64; - case cudf::type_id::UINT8: return NANOARROW_TYPE_UINT8; - case cudf::type_id::UINT16: return NANOARROW_TYPE_UINT16; - case cudf::type_id::UINT32: return NANOARROW_TYPE_UINT32; - case cudf::type_id::UINT64: return NANOARROW_TYPE_UINT64; - case cudf::type_id::FLOAT32: return NANOARROW_TYPE_FLOAT; - case cudf::type_id::FLOAT64: return NANOARROW_TYPE_DOUBLE; - case cudf::type_id::TIMESTAMP_DAYS: return NANOARROW_TYPE_DATE32; - default: CUDF_FAIL("Unsupported type_id conversion to arrow type", cudf::data_type_error); - } -} - -} // namespace detail -} // namespace cudf diff --git a/cpp/src/interop/to_arrow_utilities.hpp b/cpp/src/interop/to_arrow_utilities.hpp deleted file mode 100644 index 3c01c726a7b..00000000000 --- a/cpp/src/interop/to_arrow_utilities.hpp +++ /dev/null @@ -1,34 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. 
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include - -#include - -namespace cudf { -namespace detail { - -/** - * @brief Map cudf column type id to ArrowType id - * - * @param id Column type id - * @return ArrowType id - */ -ArrowType id_to_arrow_type(cudf::type_id id); - -} // namespace detail -} // namespace cudf diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 42b7f089d61..c6ab8aa021a 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -269,6 +269,7 @@ ConfigureTest( interop/to_arrow_test.cpp interop/from_arrow_test.cpp interop/from_arrow_device_test.cpp + interop/from_arrow_host_test.cpp interop/dlpack_test.cpp EXTRA_LIB nanoarrow diff --git a/cpp/tests/interop/from_arrow_device_test.cpp b/cpp/tests/interop/from_arrow_device_test.cpp index 66bd4dd1bfb..d776ca57ef6 100644 --- a/cpp/tests/interop/from_arrow_device_test.cpp +++ b/cpp/tests/interop/from_arrow_device_test.cpp @@ -49,23 +49,23 @@ TYPED_TEST_SUITE(FromArrowDeviceTestDurationsTest, cudf::test::DurationTypes); TEST_F(FromArrowDeviceTest, FailConditions) { // can't pass null for schema or device array - EXPECT_THROW(cudf::from_arrow_device(nullptr, nullptr), cudf::logic_error); + EXPECT_THROW(cudf::from_arrow_device(nullptr, nullptr), std::invalid_argument); // can't pass null for device array ArrowSchema schema; - EXPECT_THROW(cudf::from_arrow_device(&schema, nullptr), cudf::logic_error); + EXPECT_THROW(cudf::from_arrow_device(&schema, nullptr), std::invalid_argument); // device_type must be CUDA/CUDA_HOST/CUDA_MANAGED // should fail with ARROW_DEVICE_CPU ArrowDeviceArray arr; arr.device_type = ARROW_DEVICE_CPU; - EXPECT_THROW(cudf::from_arrow_device(&schema, &arr), cudf::logic_error); + EXPECT_THROW(cudf::from_arrow_device(&schema, &arr), std::invalid_argument); // can't pass null for schema or device array - EXPECT_THROW(cudf::from_arrow_device_column(nullptr, nullptr), cudf::logic_error); + EXPECT_THROW(cudf::from_arrow_device_column(nullptr, nullptr), std::invalid_argument); // can't pass null for device array - EXPECT_THROW(cudf::from_arrow_device_column(&schema, nullptr), cudf::logic_error); + EXPECT_THROW(cudf::from_arrow_device_column(&schema, nullptr), std::invalid_argument); // device_type must be CUDA/CUDA_HOST/CUDA_MANAGED // should fail with ARROW_DEVICE_CPU - EXPECT_THROW(cudf::from_arrow_device_column(&schema, &arr), cudf::logic_error); + EXPECT_THROW(cudf::from_arrow_device_column(&schema, &arr), std::invalid_argument); } TEST_F(FromArrowDeviceTest, EmptyTable) diff --git a/cpp/tests/interop/from_arrow_host_test.cpp b/cpp/tests/interop/from_arrow_host_test.cpp new file mode 100644 index 00000000000..e6e52099a0c --- /dev/null +++ b/cpp/tests/interop/from_arrow_host_test.cpp @@ -0,0 +1,612 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "nanoarrow_utils.hpp" + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +// create a cudf::table and equivalent arrow table with host memory +std::tuple, nanoarrow::UniqueSchema, nanoarrow::UniqueArray> +get_nanoarrow_host_tables(cudf::size_type length) +{ + auto [table, schema, test_data] = get_nanoarrow_cudf_table(length); + + auto int64_array = get_nanoarrow_array(test_data.int64_data, test_data.validity); + auto string_array = + get_nanoarrow_array(test_data.string_data, test_data.validity); + cudf::dictionary_column_view view(table->get_column(2).view()); + auto keys = cudf::test::to_host(view.keys()).first; + auto indices = cudf::test::to_host(view.indices()).first; + auto dict_array = get_nanoarrow_dict_array(std::vector(keys.begin(), keys.end()), + std::vector(indices.begin(), indices.end()), + test_data.validity); + auto boolarray = get_nanoarrow_array(test_data.bool_data, test_data.bool_validity); + auto list_array = get_nanoarrow_list_array(test_data.list_int64_data, + test_data.list_offsets, + test_data.list_int64_data_validity, + test_data.bool_data_validity); + + nanoarrow::UniqueArray arrow; + NANOARROW_THROW_NOT_OK(ArrowArrayInitFromSchema(arrow.get(), schema.get(), nullptr)); + arrow->length = length; + + int64_array.move(arrow->children[0]); + string_array.move(arrow->children[1]); + dict_array.move(arrow->children[2]); + boolarray.move(arrow->children[3]); + list_array.move(arrow->children[4]); + + int64_array = get_nanoarrow_array(test_data.int64_data, test_data.validity); + string_array = get_nanoarrow_array(test_data.string_data, test_data.validity); + int64_array.move(arrow->children[5]->children[0]); + string_array.move(arrow->children[5]->children[1]); + + ArrowBitmap struct_validity; + ArrowBitmapInit(&struct_validity); + NANOARROW_THROW_NOT_OK(ArrowBitmapReserve(&struct_validity, length)); + ArrowBitmapAppendInt8Unsafe( + &struct_validity, reinterpret_cast(test_data.bool_data_validity.data()), length); + arrow->children[5]->length = length; + ArrowArraySetValidityBitmap(arrow->children[5], &struct_validity); + arrow->children[5]->null_count = + length - ArrowBitCountSet(ArrowArrayValidityBitmap(arrow->children[5])->buffer.data, 0, length); + + ArrowError error; + if (ArrowArrayFinishBuilding(arrow.get(), NANOARROW_VALIDATION_LEVEL_MINIMAL, &error) != + NANOARROW_OK) { + std::cerr << ArrowErrorMessage(&error) << std::endl; + CUDF_FAIL("failed to build example arrays"); + } + + return std::make_tuple(std::move(table), std::move(schema), std::move(arrow)); +} + +struct FromArrowHostDeviceTest : public cudf::test::BaseFixture {}; + +template +struct FromArrowHostDeviceTestDurationsTest : public cudf::test::BaseFixture {}; + +TYPED_TEST_SUITE(FromArrowHostDeviceTestDurationsTest, cudf::test::DurationTypes); + +TEST_F(FromArrowHostDeviceTest, EmptyTable) +{ + auto [tbl, schema, arr] = get_nanoarrow_host_tables(0); + + auto expected_cudf_table = tbl->view(); + ArrowDeviceArray input; + memcpy(&input.array, 
arr.get(), sizeof(ArrowArray)); + input.device_id = -1; + input.device_type = ARROW_DEVICE_CPU; + + auto got_cudf_table = cudf::from_arrow_host(schema.get(), &input); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_cudf_table, got_cudf_table->view()); +} + +TEST_F(FromArrowHostDeviceTest, DateTimeTable) +{ + auto data = std::vector{1, 2, 3, 4, 5, 6}; + auto col = cudf::test::fixed_width_column_wrapper( + data.begin(), data.end()); + cudf::table_view expected_table_view({col}); + + // construct equivalent arrow schema with nanoarrow + nanoarrow::UniqueSchema input_schema; + ArrowSchemaInit(input_schema.get()); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(input_schema.get(), 1)); + ArrowSchemaInit(input_schema->children[0]); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeDateTime( + input_schema->children[0], NANOARROW_TYPE_TIMESTAMP, NANOARROW_TIME_UNIT_MILLI, nullptr)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(input_schema->children[0], "a")); + + // equivalent arrow record batch + nanoarrow::UniqueArray input_array; + NANOARROW_THROW_NOT_OK(ArrowArrayInitFromSchema(input_array.get(), input_schema.get(), nullptr)); + input_array->length = 6; + input_array->null_count = 0; + + auto arr = get_nanoarrow_array(data); + arr.move(input_array->children[0]); + NANOARROW_THROW_NOT_OK( + ArrowArrayFinishBuilding(input_array.get(), NANOARROW_VALIDATION_LEVEL_MINIMAL, nullptr)); + + ArrowDeviceArray input; + memcpy(&input.array, input_array.get(), sizeof(ArrowArray)); + input.device_id = -1; + input.device_type = ARROW_DEVICE_CPU; + + // test that we get the same cudf table as we expect by converting the + // host arrow memory to a cudf table + auto got_cudf_table = cudf::from_arrow_host(input_schema.get(), &input); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_table_view, got_cudf_table->view()); + + // test that we get a cudf table with a single struct column that is equivalent + // if we use from_arrow_host_column + auto got_cudf_col = cudf::from_arrow_host_column(input_schema.get(), &input); + EXPECT_EQ(got_cudf_col->type(), cudf::data_type{cudf::type_id::STRUCT}); + auto got_cudf_col_view = got_cudf_col->view(); + cudf::table_view from_struct{ + std::vector(got_cudf_col_view.child_begin(), got_cudf_col_view.child_end())}; + CUDF_TEST_EXPECT_TABLES_EQUAL(got_cudf_table->view(), from_struct); +} + +TYPED_TEST(FromArrowHostDeviceTestDurationsTest, DurationTable) +{ + using T = TypeParam; + if (cudf::type_to_id() == cudf::type_id::DURATION_DAYS) { return; } + + auto data = {T{1}, T{2}, T{3}, T{4}, T{5}, T{6}}; + auto col = cudf::test::fixed_width_column_wrapper(data); + + cudf::table_view expected_table_view({col}); + const ArrowTimeUnit time_unit = [&] { + switch (cudf::type_to_id()) { + case cudf::type_id::DURATION_SECONDS: return NANOARROW_TIME_UNIT_SECOND; + case cudf::type_id::DURATION_MILLISECONDS: return NANOARROW_TIME_UNIT_MILLI; + case cudf::type_id::DURATION_MICROSECONDS: return NANOARROW_TIME_UNIT_MICRO; + case cudf::type_id::DURATION_NANOSECONDS: return NANOARROW_TIME_UNIT_NANO; + default: CUDF_FAIL("Unsupported duration unit in arrow"); + } + }(); + + nanoarrow::UniqueSchema input_schema; + ArrowSchemaInit(input_schema.get()); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(input_schema.get(), 1)); + + ArrowSchemaInit(input_schema->children[0]); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeDateTime( + input_schema->children[0], NANOARROW_TYPE_DURATION, time_unit, nullptr)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(input_schema->children[0], "a")); + + nanoarrow::UniqueArray input_array; + 
NANOARROW_THROW_NOT_OK(ArrowArrayInitFromSchema(input_array.get(), input_schema.get(), nullptr)); + input_array->length = expected_table_view.num_rows(); + input_array->null_count = 0; + + auto arr = get_nanoarrow_array(data); + arr.move(input_array->children[0]); + NANOARROW_THROW_NOT_OK( + ArrowArrayFinishBuilding(input_array.get(), NANOARROW_VALIDATION_LEVEL_MINIMAL, nullptr)); + + ArrowDeviceArray input; + memcpy(&input.array, input_array.get(), sizeof(ArrowArray)); + input.device_id = -1; + input.device_type = ARROW_DEVICE_CPU; + + // converting arrow host memory to cudf table gives us the expected table + auto got_cudf_table = cudf::from_arrow_host(input_schema.get(), &input); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_table_view, got_cudf_table->view()); + + // converting to a cudf table with a single struct column gives us the expected + // result column + auto got_cudf_col = cudf::from_arrow_host_column(input_schema.get(), &input); + EXPECT_EQ(got_cudf_col->type(), cudf::data_type{cudf::type_id::STRUCT}); + auto got_cudf_col_view = got_cudf_col->view(); + cudf::table_view from_struct{ + std::vector(got_cudf_col_view.child_begin(), got_cudf_col_view.child_end())}; + CUDF_TEST_EXPECT_TABLES_EQUAL(got_cudf_table->view(), from_struct); +} + +TEST_F(FromArrowHostDeviceTest, NestedList) +{ + auto valids = + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 3 != 0; }); + auto col = cudf::test::lists_column_wrapper( + {{{{{1, 2}, valids}, {{3, 4}, valids}, {5}}, {{6}, {{7, 8, 9}, valids}}}, valids}); + cudf::table_view expected_table_view({col}); + + nanoarrow::UniqueSchema input_schema; + ArrowSchemaInit(input_schema.get()); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(input_schema.get(), 1)); + + NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(input_schema->children[0], NANOARROW_TYPE_LIST)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(input_schema->children[0], "a")); + input_schema->children[0]->flags = ARROW_FLAG_NULLABLE; + + NANOARROW_THROW_NOT_OK( + ArrowSchemaInitFromType(input_schema->children[0]->children[0], NANOARROW_TYPE_LIST)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(input_schema->children[0]->children[0], "element")); + input_schema->children[0]->children[0]->flags = 0; + + NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType( + input_schema->children[0]->children[0]->children[0], NANOARROW_TYPE_INT64)); + NANOARROW_THROW_NOT_OK( + ArrowSchemaSetName(input_schema->children[0]->children[0]->children[0], "element")); + input_schema->children[0]->children[0]->children[0]->flags = ARROW_FLAG_NULLABLE; + + // create the base arrow list array + auto list_arr = get_nanoarrow_list_array({6, 7, 8, 9}, {0, 1, 4}, {1, 0, 1, 1}); + std::vector offset{0, 0, 2}; + + // populate the bitmask we're going to use for the top level list + ArrowBitmap mask; + ArrowBitmapInit(&mask); + NANOARROW_THROW_NOT_OK(ArrowBitmapReserve(&mask, 2)); + NANOARROW_THROW_NOT_OK(ArrowBitmapAppend(&mask, 0, 1)); + NANOARROW_THROW_NOT_OK(ArrowBitmapAppend(&mask, 1, 1)); + + nanoarrow::UniqueArray input_array; + EXPECT_EQ(NANOARROW_OK, ArrowArrayInitFromSchema(input_array.get(), input_schema.get(), nullptr)); + input_array->length = expected_table_view.num_rows(); + input_array->null_count = 0; + + ArrowArraySetValidityBitmap(input_array->children[0], &mask); + input_array->children[0]->length = expected_table_view.num_rows(); + input_array->children[0]->null_count = 1; + auto offset_buf = ArrowArrayBuffer(input_array->children[0], 1); + EXPECT_EQ( + NANOARROW_OK, + ArrowBufferAppend( + 
offset_buf, reinterpret_cast(offset.data()), offset.size() * sizeof(int32_t))); + + // move our base list to be the child of the one we just created + // so that we now have an equivalent value to what we created for cudf + list_arr.move(input_array->children[0]->children[0]); + NANOARROW_THROW_NOT_OK( + ArrowArrayFinishBuilding(input_array.get(), NANOARROW_VALIDATION_LEVEL_NONE, nullptr)); + + ArrowDeviceArray input; + memcpy(&input.array, input_array.get(), sizeof(ArrowArray)); + input.device_id = -1; + input.device_type = ARROW_DEVICE_CPU; + + // converting from arrow host memory to cudf gives us the expected table + auto got_cudf_table = cudf::from_arrow_host(input_schema.get(), &input); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_table_view, got_cudf_table->view()); + + // converting to a single column cudf table gives us the expected struct column + auto got_cudf_col = cudf::from_arrow_host_column(input_schema.get(), &input); + EXPECT_EQ(got_cudf_col->type(), cudf::data_type{cudf::type_id::STRUCT}); + auto got_cudf_col_view = got_cudf_col->view(); + cudf::table_view from_struct{ + std::vector(got_cudf_col_view.child_begin(), got_cudf_col_view.child_end())}; + CUDF_TEST_EXPECT_TABLES_EQUAL(got_cudf_table->view(), from_struct); +} + +TEST_F(FromArrowHostDeviceTest, StructColumn) +{ + // Create cudf table + auto nested_type_field_names = + std::vector>{{"string", "integral", "bool", "nested_list", "struct"}}; + auto str_col = + cudf::test::strings_column_wrapper{ + "Samuel Vimes", "Carrot Ironfoundersson", "Angua von Überwald"} + .release(); + auto str_col2 = + cudf::test::strings_column_wrapper{{"CUDF", "ROCKS", "EVERYWHERE"}, {0, 1, 0}}.release(); + int num_rows{str_col->size()}; + auto int_col = cudf::test::fixed_width_column_wrapper{{48, 27, 25}}.release(); + auto int_col2 = + cudf::test::fixed_width_column_wrapper{{12, 24, 47}, {1, 0, 1}}.release(); + auto bool_col = cudf::test::fixed_width_column_wrapper{{true, true, false}}.release(); + auto list_col = + cudf::test::lists_column_wrapper({{{1, 2}, {3, 4}, {5}}, {{{6}}}, {{7}, {8, 9}}}) + .release(); + vector_of_columns cols2; + cols2.push_back(std::move(str_col2)); + cols2.push_back(std::move(int_col2)); + auto [null_mask, null_count] = + cudf::bools_to_mask(cudf::test::fixed_width_column_wrapper{{true, true, false}}); + auto sub_struct_col = + cudf::make_structs_column(num_rows, std::move(cols2), null_count, std::move(*null_mask)); + vector_of_columns cols; + cols.push_back(std::move(str_col)); + cols.push_back(std::move(int_col)); + cols.push_back(std::move(bool_col)); + cols.push_back(std::move(list_col)); + cols.push_back(std::move(sub_struct_col)); + + auto struct_col = cudf::make_structs_column(num_rows, std::move(cols), 0, {}); + cudf::table_view expected_table_view({struct_col->view()}); + + // Create name metadata + auto sub_metadata = cudf::column_metadata{"struct"}; + sub_metadata.children_meta = {{"string2"}, {"integral2"}}; + auto metadata = cudf::column_metadata{"a"}; + metadata.children_meta = {{"string"}, {"integral"}, {"bool"}, {"nested_list"}, sub_metadata}; + + // create the equivalent arrow schema using nanoarrow + nanoarrow::UniqueSchema input_schema; + ArrowSchemaInit(input_schema.get()); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(input_schema.get(), 1)); + + ArrowSchemaInit(input_schema->children[0]); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(input_schema->children[0], 5)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(input_schema->children[0], "a")); + input_schema->children[0]->flags = 0; + + 
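+  // alias the struct's schema node; its five children are initialized below in the
+  // same order as the cudf columns (string, integral, bool, nested_list, struct)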
auto child = input_schema->children[0]; + NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(child->children[0], NANOARROW_TYPE_STRING)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(child->children[0], "string")); + child->children[0]->flags = 0; + + NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(child->children[1], NANOARROW_TYPE_INT32)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(child->children[1], "integral")); + child->children[1]->flags = 0; + + NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(child->children[2], NANOARROW_TYPE_BOOL)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(child->children[2], "bool")); + child->children[2]->flags = 0; + + NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(child->children[3], NANOARROW_TYPE_LIST)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(child->children[3], "nested_list")); + child->children[3]->flags = 0; + NANOARROW_THROW_NOT_OK( + ArrowSchemaInitFromType(child->children[3]->children[0], NANOARROW_TYPE_LIST)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(child->children[3]->children[0], "element")); + child->children[3]->children[0]->flags = 0; + NANOARROW_THROW_NOT_OK( + ArrowSchemaInitFromType(child->children[3]->children[0]->children[0], NANOARROW_TYPE_INT64)); + NANOARROW_THROW_NOT_OK( + ArrowSchemaSetName(child->children[3]->children[0]->children[0], "element")); + child->children[3]->children[0]->children[0]->flags = 0; + + ArrowSchemaInit(child->children[4]); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(child->children[4], 2)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(child->children[4], "struct")); + + NANOARROW_THROW_NOT_OK( + ArrowSchemaInitFromType(child->children[4]->children[0], NANOARROW_TYPE_STRING)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(child->children[4]->children[0], "string2")); + NANOARROW_THROW_NOT_OK( + ArrowSchemaInitFromType(child->children[4]->children[1], NANOARROW_TYPE_INT32)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(child->children[4]->children[1], "integral2")); + + // create nanoarrow table + // first our underlying arrays + std::vector str{"Samuel Vimes", "Carrot Ironfoundersson", "Angua von Überwald"}; + std::vector str2{"CUDF", "ROCKS", "EVERYWHERE"}; + auto str_array = get_nanoarrow_array(str); + auto int_array = get_nanoarrow_array({48, 27, 25}); + auto str2_array = get_nanoarrow_array(str2, {0, 1, 0}); + auto int2_array = get_nanoarrow_array({12, 24, 47}, {1, 0, 1}); + auto bool_array = get_nanoarrow_array({true, true, false}); + auto list_arr = + get_nanoarrow_list_array({1, 2, 3, 4, 5, 6, 7, 8, 9}, {0, 2, 4, 5, 6, 7, 9}); + std::vector offset{0, 3, 4, 6}; + + // create the struct array + nanoarrow::UniqueArray input_array; + NANOARROW_THROW_NOT_OK(ArrowArrayInitFromSchema(input_array.get(), input_schema.get(), nullptr)); + + input_array->length = expected_table_view.num_rows(); + + auto array_a = input_array->children[0]; + auto view_a = expected_table_view.column(0); + array_a->length = view_a.size(); + array_a->null_count = view_a.null_count(); + // populate the children of our struct by moving them from the original arrays + str_array.move(array_a->children[0]); + int_array.move(array_a->children[1]); + bool_array.move(array_a->children[2]); + + array_a->children[3]->length = expected_table_view.num_rows(); + array_a->children[3]->null_count = 0; + auto offset_buf = ArrowArrayBuffer(array_a->children[3], 1); + EXPECT_EQ( + NANOARROW_OK, + ArrowBufferAppend( + offset_buf, reinterpret_cast(offset.data()), offset.size() * sizeof(int32_t))); + + list_arr.move(array_a->children[3]->children[0]); 
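+  // the moved list array supplies the int64 values child of "nested_list"; the
+  // offsets buffer appended above maps the three struct rows onto that data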
+ + // set our struct bitmap validity mask + ArrowBitmap mask; + ArrowBitmapInit(&mask); + NANOARROW_THROW_NOT_OK(ArrowBitmapReserve(&mask, 3)); + NANOARROW_THROW_NOT_OK(ArrowBitmapAppend(&mask, 1, 2)); + NANOARROW_THROW_NOT_OK(ArrowBitmapAppend(&mask, 0, 1)); + + auto array_struct = array_a->children[4]; + auto view_struct = view_a.child(4); + ArrowArraySetValidityBitmap(array_struct, &mask); + array_struct->null_count = view_struct.null_count(); + array_struct->length = view_struct.size(); + + str2_array.move(array_struct->children[0]); + int2_array.move(array_struct->children[1]); + + NANOARROW_THROW_NOT_OK( + ArrowArrayFinishBuilding(input_array.get(), NANOARROW_VALIDATION_LEVEL_NONE, nullptr)); + + ArrowDeviceArray input; + memcpy(&input.array, input_array.get(), sizeof(ArrowArray)); + input.device_id = -1; + input.device_type = ARROW_DEVICE_CPU; + + // test we get the expected cudf::table from the arrow host memory data + auto got_cudf_table = cudf::from_arrow_host(input_schema.get(), &input); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_table_view, got_cudf_table->view()); + + // test we get the expected cudf struct column + auto got_cudf_col = cudf::from_arrow_host_column(input_schema.get(), &input); + EXPECT_EQ(got_cudf_col->type(), cudf::data_type{cudf::type_id::STRUCT}); + auto got_cudf_col_view = got_cudf_col->view(); + cudf::table_view from_struct{ + std::vector(got_cudf_col_view.child_begin(), got_cudf_col_view.child_end())}; + CUDF_TEST_EXPECT_TABLES_EQUAL(got_cudf_table->view(), from_struct); +} + +TEST_F(FromArrowHostDeviceTest, DictionaryIndicesType) +{ + // test dictionary arrays with different index types + // cudf asserts that the index type must be unsigned + auto array1 = + get_nanoarrow_dict_array({1, 2, 5, 7}, {0, 1, 2, 1, 3}, {1, 0, 1, 1, 1}); + auto array2 = + get_nanoarrow_dict_array({1, 2, 5, 7}, {0, 1, 2, 1, 3}, {1, 0, 1, 1, 1}); + auto array3 = + get_nanoarrow_dict_array({1, 2, 5, 7}, {0, 1, 2, 1, 3}, {1, 0, 1, 1, 1}); + + // create equivalent cudf dictionary columns + auto keys_col = cudf::test::fixed_width_column_wrapper({1, 2, 5, 7}); + auto ind1_col = cudf::test::fixed_width_column_wrapper({0, 1, 2, 1, 3}, {1, 0, 1, 1, 1}); + auto ind2_col = + cudf::test::fixed_width_column_wrapper({0, 1, 2, 1, 3}, {1, 0, 1, 1, 1}); + auto ind3_col = + cudf::test::fixed_width_column_wrapper({0, 1, 2, 1, 3}, {1, 0, 1, 1, 1}); + + vector_of_columns columns; + columns.emplace_back(cudf::make_dictionary_column(keys_col, ind1_col)); + columns.emplace_back(cudf::make_dictionary_column(keys_col, ind2_col)); + columns.emplace_back(cudf::make_dictionary_column(keys_col, ind3_col)); + + cudf::table expected_table(std::move(columns)); + + nanoarrow::UniqueSchema input_schema; + ArrowSchemaInit(input_schema.get()); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(input_schema.get(), 3)); + + NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(input_schema->children[0], NANOARROW_TYPE_UINT8)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(input_schema->children[0], "a")); + NANOARROW_THROW_NOT_OK(ArrowSchemaAllocateDictionary(input_schema->children[0])); + NANOARROW_THROW_NOT_OK( + ArrowSchemaInitFromType(input_schema->children[0]->dictionary, NANOARROW_TYPE_INT64)); + + NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(input_schema->children[1], NANOARROW_TYPE_UINT16)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(input_schema->children[1], "b")); + NANOARROW_THROW_NOT_OK(ArrowSchemaAllocateDictionary(input_schema->children[1])); + NANOARROW_THROW_NOT_OK( + 
ArrowSchemaInitFromType(input_schema->children[1]->dictionary, NANOARROW_TYPE_INT64)); + + NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(input_schema->children[2], NANOARROW_TYPE_UINT64)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(input_schema->children[2], "c")); + NANOARROW_THROW_NOT_OK(ArrowSchemaAllocateDictionary(input_schema->children[2])); + NANOARROW_THROW_NOT_OK( + ArrowSchemaInitFromType(input_schema->children[2]->dictionary, NANOARROW_TYPE_INT64)); + + nanoarrow::UniqueArray input_array; + NANOARROW_THROW_NOT_OK(ArrowArrayInitFromSchema(input_array.get(), input_schema.get(), nullptr)); + input_array->length = expected_table.num_rows(); + input_array->null_count = 0; + + array1.move(input_array->children[0]); + array2.move(input_array->children[1]); + array3.move(input_array->children[2]); + + NANOARROW_THROW_NOT_OK( + ArrowArrayFinishBuilding(input_array.get(), NANOARROW_VALIDATION_LEVEL_NONE, nullptr)); + + ArrowDeviceArray input; + memcpy(&input.array, input_array.get(), sizeof(ArrowArray)); + input.device_id = -1; + input.device_type = ARROW_DEVICE_CPU; + + // test we get the expected cudf table when we convert from Arrow host memory + auto got_cudf_table = cudf::from_arrow_host(input_schema.get(), &input); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_table.view(), got_cudf_table->view()); + + // test we get the expected cudf::column as a struct column + auto got_cudf_col = cudf::from_arrow_host_column(input_schema.get(), &input); + EXPECT_EQ(got_cudf_col->type(), cudf::data_type{cudf::type_id::STRUCT}); + auto got_cudf_col_view = got_cudf_col->view(); + cudf::table_view from_struct{ + std::vector(got_cudf_col_view.child_begin(), got_cudf_col_view.child_end())}; + CUDF_TEST_EXPECT_TABLES_EQUAL(got_cudf_table->view(), from_struct); +} + +void slice_host_nanoarrow(ArrowArray* arr, int64_t start, int64_t end) +{ + auto op = [&](ArrowArray* array) { + // slicing only needs to happen at the top level of an array + array->offset = start; + array->length = end - start; + if (array->null_count != 0) { + array->null_count = + array->length - + ArrowBitCountSet(ArrowArrayValidityBitmap(array)->buffer.data, start, end - start); + } + }; + + if (arr->n_children == 0) { + op(arr); + return; + } + + // since we want to simulate a sliced table where the children are sliced, + // we slice each individual child of the record batch + arr->length = end - start; + for (int64_t i = 0; i < arr->n_children; ++i) { + op(arr->children[i]); + } +} + +struct FromArrowHostDeviceTestSlice + : public FromArrowHostDeviceTest, + public ::testing::WithParamInterface> {}; + +TEST_P(FromArrowHostDeviceTestSlice, SliceTest) +{ + auto [table, schema, array] = get_nanoarrow_host_tables(10000); + auto cudf_table_view = table->view(); + auto const [start, end] = GetParam(); + + auto sliced_cudf_table = cudf::slice(cudf_table_view, {start, end})[0]; + auto expected_cudf_table = cudf::table{sliced_cudf_table}; + slice_host_nanoarrow(array.get(), start, end); + + ArrowDeviceArray input; + memcpy(&input.array, array.get(), sizeof(ArrowArray)); + input.device_id = -1; + input.device_type = ARROW_DEVICE_CPU; + + auto got_cudf_table = cudf::from_arrow_host(schema.get(), &input); + if (got_cudf_table->num_rows() == 0 and sliced_cudf_table.num_rows() == 0) { + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(expected_cudf_table.view(), got_cudf_table->view()); + + auto got_cudf_col = cudf::from_arrow_host_column(schema.get(), &input); + EXPECT_EQ(got_cudf_col->type(), cudf::data_type{cudf::type_id::STRUCT}); + auto got_cudf_col_view = 
got_cudf_col->view(); + cudf::table_view from_struct{std::vector(got_cudf_col_view.child_begin(), + got_cudf_col_view.child_end())}; + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(got_cudf_table->view(), from_struct); + } else { + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_cudf_table.view(), got_cudf_table->view()); + + auto got_cudf_col = cudf::from_arrow_host_column(schema.get(), &input); + EXPECT_EQ(got_cudf_col->type(), cudf::data_type{cudf::type_id::STRUCT}); + auto got_cudf_col_view = got_cudf_col->view(); + cudf::table_view from_struct{std::vector(got_cudf_col_view.child_begin(), + got_cudf_col_view.child_end())}; + CUDF_TEST_EXPECT_TABLES_EQUAL(got_cudf_table->view(), from_struct); + } +} + +INSTANTIATE_TEST_CASE_P(FromArrowHostDeviceTest, + FromArrowHostDeviceTestSlice, + ::testing::Values(std::make_tuple(0, 10000), + std::make_tuple(2912, 2915), + std::make_tuple(100, 3000), + std::make_tuple(0, 0), + std::make_tuple(0, 3000), + std::make_tuple(10000, 10000))); diff --git a/cpp/tests/interop/nanoarrow_utils.hpp b/cpp/tests/interop/nanoarrow_utils.hpp index fb5d1060f6f..a79e6fdc49c 100644 --- a/cpp/tests/interop/nanoarrow_utils.hpp +++ b/cpp/tests/interop/nanoarrow_utils.hpp @@ -20,14 +20,61 @@ #include #include #include +#include #include #include #include #include #include +#include #include +struct generated_test_data { + generated_test_data(cudf::size_type length) + : int64_data(length), + bool_data(length), + string_data(length), + validity(length), + bool_validity(length), + list_int64_data(3 * length), + list_int64_data_validity(3 * length), + list_offsets(length + 1) + { + cudf::size_type length_of_individual_list = 3; + + std::generate(int64_data.begin(), int64_data.end(), []() { return rand() % 500000; }); + std::generate(list_int64_data.begin(), list_int64_data.end(), []() { return rand() % 500000; }); + auto validity_generator = []() { return rand() % 7 != 0; }; + std::generate( + list_int64_data_validity.begin(), list_int64_data_validity.end(), validity_generator); + std::generate( + list_offsets.begin(), list_offsets.end(), [length_of_individual_list, n = 0]() mutable { + return (n++) * length_of_individual_list; + }); + std::generate(bool_data.begin(), bool_data.end(), validity_generator); + std::generate( + string_data.begin(), string_data.end(), []() { return rand() % 7 != 0 ? "CUDF" : "Rocks"; }); + std::generate(validity.begin(), validity.end(), validity_generator); + std::generate(bool_validity.begin(), bool_validity.end(), validity_generator); + + std::transform(bool_validity.cbegin(), + bool_validity.cend(), + std::back_inserter(bool_data_validity), + [](auto val) { return static_cast(val); }); + } + + std::vector int64_data; + std::vector bool_data; + std::vector string_data; + std::vector validity; + std::vector bool_validity; + std::vector bool_data_validity; + std::vector list_int64_data; + std::vector list_int64_data_validity; + std::vector list_offsets; +}; + // no-op allocator/deallocator to set into ArrowArray buffers that we don't // want to own their buffers. 
static ArrowBufferAllocator noop_alloc = (struct ArrowBufferAllocator){ @@ -135,7 +182,196 @@ void populate_dict_from_col(ArrowArray* arr, cudf::dictionary_column_view dview) populate_from_col(arr->dictionary, dview.keys()); } +using vector_of_columns = std::vector>; + std::tuple, nanoarrow::UniqueSchema, nanoarrow::UniqueArray> get_nanoarrow_tables(cudf::size_type length = 10000); void populate_list_from_col(ArrowArray* arr, cudf::lists_column_view view); + +std::unique_ptr get_cudf_table(); + +template +struct nanoarrow_storage_type {}; + +#define DEFINE_NANOARROW_STORAGE(T, NanoType) \ + template <> \ + struct nanoarrow_storage_type { \ + static constexpr ArrowType type = NANOARROW_TYPE_##NanoType; \ + } + +DEFINE_NANOARROW_STORAGE(bool, BOOL); +DEFINE_NANOARROW_STORAGE(int64_t, INT64); +DEFINE_NANOARROW_STORAGE(uint16_t, UINT16); +DEFINE_NANOARROW_STORAGE(uint64_t, UINT64); +DEFINE_NANOARROW_STORAGE(cudf::duration_D, INT32); +DEFINE_NANOARROW_STORAGE(cudf::duration_s, INT64); +DEFINE_NANOARROW_STORAGE(cudf::duration_ms, INT64); +DEFINE_NANOARROW_STORAGE(cudf::duration_us, INT64); +DEFINE_NANOARROW_STORAGE(cudf::duration_ns, INT64); +DEFINE_NANOARROW_STORAGE(uint8_t, UINT8); +DEFINE_NANOARROW_STORAGE(int32_t, INT32); + +#undef DEFINE_NANOARROW_STORAGE + +template +std::enable_if_t() and !std::is_same_v, nanoarrow::UniqueArray> +get_nanoarrow_array(std::vector const& data, std::vector const& mask = {}) +{ + nanoarrow::UniqueArray tmp; + NANOARROW_THROW_NOT_OK(ArrowArrayInitFromType(tmp.get(), nanoarrow_storage_type::type)); + + if (!mask.empty()) { + ArrowBitmap bitmap; + ArrowBitmapInit(&bitmap); + NANOARROW_THROW_NOT_OK(ArrowBitmapReserve(&bitmap, mask.size())); + ArrowBitmapAppendInt8Unsafe(&bitmap, reinterpret_cast(mask.data()), mask.size()); + + ArrowArraySetValidityBitmap(tmp.get(), &bitmap); + tmp->null_count = + data.size() - + ArrowBitCountSet(ArrowArrayValidityBitmap(tmp.get())->buffer.data, 0, mask.size()); + } + + ArrowBuffer buf; + ArrowBufferInit(&buf); + NANOARROW_THROW_NOT_OK( + ArrowBufferAppend(&buf, reinterpret_cast(data.data()), sizeof(T) * data.size())); + NANOARROW_THROW_NOT_OK(ArrowArraySetBuffer(tmp.get(), 1, &buf)); + + tmp->length = data.size(); + + return tmp; +} + +template +std::enable_if_t, nanoarrow::UniqueArray> get_nanoarrow_array( + std::vector const& data, std::vector const& mask = {}) +{ + nanoarrow::UniqueArray tmp; + NANOARROW_THROW_NOT_OK(ArrowArrayInitFromType(tmp.get(), NANOARROW_TYPE_BOOL)); + + auto to_arrow_bitmap = [](std::vector const& b) -> ArrowBitmap { + ArrowBitmap out; + ArrowBitmapInit(&out); + NANOARROW_THROW_NOT_OK(ArrowBitmapResize(&out, b.size(), 1)); + out.buffer.size_bytes = (b.size() >> 3) + ((b.size() & 7) != 0); + out.size_bits = b.size(); + + for (size_t i = 0; i < b.size(); ++i) { + ArrowBitSetTo(out.buffer.data, i, static_cast(b[i])); + } + + return out; + }; + + if (!mask.empty()) { + auto validity_bitmap = to_arrow_bitmap(mask); + ArrowArraySetValidityBitmap(tmp.get(), &validity_bitmap); + tmp->null_count = + mask.size() - + ArrowBitCountSet(ArrowArrayValidityBitmap(tmp.get())->buffer.data, 0, mask.size()); + } + + auto raw_buffer = to_arrow_bitmap(data); + NANOARROW_THROW_NOT_OK(ArrowArraySetBuffer(tmp.get(), 1, &raw_buffer.buffer)); + tmp->length = data.size(); + + return tmp; +} + +template +nanoarrow::UniqueArray get_nanoarrow_array(std::initializer_list elements, + std::initializer_list validity = {}) +{ + std::vector mask(validity); + std::vector data(elements); + + return get_nanoarrow_array(data, mask); +} + 
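+// overload for std::string data: builds a NANOARROW_TYPE_STRING array, appending a
+// null element wherever the validity mask holds a 0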
+template +std::enable_if_t, nanoarrow::UniqueArray> get_nanoarrow_array( + std::vector const& data, std::vector const& mask = {}) +{ + nanoarrow::UniqueArray tmp; + NANOARROW_THROW_NOT_OK(ArrowArrayInitFromType(tmp.get(), NANOARROW_TYPE_STRING)); + NANOARROW_THROW_NOT_OK(ArrowArrayStartAppending(tmp.get())); + NANOARROW_THROW_NOT_OK(ArrowArrayReserve(tmp.get(), data.size())); + + for (size_t i = 0; i < data.size(); ++i) { + if (!mask.empty() && mask[i] == 0) { + NANOARROW_THROW_NOT_OK(ArrowArrayAppendNull(tmp.get(), 1)); + } else { + NANOARROW_THROW_NOT_OK(ArrowArrayAppendString(tmp.get(), ArrowCharView(data[i].c_str()))); + } + } + + return tmp; +} + +template +nanoarrow::UniqueArray get_nanoarrow_dict_array(std::vector const& keys, + std::vector const& ind, + std::vector const& validity = {}) +{ + auto indices_array = get_nanoarrow_array(ind, validity); + NANOARROW_THROW_NOT_OK(ArrowArrayAllocateDictionary(indices_array.get())); + + auto keys_array = get_nanoarrow_array(keys); + keys_array.move(indices_array->dictionary); + + return indices_array; +} + +template +nanoarrow::UniqueArray get_nanoarrow_list_array(std::vector const& data, + std::vector const& offsets, + std::vector const& data_validity = {}, + std::vector const& list_validity = {}) +{ + auto data_array = get_nanoarrow_array(data, data_validity); + + nanoarrow::UniqueArray tmp; + NANOARROW_THROW_NOT_OK(ArrowArrayInitFromType(tmp.get(), NANOARROW_TYPE_LIST)); + NANOARROW_THROW_NOT_OK(ArrowArrayAllocateChildren(tmp.get(), 1)); + data_array.move(tmp->children[0]); + + tmp->length = offsets.size() - 1; + if (!list_validity.empty()) { + ArrowBitmap bitmap; + ArrowBitmapInit(&bitmap); + NANOARROW_THROW_NOT_OK(ArrowBitmapReserve(&bitmap, list_validity.size())); + ArrowBitmapAppendInt8Unsafe( + &bitmap, reinterpret_cast(list_validity.data()), list_validity.size()); + + ArrowArraySetValidityBitmap(tmp.get(), &bitmap); + tmp->null_count = + tmp->length - + ArrowBitCountSet(ArrowArrayValidityBitmap(tmp.get())->buffer.data, 0, list_validity.size()); + } + + ArrowBuffer buf; + ArrowBufferInit(&buf); + NANOARROW_THROW_NOT_OK(ArrowBufferAppend( + &buf, reinterpret_cast(offsets.data()), sizeof(int32_t) * offsets.size())); + NANOARROW_THROW_NOT_OK(ArrowArraySetBuffer(tmp.get(), 1, &buf)); + + return tmp; +} + +template +nanoarrow::UniqueArray get_nanoarrow_list_array(std::initializer_list data, + std::initializer_list offsets, + std::initializer_list data_validity = {}, + std::initializer_list list_validity = {}) +{ + std::vector data_vector(data); + std::vector offset(offsets); + std::vector data_mask(data_validity); + std::vector list_mask(list_validity); + return get_nanoarrow_list_array(data_vector, offset, data_mask, list_mask); +} + +std::tuple, nanoarrow::UniqueSchema, generated_test_data> +get_nanoarrow_cudf_table(cudf::size_type length); diff --git a/cpp/tests/interop/to_arrow_device_test.cpp b/cpp/tests/interop/to_arrow_device_test.cpp index 626aeb53cdd..4c73cd637a4 100644 --- a/cpp/tests/interop/to_arrow_device_test.cpp +++ b/cpp/tests/interop/to_arrow_device_test.cpp @@ -38,80 +38,55 @@ #include -using vector_of_columns = std::vector>; - -std::tuple, nanoarrow::UniqueSchema, nanoarrow::UniqueArray> -get_nanoarrow_tables(cudf::size_type length) +std::tuple, nanoarrow::UniqueSchema, generated_test_data> +get_nanoarrow_cudf_table(cudf::size_type length) { - std::vector int64_data(length); - std::vector bool_data(length); - std::vector string_data(length); - std::vector validity(length); - std::vector bool_validity(length); - 
std::vector bool_data_validity; - cudf::size_type length_of_individual_list = 3; - cudf::size_type length_of_list = length_of_individual_list * length; - std::vector list_int64_data(length_of_list); - std::vector list_int64_data_validity(length_of_list); - std::vector list_offsets(length + 1); + generated_test_data test_data(length); std::vector> columns; - std::generate(int64_data.begin(), int64_data.end(), []() { return rand() % 500000; }); - std::generate(list_int64_data.begin(), list_int64_data.end(), []() { return rand() % 500000; }); - auto validity_generator = []() { return rand() % 7 != 0; }; - std::generate( - list_int64_data_validity.begin(), list_int64_data_validity.end(), validity_generator); - std::generate( - list_offsets.begin(), list_offsets.end(), [length_of_individual_list, n = 0]() mutable { - return (n++) * length_of_individual_list; - }); - std::generate(bool_data.begin(), bool_data.end(), validity_generator); - std::generate( - string_data.begin(), string_data.end(), []() { return rand() % 7 != 0 ? "CUDF" : "Rocks"; }); - std::generate(validity.begin(), validity.end(), validity_generator); - std::generate(bool_validity.begin(), bool_validity.end(), validity_generator); - - std::transform(bool_validity.cbegin(), - bool_validity.cend(), - std::back_inserter(bool_data_validity), - [](auto val) { return static_cast(val); }); - - columns.emplace_back(cudf::test::fixed_width_column_wrapper( - int64_data.begin(), int64_data.end(), validity.begin()) + columns.emplace_back(cudf::test::fixed_width_column_wrapper(test_data.int64_data.begin(), + test_data.int64_data.end(), + test_data.validity.begin()) + .release()); + columns.emplace_back(cudf::test::strings_column_wrapper(test_data.string_data.begin(), + test_data.string_data.end(), + test_data.validity.begin()) .release()); - columns.emplace_back( - cudf::test::strings_column_wrapper(string_data.begin(), string_data.end(), validity.begin()) - .release()); auto col4 = cudf::test::fixed_width_column_wrapper( - int64_data.begin(), int64_data.end(), validity.begin()); + test_data.int64_data.begin(), test_data.int64_data.end(), test_data.validity.begin()); auto dict_col = cudf::dictionary::encode(col4); columns.emplace_back(std::move(cudf::dictionary::encode(col4))); - columns.emplace_back(cudf::test::fixed_width_column_wrapper( - bool_data.begin(), bool_data.end(), bool_validity.begin()) + columns.emplace_back(cudf::test::fixed_width_column_wrapper(test_data.bool_data.begin(), + test_data.bool_data.end(), + test_data.bool_validity.begin()) .release()); - auto list_child_column = cudf::test::fixed_width_column_wrapper( - list_int64_data.begin(), list_int64_data.end(), list_int64_data_validity.begin()); - auto list_offsets_column = - cudf::test::fixed_width_column_wrapper(list_offsets.begin(), list_offsets.end()); + auto list_child_column = + cudf::test::fixed_width_column_wrapper(test_data.list_int64_data.begin(), + test_data.list_int64_data.end(), + test_data.list_int64_data_validity.begin()); + auto list_offsets_column = cudf::test::fixed_width_column_wrapper( + test_data.list_offsets.begin(), test_data.list_offsets.end()); auto [list_mask, list_nulls] = cudf::bools_to_mask(cudf::test::fixed_width_column_wrapper( - bool_data_validity.begin(), bool_data_validity.end())); + test_data.bool_data_validity.begin(), test_data.bool_data_validity.end())); columns.emplace_back(cudf::make_lists_column(length, list_offsets_column.release(), list_child_column.release(), list_nulls, std::move(*list_mask))); - auto int_column = 
cudf::test::fixed_width_column_wrapper( - int64_data.begin(), int64_data.end(), validity.begin()) - .release(); + auto int_column = + cudf::test::fixed_width_column_wrapper( + test_data.int64_data.begin(), test_data.int64_data.end(), test_data.validity.begin()) + .release(); auto str_column = - cudf::test::strings_column_wrapper(string_data.begin(), string_data.end(), validity.begin()) + cudf::test::strings_column_wrapper( + test_data.string_data.begin(), test_data.string_data.end(), test_data.validity.begin()) .release(); vector_of_columns cols; cols.push_back(move(int_column)); cols.push_back(move(str_column)); auto [null_mask, null_count] = cudf::bools_to_mask(cudf::test::fixed_width_column_wrapper( - bool_data_validity.begin(), bool_data_validity.end())); + test_data.bool_data_validity.begin(), test_data.bool_data_validity.end())); columns.emplace_back( cudf::make_structs_column(length, std::move(cols), null_count, std::move(*null_mask))); @@ -198,21 +173,30 @@ get_nanoarrow_tables(cudf::size_type length) schema->children[5]->flags = 0; } + return std::make_tuple( + std::make_unique(std::move(columns)), std::move(schema), std::move(test_data)); +} + +std::tuple, nanoarrow::UniqueSchema, nanoarrow::UniqueArray> +get_nanoarrow_tables(cudf::size_type length) +{ + auto [table, schema, test_data] = get_nanoarrow_cudf_table(length); + nanoarrow::UniqueArray arrow; NANOARROW_THROW_NOT_OK(ArrowArrayInitFromSchema(arrow.get(), schema.get(), nullptr)); arrow->length = length; - populate_from_col(arrow->children[0], columns[0]->view()); - populate_from_col(arrow->children[1], columns[1]->view()); - populate_dict_from_col(arrow->children[2], - cudf::dictionary_column_view(columns[2]->view())); + populate_from_col(arrow->children[0], table->get_column(0).view()); + populate_from_col(arrow->children[1], table->get_column(1).view()); + populate_dict_from_col( + arrow->children[2], cudf::dictionary_column_view(table->get_column(2).view())); - populate_from_col(arrow->children[3], columns[3]->view()); - cudf::lists_column_view list_view{columns[4]->view()}; + populate_from_col(arrow->children[3], table->get_column(3).view()); + cudf::lists_column_view list_view{table->get_column(4).view()}; populate_list_from_col(arrow->children[4], list_view); populate_from_col(arrow->children[4]->children[0], list_view.child()); - cudf::structs_column_view struct_view{columns[5]->view()}; + cudf::structs_column_view struct_view{table->get_column(5).view()}; populate_from_col(arrow->children[5]->children[0], struct_view.child(0)); populate_from_col(arrow->children[5]->children[1], struct_view.child(1)); arrow->children[5]->length = struct_view.size(); @@ -231,8 +215,7 @@ get_nanoarrow_tables(cudf::size_type length) CUDF_FAIL("failed to build example arrays"); } - return std::make_tuple( - std::make_unique(std::move(columns)), std::move(schema), std::move(arrow)); + return std::make_tuple(std::move(table), std::move(schema), std::move(arrow)); } // populate an ArrowArray list array from device buffers using a no-op From 12336da6ff3ae819635524127e65c0bfde0f3915 Mon Sep 17 00:00:00 2001 From: Paul Mattione <156858817+pmattione-nvidia@users.noreply.github.com> Date: Wed, 29 May 2024 14:47:51 -0400 Subject: [PATCH 017/340] Utilities for decimal <--> floating conversion (#15359) These are some utilities used by the upcoming decimal <--> floating conversion PR. This has been submitted separately from that PR in order to spread out the complexity for review. These functions are not called by any code in this PR. 
One function is used to extract the components of the floating point number. Another function is used to set a floating point's sign bit and add some additional powers of two. These are done using integer and bit operations, which is much faster than using the built-in functions and bottle-necking on the FP64 pipeline. The final function is used to count the # of significant bits in a number. Authors: - Paul Mattione (https://github.com/pmattione-nvidia) - Lawrence Mitchell (https://github.com/wence-) Approvers: - Mark Harris (https://github.com/harrism) - Bradley Dice (https://github.com/bdice) - Mike Wilson (https://github.com/hyperbolic2346) URL: https://github.com/rapidsai/cudf/pull/15359 --- .../cudf/fixed_point/floating_conversion.hpp | 241 ++++++++++++++++++ 1 file changed, 241 insertions(+) diff --git a/cpp/include/cudf/fixed_point/floating_conversion.hpp b/cpp/include/cudf/fixed_point/floating_conversion.hpp index 492f7e75219..2c3a5c5629d 100644 --- a/cpp/include/cudf/fixed_point/floating_conversion.hpp +++ b/cpp/include/cudf/fixed_point/floating_conversion.hpp @@ -16,8 +16,13 @@ #pragma once +#include + +#include #include +#include + namespace numeric { /** @@ -29,6 +34,242 @@ namespace numeric { namespace detail { +/** + * @brief Helper struct for getting and setting the components of a floating-point value + * + * @tparam FloatingType Type of floating-point value + */ +template )> +struct floating_converter { + // This struct assumes we're working with IEEE 754 floating-point values. + // Details on the IEEE-754 floating-point format: + // Format: https://learn.microsoft.com/en-us/cpp/build/ieee-floating-point-representation + // Float Visualizer: https://www.h-schmidt.net/FloatConverter/IEEE754.html + static_assert(cuda::std::numeric_limits::is_iec559, "Assumes IEEE 754"); + + /// Unsigned int type with same size as floating type + using IntegralType = + cuda::std::conditional_t, uint32_t, uint64_t>; + + // The high bit is the sign bit (0 for positive, 1 for negative). + /// How many bits in the floating type + static constexpr int num_floating_bits = sizeof(FloatingType) * CHAR_BIT; + /// The index of the sign bit + static constexpr int sign_bit_index = num_floating_bits - 1; + /// The mask to select the sign bit + static constexpr IntegralType sign_mask = (IntegralType(1) << sign_bit_index); + + // The low 23 / 52 bits (for float / double) are the mantissa. + // The mantissa is normalized. There is an understood 1 bit to the left of the binary point. + // The value of the mantissa is in the range [1, 2). + /// # mantissa bits (-1 for understood bit) + static constexpr int num_mantissa_bits = cuda::std::numeric_limits::digits - 1; + /// The mask for the understood bit + static constexpr IntegralType understood_bit_mask = (IntegralType(1) << num_mantissa_bits); + /// The mask to select the mantissa + static constexpr IntegralType mantissa_mask = understood_bit_mask - 1; + + // And in between are the bits used to store the biased power-of-2 exponent. + /// # exponents bits (-1 for sign bit) + static constexpr int num_exponent_bits = num_floating_bits - num_mantissa_bits - 1; + /// The mask for the exponents, unshifted + static constexpr IntegralType unshifted_exponent_mask = + (IntegralType(1) << num_exponent_bits) - 1; + /// The mask to select the exponents + static constexpr IntegralType exponent_mask = unshifted_exponent_mask << num_mantissa_bits; + + // To store positive and negative exponents as unsigned values, the stored value for + // the power-of-2 is exponent + bias. 
The bias is 127 for floats and 1023 for doubles. + /// 127 / 1023 for float / double + static constexpr IntegralType exponent_bias = + cuda::std::numeric_limits::max_exponent - 1; + + /** + * @brief Reinterpret the bits of a floating-point value as an integer + * + * @param floating The floating-point value to cast + * @return An integer with bits identical to the input + */ + CUDF_HOST_DEVICE inline static IntegralType bit_cast_to_integer(FloatingType floating) + { + // Convert floating to integer + IntegralType integer_rep; + memcpy(&integer_rep, &floating, sizeof(floating)); + return integer_rep; + } + + /** + * @brief Reinterpret the bits of an integer as floating-point value + * + * @param integer The integer to cast + * @return A floating-point value with bits identical to the input + */ + CUDF_HOST_DEVICE inline static FloatingType bit_cast_to_floating(IntegralType integer) + { + // Convert back to float + FloatingType floating; + memcpy(&floating, &integer, sizeof(floating)); + return floating; + } + + /** + * @brief Extracts the integral significand of a bit-casted floating-point number + * + * @param integer_rep The bit-casted floating value to extract the exponent from + * @return The integral significand, bit-shifted to a (large) whole number + */ + CUDF_HOST_DEVICE inline static IntegralType get_base2_value(IntegralType integer_rep) + { + // Extract the significand, setting the high bit for the understood 1/2 + return (integer_rep & mantissa_mask) | understood_bit_mask; + } + + /** + * @brief Extracts the sign bit of a bit-casted floating-point number + * + * @param integer_rep The bit-casted floating value to extract the exponent from + * @return The sign bit + */ + CUDF_HOST_DEVICE inline static bool get_is_negative(IntegralType integer_rep) + { + // Extract the sign bit: + return static_cast(sign_mask & integer_rep); + } + + /** + * @brief Extracts the exponent of a bit-casted floating-point number + * + * @note This returns INT_MIN for +/-0, +/-inf, NaN's, and denormals + * For all of these cases, the decimal fixed_point number should be set to zero + * + * @param integer_rep The bit-casted floating value to extract the exponent from + * @return The stored base-2 exponent, or INT_MIN for special values + */ + CUDF_HOST_DEVICE inline static int get_exp2(IntegralType integer_rep) + { + // First extract the exponent bits and handle its special values. + // To minimize branching, all of these special cases will return INT_MIN. + // For all of these cases, the decimal fixed_point number should be set to zero. + auto const exponent_bits = integer_rep & exponent_mask; + if (exponent_bits == 0) { + // Because of the understood set-bit not stored in the mantissa, it is not possible + // to store the value zero directly. Instead both +/-0 and denormals are represented with + // the exponent bits set to zero. + // Thus it's fastest to just floor (generally unwanted) denormals to zero. + return INT_MIN; + } else if (exponent_bits == exponent_mask) { + //+/-inf and NaN values are stored with all of the exponent bits set. + // As none of these are representable by integers, we'll return the same value for all cases. + return INT_MIN; + } + + // Extract the exponent value: shift the bits down and subtract the bias. 
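+ // Worked example for double: shifted bits of 1028 (0x404) minus the 1023 bias give exp2 = 5.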
+ using SignedIntegralType = cuda::std::make_signed_t; + SignedIntegralType const shifted_exponent_bits = exponent_bits >> num_mantissa_bits; + return shifted_exponent_bits - static_cast(exponent_bias); + } + + /** + * @brief Sets the sign bit of a positive floating-point number + * + * @param floating The floating-point value to set the sign of. Must be positive. + * @param is_negative The sign bit to set for the floating-point number + * @return The input floating-point value with the chosen sign + */ + CUDF_HOST_DEVICE inline static FloatingType set_is_negative(FloatingType floating, + bool is_negative) + { + // Convert floating to integer + IntegralType integer_rep = bit_cast_to_integer(floating); + + // Set the sign bit. Note that the input floating-point number must be positive (bit = 0). + integer_rep |= (IntegralType(is_negative) << sign_bit_index); + + // Convert back to float + return bit_cast_to_floating(integer_rep); + } + + /** + * @brief Adds to the base-2 exponent of a floating-point number + * + * @param floating The floating value to add to the exponent of. Must be positive. + * @param exp2 The power-of-2 to add to the floating-point number + * @return The input floating-point value * 2^exp2 + */ + CUDF_HOST_DEVICE inline static FloatingType add_exp2(FloatingType floating, int exp2) + { + // Convert floating to integer + auto integer_rep = bit_cast_to_integer(floating); + + // Extract the currently stored (biased) exponent + auto exponent_bits = integer_rep & exponent_mask; + auto stored_exp2 = exponent_bits >> num_mantissa_bits; + + // Add the additional power-of-2 + stored_exp2 += exp2; + + // Check for exponent over/under-flow. + // Note that the input floating-point number is always positive, so we don't have to + // worry about the sign here; the sign will be set later in set_is_negative() + if (stored_exp2 <= 0) { + return 0.0; + } else if (stored_exp2 >= unshifted_exponent_mask) { + return cuda::std::numeric_limits::infinity(); + } else { + // Clear existing exponent bits and set new ones + exponent_bits = stored_exp2 << num_mantissa_bits; + integer_rep &= (~exponent_mask); + integer_rep |= exponent_bits; + + // Convert back to float + return bit_cast_to_floating(integer_rep); + } + } +}; + +/** + * @brief Determine the number of significant bits in an integer + * + * @tparam T Type of input integer value. 
Must be either uint32_t, uint64_t, or __uint128_t + * @param value The integer whose bits are being counted + * @return The number of significant bits: the # of bits - # of leading zeroes + */ +template || std::is_same_v || + std::is_same_v)> +CUDF_HOST_DEVICE inline int count_significant_bits(T value) +{ +#ifdef __CUDA_ARCH__ + if constexpr (std::is_same_v) { + return 64 - __clzll(static_cast(value)); + } else if constexpr (std::is_same_v) { + return 32 - __clz(static_cast(value)); + } else if constexpr (std::is_same_v) { + // 128 bit type, must break up into high and low components + auto const high_bits = static_cast(value >> 64); + auto const low_bits = static_cast(value); + return 128 - (__clzll(high_bits) + static_cast(high_bits == 0) * __clzll(low_bits)); + } +#else + // Undefined behavior to call __builtin_clzll() with zero in gcc and clang + if (value == 0) { return 0; } + + if constexpr (std::is_same_v) { + return 64 - __builtin_clzll(value); + } else if constexpr (std::is_same_v) { + return 32 - __builtin_clz(value); + } else if constexpr (std::is_same_v) { + // 128 bit type, must break up into high and low components + auto const high_bits = static_cast(value >> 64); + if (high_bits == 0) { + return 64 - __builtin_clzll(static_cast(value)); + } else { + return 128 - __builtin_clzll(high_bits); + } + } +#endif +} + /** * @brief Recursively calculate a signed large power of 10 (>= 10^19) that can only be stored in an * 128bit integer From 3a75f6db18c911d93727d12a0cf5abcdad22efda Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 29 May 2024 15:10:55 -0700 Subject: [PATCH 018/340] Use rapids-build-backend. (#15245) This PR uses `rapids-build-backend` to simplify wheel builds and reduce the complexity of various CI/build scripts. See also: - https://github.com/rapidsai/rapids-build-backend - https://github.com/rapidsai/build-planning/issues/31 Authors: - Vyas Ramasubramani (https://github.com/vyasr) - Bradley Dice (https://github.com/bdice) - James Lamb (https://github.com/jameslamb) Approvers: - Lawrence Mitchell (https://github.com/wence-) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/15245 --- .pre-commit-config.yaml | 2 +- build.sh | 2 +- ci/build_python.sh | 17 ++-- ci/build_wheel.sh | 46 +---------- ci/build_wheel_cudf.sh | 2 +- ci/build_wheel_dask_cudf.sh | 2 +- ci/release/update-version.sh | 4 +- .../all_cuda-118_arch-x86_64.yaml | 6 +- .../all_cuda-122_arch-x86_64.yaml | 6 +- conda/recipes/cudf/meta.yaml | 1 + conda/recipes/cudf_kafka/meta.yaml | 1 + conda/recipes/custreamz/meta.yaml | 4 +- conda/recipes/dask-cudf/meta.yaml | 4 +- dependencies.yaml | 79 ++++++++++++------- python/cudf/cudf/_version.py | 19 ++++- python/cudf/cudf/tests/test_version.py | 12 +++ python/cudf/pyproject.toml | 24 ++++-- python/cudf_kafka/cudf_kafka/_version.py | 16 +++- python/cudf_kafka/pyproject.toml | 22 ++++-- python/cudf_polars/cudf_polars/_version.py | 21 +++++ python/cudf_polars/pyproject.toml | 10 ++- python/custreamz/custreamz/_version.py | 16 +++- .../custreamz/custreamz/tests/test_version.py | 12 +++ python/custreamz/pyproject.toml | 12 ++- python/dask_cudf/dask_cudf/_version.py | 16 +++- .../dask_cudf/dask_cudf/tests/test_version.py | 13 +++ python/dask_cudf/pyproject.toml | 14 +++- 27 files changed, 251 insertions(+), 132 deletions(-) create mode 100644 python/cudf/cudf/tests/test_version.py create mode 100644 python/cudf_polars/cudf_polars/_version.py create mode 100644 python/custreamz/custreamz/tests/test_version.py create mode 
100644 python/dask_cudf/dask_cudf/tests/test_version.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 2d3ffc287e9..8865fb48e0d 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -129,7 +129,7 @@ repos: ^CHANGELOG.md$ ) - repo: https://github.com/rapidsai/dependency-file-generator - rev: v1.13.4 + rev: v1.13.11 hooks: - id: rapids-dependency-file-generator args: ["--clean"] diff --git a/build.sh b/build.sh index 43bb04f7a18..4291c88ea12 100755 --- a/build.sh +++ b/build.sh @@ -70,7 +70,7 @@ BUILD_PER_THREAD_DEFAULT_STREAM=OFF BUILD_REPORT_METRICS=OFF BUILD_REPORT_INCL_CACHE_STATS=OFF USE_PROPRIETARY_NVCOMP=ON -PYTHON_ARGS_FOR_INSTALL="-m pip install --no-build-isolation --no-deps" +PYTHON_ARGS_FOR_INSTALL="-m pip install --no-build-isolation --no-deps --config-settings rapidsai.disable-cuda=true" # Set defaults for vars that may not have been defined externally # FIXME: if INSTALL_PREFIX is not set, check PREFIX, then check diff --git a/ci/build_python.sh b/ci/build_python.sh index 3c2a7761e1a..79e09432779 100755 --- a/ci/build_python.sh +++ b/ci/build_python.sh @@ -13,14 +13,7 @@ export CMAKE_GENERATOR=Ninja rapids-print-env -package_dir="python" -version=$(rapids-generate-version) -commit=$(git rev-parse HEAD) - -echo "${version}" > VERSION -for package_name in cudf dask_cudf cudf_kafka custreamz; do - sed -i "/^__git_commit__/ s/= .*/= \"${commit}\"/g" ${package_dir}/${package_name}/${package_name}/_version.py -done +rapids-generate-version > ./VERSION rapids-logger "Begin py build" @@ -29,24 +22,24 @@ CPP_CHANNEL=$(rapids-download-conda-from-s3 cpp) # TODO: Remove `--no-test` flag once importing on a CPU # node works correctly # With boa installed conda build forwards to the boa builder -RAPIDS_PACKAGE_VERSION=${version} rapids-conda-retry mambabuild \ +RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild \ --no-test \ --channel "${CPP_CHANNEL}" \ conda/recipes/cudf -RAPIDS_PACKAGE_VERSION=${version} rapids-conda-retry mambabuild \ +RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild \ --no-test \ --channel "${CPP_CHANNEL}" \ --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \ conda/recipes/dask-cudf -RAPIDS_PACKAGE_VERSION=${version} rapids-conda-retry mambabuild \ +RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild \ --no-test \ --channel "${CPP_CHANNEL}" \ --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \ conda/recipes/cudf_kafka -RAPIDS_PACKAGE_VERSION=${version} rapids-conda-retry mambabuild \ +RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild \ --no-test \ --channel "${CPP_CHANNEL}" \ --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \ diff --git a/ci/build_wheel.sh b/ci/build_wheel.sh index c4b794e81f7..7c1fa705faa 100755 --- a/ci/build_wheel.sh +++ b/ci/build_wheel.sh @@ -3,54 +3,12 @@ set -euo pipefail -package_name=$1 -package_dir=$2 +package_dir=$1 source rapids-configure-sccache source rapids-date-string -version=$(rapids-generate-version) -commit=$(git rev-parse HEAD) - -RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" - -# This is the version of the suffix with a preceding hyphen. It's used -# everywhere except in the final wheel name. -PACKAGE_CUDA_SUFFIX="-${RAPIDS_PY_CUDA_SUFFIX}" - -# Patch project metadata files to include the CUDA version suffix and version override. 
-pyproject_file="${package_dir}/pyproject.toml" - -sed -i "s/^name = \"${package_name}\"/name = \"${package_name}${PACKAGE_CUDA_SUFFIX}\"/g" ${pyproject_file} -echo "${version}" > VERSION -sed -i "/^__git_commit__/ s/= .*/= \"${commit}\"/g" "${package_dir}/${package_name//-/_}/_version.py" - -# For nightlies we want to ensure that we're pulling in alphas as well. The -# easiest way to do so is to augment the spec with a constraint containing a -# min alpha version that doesn't affect the version bounds but does allow usage -# of alpha versions for that dependency without --pre -alpha_spec='' -if ! rapids-is-release-build; then - alpha_spec=',>=0.0.0a0' -fi - -if [[ ${package_name} == "dask-cudf" ]]; then - sed -r -i "s/cudf==(.*)\"/cudf${PACKAGE_CUDA_SUFFIX}==\1${alpha_spec}\"/g" ${pyproject_file} - sed -r -i "s/dask-cuda==(.*)\"/dask-cuda==\1${alpha_spec}\"/g" ${pyproject_file} - sed -r -i "s/rapids-dask-dependency==(.*)\"/rapids-dask-dependency==\1${alpha_spec}\"/g" ${pyproject_file} -else - sed -r -i "s/rmm(.*)\"/rmm${PACKAGE_CUDA_SUFFIX}\1${alpha_spec}\"/g" ${pyproject_file} - # ptxcompiler and cubinlinker aren't version constrained - sed -r -i "s/ptxcompiler\"/ptxcompiler${PACKAGE_CUDA_SUFFIX}\"/g" ${pyproject_file} - sed -r -i "s/cubinlinker\"/cubinlinker${PACKAGE_CUDA_SUFFIX}\"/g" ${pyproject_file} -fi - -if [[ $PACKAGE_CUDA_SUFFIX == "-cu12" ]]; then - sed -i "s/cuda-python[<=>\.,0-9a]*/cuda-python>=12.0,<13.0a0/g" ${pyproject_file} - sed -i "s/cupy-cuda11x/cupy-cuda12x/g" ${pyproject_file} - sed -i "s/ptxcompiler/pynvjitlink/g" ${pyproject_file} - sed -i "/cubinlinker/d" ${pyproject_file} -fi +rapids-generate-version > ./VERSION cd "${package_dir}" diff --git a/ci/build_wheel_cudf.sh b/ci/build_wheel_cudf.sh index f0886a28fd9..1b563bc499c 100755 --- a/ci/build_wheel_cudf.sh +++ b/ci/build_wheel_cudf.sh @@ -7,7 +7,7 @@ package_dir="python/cudf" export SKBUILD_CMAKE_ARGS="-DUSE_LIBARROW_FROM_PYARROW=ON" -./ci/build_wheel.sh cudf ${package_dir} +./ci/build_wheel.sh ${package_dir} python -m auditwheel repair -w ${package_dir}/final_dist ${package_dir}/dist/* diff --git a/ci/build_wheel_dask_cudf.sh b/ci/build_wheel_dask_cudf.sh index 150fec4e2d7..eb2a91289f7 100755 --- a/ci/build_wheel_dask_cudf.sh +++ b/ci/build_wheel_dask_cudf.sh @@ -5,7 +5,7 @@ set -euo pipefail package_dir="python/dask_cudf" -./ci/build_wheel.sh dask-cudf ${package_dir} +./ci/build_wheel.sh ${package_dir} RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" RAPIDS_PY_WHEEL_NAME="dask_cudf_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-upload-wheels-to-s3 ${package_dir}/dist diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index beeb130f0f1..f629de64905 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -58,10 +58,10 @@ DEPENDENCIES=( ) for DEP in "${DEPENDENCIES[@]}"; do for FILE in dependencies.yaml conda/environments/*.yaml; do - sed_runner "/-.* ${DEP}\(-cu[[:digit:]]\{2\}\)\{0,1\}==/ s/==.*/==${NEXT_SHORT_TAG_PEP440}.*/g" "${FILE}" + sed_runner "/-.* ${DEP}\(-cu[[:digit:]]\{2\}\)\{0,1\}==/ s/==.*/==${NEXT_SHORT_TAG_PEP440}.*,>=0.0.0a0/g" "${FILE}" done for FILE in python/*/pyproject.toml; do - sed_runner "/\"${DEP}==/ s/==.*\"/==${NEXT_SHORT_TAG_PEP440}.*\"/g" ${FILE} + sed_runner "/\"${DEP}==/ s/==.*\"/==${NEXT_SHORT_TAG_PEP440}.*,>=0.0.0a0\"/g" ${FILE} done done diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 985f873e5eb..946e2d1cd32 100644 --- 
a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -27,6 +27,7 @@ dependencies: - cxx-compiler - cython>=3.0.3 - dask-cuda==24.8.* +- dask-cuda==24.8.*,>=0.0.0a0 - dlpack>=0.8,<1.0 - doxygen=1.9.1 - fastavro>=0.22.9 @@ -76,9 +77,10 @@ dependencies: - python-confluent-kafka>=1.9.0,<1.10.0a0 - python>=3.9,<3.12 - pytorch>=2.1.0 -- rapids-dask-dependency==24.8.* +- rapids-build-backend>=0.3.0,<0.4.0.dev0 +- rapids-dask-dependency==24.8.*,>=0.0.0a0 - rich -- rmm==24.8.* +- rmm==24.8.*,>=0.0.0a0 - s3fs>=2022.3.0 - scikit-build-core>=0.7.0 - scipy diff --git a/conda/environments/all_cuda-122_arch-x86_64.yaml b/conda/environments/all_cuda-122_arch-x86_64.yaml index 3083d1dbb03..f069616ddbe 100644 --- a/conda/environments/all_cuda-122_arch-x86_64.yaml +++ b/conda/environments/all_cuda-122_arch-x86_64.yaml @@ -28,6 +28,7 @@ dependencies: - cxx-compiler - cython>=3.0.3 - dask-cuda==24.8.* +- dask-cuda==24.8.*,>=0.0.0a0 - dlpack>=0.8,<1.0 - doxygen=1.9.1 - fastavro>=0.22.9 @@ -74,9 +75,10 @@ dependencies: - python-confluent-kafka>=1.9.0,<1.10.0a0 - python>=3.9,<3.12 - pytorch>=2.1.0 -- rapids-dask-dependency==24.8.* +- rapids-build-backend>=0.3.0,<0.4.0.dev0 +- rapids-dask-dependency==24.8.*,>=0.0.0a0 - rich -- rmm==24.8.* +- rmm==24.8.*,>=0.0.0a0 - s3fs>=2022.3.0 - scikit-build-core>=0.7.0 - scipy diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index e7245e67659..3cdc2050631 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -61,6 +61,7 @@ requirements: host: - python - cython >=3.0.3 + - rapids-build-backend >=0.3.0,<0.4.0.dev0 - scikit-build-core >=0.7.0 - dlpack >=0.8,<1.0 - numpy 1.23 diff --git a/conda/recipes/cudf_kafka/meta.yaml b/conda/recipes/cudf_kafka/meta.yaml index 4d91cf6320c..1b0e0e2c236 100644 --- a/conda/recipes/cudf_kafka/meta.yaml +++ b/conda/recipes/cudf_kafka/meta.yaml @@ -60,6 +60,7 @@ requirements: - cuda-version ={{ cuda_version }} - cudf ={{ version }} - libcudf_kafka ={{ version }} + - rapids-build-backend >=0.3.0,<0.4.0.dev0 - scikit-build-core >=0.7.0 {% if cuda_major != "11" %} - cuda-cudart-dev diff --git a/conda/recipes/custreamz/meta.yaml b/conda/recipes/custreamz/meta.yaml index 755394e3936..f5ea426e0b1 100644 --- a/conda/recipes/custreamz/meta.yaml +++ b/conda/recipes/custreamz/meta.yaml @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. {% set version = environ['RAPIDS_PACKAGE_VERSION'].lstrip('v') %} {% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %} @@ -37,6 +37,8 @@ build: requirements: host: - python + - rapids-build-backend >=0.3.0,<0.4.0.dev0 + - setuptools - python-confluent-kafka >=1.9.0,<1.10.0a0 - cudf_kafka ={{ version }} - cuda-version ={{ cuda_version }} diff --git a/conda/recipes/dask-cudf/meta.yaml b/conda/recipes/dask-cudf/meta.yaml index 16638926492..1e6c0a35a09 100644 --- a/conda/recipes/dask-cudf/meta.yaml +++ b/conda/recipes/dask-cudf/meta.yaml @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. {% set version = environ['RAPIDS_PACKAGE_VERSION'].lstrip('v') %} {% set minor_version = version.split('.')[0] + '.' 
+ version.split('.')[1] %} @@ -37,6 +37,8 @@ build: requirements: host: - python + - rapids-build-backend >=0.3.0,<0.4.0.dev0 + - setuptools - cuda-version ={{ cuda_version }} run: - python diff --git a/dependencies.yaml b/dependencies.yaml index 3df7cb71a78..8bfa3190b3d 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -9,7 +9,6 @@ files: - build_base - build_all - build_cpp - - build_wheels - build_python_common - build_python_cudf - cuda @@ -19,6 +18,8 @@ files: - libarrow_build - notebooks - py_version + - rapids_build_skbuild + - rapids_build_setuptools - run_common - run_cudf - run_dask_cudf @@ -75,11 +76,19 @@ files: - docs - libarrow_run - py_version - py_build_cudf: + py_rapids_build_cudf: output: pyproject pyproject_dir: python/cudf extras: table: build-system + includes: + - rapids_build_skbuild + py_build_cudf: + output: pyproject + pyproject_dir: python/cudf + extras: + table: tool.rapids-build-backend + key: requires includes: - build_base - build_python_common @@ -119,13 +128,13 @@ files: key: cudf-pandas-tests includes: - test_python_cudf_pandas - py_build_cudf_polars: + py_rapids_build_cudf_polars: output: pyproject pyproject_dir: python/cudf_polars extras: table: build-system includes: - - build_wheels + - rapids_build_setuptools py_run_cudf_polars: output: pyproject pyproject_dir: python/cudf_polars @@ -148,7 +157,7 @@ files: extras: table: build-system includes: - - build_wheels + - rapids_build_setuptools py_run_dask_cudf: output: pyproject pyproject_dir: python/dask_cudf @@ -168,11 +177,19 @@ files: includes: - test_python_common - test_python_dask_cudf - py_build_cudf_kafka: + py_rapids_build_cudf_kafka: output: pyproject pyproject_dir: python/cudf_kafka extras: table: build-system + includes: + - rapids_build_skbuild + py_build_cudf_kafka: + output: pyproject + pyproject_dir: python/cudf_kafka + extras: + table: tool.rapids-build-backend + key: requires includes: - build_base - build_python_common @@ -197,7 +214,7 @@ files: extras: table: build-system includes: - - build_wheels + - rapids_build_setuptools py_run_custreamz: output: pyproject pyproject_dir: python/custreamz @@ -276,12 +293,24 @@ dependencies: # Align nvcomp version with rapids-cmake - nvcomp==3.0.6 - spdlog>=1.12.0,<1.13 - build_wheels: + rapids_build_skbuild: + common: + - output_types: [conda, requirements, pyproject] + packages: + - &rapids_build_backend rapids-build-backend>=0.3.0,<0.4.0.dev0 + - output_types: conda + packages: + - scikit-build-core>=0.7.0 + - output_types: [requirements, pyproject] + packages: + - scikit-build-core[pyproject]>=0.7.0 + rapids_build_setuptools: common: - output_types: [requirements, pyproject] packages: - - wheel + - *rapids_build_backend - setuptools + - wheel build_python_common: common: - output_types: [conda, requirements, pyproject] @@ -290,22 +319,16 @@ dependencies: # Hard pin the patch version used during the build. This must be kept # in sync with the version pinned in get_arrow.cmake. - pyarrow==16.1.0.* - - output_types: conda - packages: - - scikit-build-core>=0.7.0 - output_types: pyproject packages: # Hard pin the patch version used during the build. # Sync with conda build constraint & wheel run constraint. 
- numpy==1.23.* - - output_types: [requirements, pyproject] - packages: - - scikit-build-core[pyproject]>=0.7.0 build_python_cudf: common: - output_types: conda packages: - - &rmm_conda rmm==24.8.* + - &rmm_conda rmm==24.8.*,>=0.0.0a0 - pip - pip: - git+https://github.com/python-streamz/streamz.git@master @@ -321,10 +344,10 @@ dependencies: matrices: - matrix: {cuda: "12.*"} packages: &build_python_packages_cu12 - - &rmm_cu12 rmm-cu12==24.8.* + - rmm-cu12==24.8.*,>=0.0.0a0 - matrix: {cuda: "11.*"} packages: &build_python_packages_cu11 - - &rmm_cu11 rmm-cu11==24.8.* + - rmm-cu11==24.8.*,>=0.0.0a0 - {matrix: null, packages: [*rmm_conda] } libarrow_build: common: @@ -568,11 +591,11 @@ dependencies: matrices: - matrix: {cuda: "12.*"} packages: - - rmm-cu12==24.8.* + - rmm-cu12==24.8.*,>=0.0.0a0 - pynvjitlink-cu12 - matrix: {cuda: "11.*"} packages: - - rmm-cu11==24.8.* + - rmm-cu11==24.8.*,>=0.0.0a0 - cubinlinker-cu11 - ptxcompiler-cu11 - {matrix: null, packages: [cubinlinker, ptxcompiler, *rmm_conda]} @@ -585,7 +608,7 @@ dependencies: common: - output_types: [conda, requirements, pyproject] packages: - - rapids-dask-dependency==24.8.* + - rapids-dask-dependency==24.8.*,>=0.0.0a0 run_custreamz: common: - output_types: conda @@ -671,13 +694,13 @@ dependencies: common: - output_types: [conda, requirements, pyproject] packages: - - dask-cuda==24.8.* + - dask-cuda==24.8.*,>=0.0.0a0 - *numba depends_on_cudf: common: - output_types: conda packages: - - &cudf_conda cudf==24.8.* + - &cudf_conda cudf==24.8.*,>=0.0.0a0 - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -689,16 +712,16 @@ dependencies: matrices: - matrix: {cuda: "12.*"} packages: - - cudf-cu12==24.8.* + - cudf-cu12==24.8.*,>=0.0.0a0 - matrix: {cuda: "11.*"} packages: - - cudf-cu11==24.8.* + - cudf-cu11==24.8.*,>=0.0.0a0 - {matrix: null, packages: [*cudf_conda]} depends_on_cudf_kafka: common: - output_types: conda packages: - - &cudf_kafka_conda cudf_kafka==24.8.* + - &cudf_kafka_conda cudf_kafka==24.8.*,>=0.0.0a0 - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -710,10 +733,10 @@ dependencies: matrices: - matrix: {cuda: "12.*"} packages: - - cudf_kafka-cu12==24.8.* + - cudf_kafka-cu12==24.8.*,>=0.0.0a0 - matrix: {cuda: "11.*"} packages: - - cudf_kafka-cu11==24.8.* + - cudf_kafka-cu11==24.8.*,>=0.0.0a0 - {matrix: null, packages: [*cudf_kafka_conda]} depends_on_cupy: common: diff --git a/python/cudf/cudf/_version.py b/python/cudf/cudf/_version.py index ecf6ddd8e3b..7dd732b4905 100644 --- a/python/cudf/cudf/_version.py +++ b/python/cudf/cudf/_version.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -15,6 +15,19 @@ import importlib.resources __version__ = ( - importlib.resources.files("cudf").joinpath("VERSION").read_text().strip() + importlib.resources.files(__package__) + .joinpath("VERSION") + .read_text() + .strip() ) -__git_commit__ = "" +try: + __git_commit__ = ( + importlib.resources.files(__package__) + .joinpath("GIT_COMMIT") + .read_text() + .strip() + ) +except FileNotFoundError: + __git_commit__ = "" + +__all__ = ["__git_commit__", "__version__"] diff --git a/python/cudf/cudf/tests/test_version.py b/python/cudf/cudf/tests/test_version.py new file mode 100644 index 00000000000..8c10cc20a9a --- /dev/null +++ b/python/cudf/cudf/tests/test_version.py @@ -0,0 +1,12 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import cudf + + +def test_version_constants_are_populated(): + # __git_commit__ will only be non-empty in a built distribution + assert isinstance(cudf.__git_commit__, str) + + # __version__ should always be non-empty + assert isinstance(cudf.__version__, str) + assert len(cudf.__version__) > 0 diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index e6517825083..9ad02fed044 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -1,14 +1,9 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION. [build-system] -build-backend = "scikit_build_core.build" +build-backend = "rapids_build_backend.build" requires = [ - "cmake>=3.26.4", - "cython>=3.0.3", - "ninja", - "numpy==1.23.*", - "pyarrow==16.1.0.*", - "rmm==24.8.*", + "rapids-build-backend>=0.3.0,<0.4.0.dev0", "scikit-build-core[pyproject]>=0.7.0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. @@ -36,7 +31,7 @@ dependencies = [ "ptxcompiler", "pyarrow>=16.1.0,<16.2.0a0", "rich", - "rmm==24.8.*", + "rmm==24.8.*,>=0.0.0a0", "typing_extensions>=4.0.0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ @@ -122,6 +117,19 @@ skip = [ "__init__.py", ] +[tool.rapids-build-backend] +build-backend = "scikit_build_core.build" +commit-file = "cudf/GIT_COMMIT" +dependencies-file = "../../dependencies.yaml" +requires = [ + "cmake>=3.26.4", + "cython>=3.0.3", + "ninja", + "numpy==1.23.*", + "pyarrow==16.1.0.*", + "rmm==24.8.*,>=0.0.0a0", +] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. + [tool.scikit-build] build-dir = "build/{wheel_tag}" cmake.build-type = "Release" diff --git a/python/cudf_kafka/cudf_kafka/_version.py b/python/cudf_kafka/cudf_kafka/_version.py index 5adab566da0..7dd732b4905 100644 --- a/python/cudf_kafka/cudf_kafka/_version.py +++ b/python/cudf_kafka/cudf_kafka/_version.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -15,9 +15,19 @@ import importlib.resources __version__ = ( - importlib.resources.files("cudf_kafka") + importlib.resources.files(__package__) .joinpath("VERSION") .read_text() .strip() ) -__git_commit__ = "" +try: + __git_commit__ = ( + importlib.resources.files(__package__) + .joinpath("GIT_COMMIT") + .read_text() + .strip() + ) +except FileNotFoundError: + __git_commit__ = "" + +__all__ = ["__git_commit__", "__version__"] diff --git a/python/cudf_kafka/pyproject.toml b/python/cudf_kafka/pyproject.toml index 9233d0e92dd..1bc04742a73 100644 --- a/python/cudf_kafka/pyproject.toml +++ b/python/cudf_kafka/pyproject.toml @@ -1,13 +1,9 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION. [build-system] -build-backend = "scikit_build_core.build" +build-backend = "rapids_build_backend.build" requires = [ - "cmake>=3.26.4", - "cython>=3.0.3", - "ninja", - "numpy==1.23.*", - "pyarrow==16.1.0.*", + "rapids-build-backend>=0.3.0,<0.4.0.dev0", "scikit-build-core[pyproject]>=0.7.0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. @@ -22,7 +18,7 @@ authors = [ license = { text = "Apache 2.0" } requires-python = ">=3.9" dependencies = [ - "cudf==24.8.*", + "cudf==24.8.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. [project.optional-dependencies] @@ -100,3 +96,15 @@ wheel.packages = ["cudf_kafka"] provider = "scikit_build_core.metadata.regex" input = "cudf_kafka/VERSION" regex = "(?P.*)" + +[tool.rapids-build-backend] +build-backend = "scikit_build_core.build" +commit-file = "cudf_kafka/GIT_COMMIT" +dependencies-file = "../../dependencies.yaml" +requires = [ + "cmake>=3.26.4", + "cython>=3.0.3", + "ninja", + "numpy==1.23.*", + "pyarrow==16.1.0.*", +] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. diff --git a/python/cudf_polars/cudf_polars/_version.py b/python/cudf_polars/cudf_polars/_version.py new file mode 100644 index 00000000000..d906f11cb00 --- /dev/null +++ b/python/cudf_polars/cudf_polars/_version.py @@ -0,0 +1,21 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +import importlib.resources + +__version__ = ( + importlib.resources.files(__package__).joinpath("VERSION").read_text().strip() +) +try: + __git_commit__ = ( + importlib.resources.files(__package__) + .joinpath("GIT_COMMIT") + .read_text() + .strip() + ) +except FileNotFoundError: + __git_commit__ = "" + +__all__ = ["__git_commit__", "__version__"] diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml index 00fde6c0e05..86b0ad414fd 100644 --- a/python/cudf_polars/pyproject.toml +++ b/python/cudf_polars/pyproject.toml @@ -1,8 +1,9 @@ # Copyright (c) 2024, NVIDIA CORPORATION. [build-system] -build-backend = "setuptools.build_meta" +build-backend = "rapids_build_backend.build" requires = [ + "rapids-build-backend>=0.3.0,<0.4.0.dev0", "setuptools", "wheel", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
@@ -18,7 +19,7 @@ authors = [ license = { text = "Apache 2.0" } requires-python = ">=3.9" dependencies = [ - "cudf==24.8.*", + "cudf==24.8.*,>=0.0.0a0", "polars>=0.20.24", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ @@ -169,3 +170,8 @@ rapids = ["rmm", "cudf"] [tool.ruff.format] docstring-code-format = true + +[tool.rapids-build-backend] +build-backend = "setuptools.build_meta" +commit-file = "cudf_polars/GIT_COMMIT" +dependencies-file = "../../dependencies.yaml" diff --git a/python/custreamz/custreamz/_version.py b/python/custreamz/custreamz/_version.py index 0f545f95f2b..7dd732b4905 100644 --- a/python/custreamz/custreamz/_version.py +++ b/python/custreamz/custreamz/_version.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,9 +15,19 @@ import importlib.resources __version__ = ( - importlib.resources.files("custreamz") + importlib.resources.files(__package__) .joinpath("VERSION") .read_text() .strip() ) -__git_commit__ = "" +try: + __git_commit__ = ( + importlib.resources.files(__package__) + .joinpath("GIT_COMMIT") + .read_text() + .strip() + ) +except FileNotFoundError: + __git_commit__ = "" + +__all__ = ["__git_commit__", "__version__"] diff --git a/python/custreamz/custreamz/tests/test_version.py b/python/custreamz/custreamz/tests/test_version.py new file mode 100644 index 00000000000..cda2dd92155 --- /dev/null +++ b/python/custreamz/custreamz/tests/test_version.py @@ -0,0 +1,12 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import custreamz + + +def test_version_constants_are_populated(): + # __git_commit__ will only be non-empty in a built distribution + assert isinstance(custreamz.__git_commit__, str) + + # __version__ should always be non-empty + assert isinstance(custreamz.__version__, str) + assert len(custreamz.__version__) > 0 diff --git a/python/custreamz/pyproject.toml b/python/custreamz/pyproject.toml index f7e5698900a..e004a8f5219 100644 --- a/python/custreamz/pyproject.toml +++ b/python/custreamz/pyproject.toml @@ -1,8 +1,9 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION. [build-system] -build-backend = "setuptools.build_meta" +build-backend = "rapids_build_backend.build" requires = [ + "rapids-build-backend>=0.3.0,<0.4.0.dev0", "setuptools", "wheel", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. @@ -19,8 +20,8 @@ license = { text = "Apache 2.0" } requires-python = ">=3.9" dependencies = [ "confluent-kafka>=1.9.0,<1.10.0a0", - "cudf==24.8.*", - "cudf_kafka==24.8.*", + "cudf==24.8.*,>=0.0.0a0", + "cudf_kafka==24.8.*,>=0.0.0a0", "streamz", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
classifiers = [ @@ -45,6 +46,11 @@ test = [ [project.urls] Homepage = "https://github.com/rapidsai/cudf" +[tool.rapids-build-backend] +build-backend = "setuptools.build_meta" +commit-file = "custreamz/GIT_COMMIT" +dependencies-file = "../../dependencies.yaml" + [tool.setuptools] license-files = ["LICENSE"] zip-safe = false diff --git a/python/dask_cudf/dask_cudf/_version.py b/python/dask_cudf/dask_cudf/_version.py index 0dd62854a4e..7dd732b4905 100644 --- a/python/dask_cudf/dask_cudf/_version.py +++ b/python/dask_cudf/dask_cudf/_version.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,9 +15,19 @@ import importlib.resources __version__ = ( - importlib.resources.files("dask_cudf") + importlib.resources.files(__package__) .joinpath("VERSION") .read_text() .strip() ) -__git_commit__ = "" +try: + __git_commit__ = ( + importlib.resources.files(__package__) + .joinpath("GIT_COMMIT") + .read_text() + .strip() + ) +except FileNotFoundError: + __git_commit__ = "" + +__all__ = ["__git_commit__", "__version__"] diff --git a/python/dask_cudf/dask_cudf/tests/test_version.py b/python/dask_cudf/dask_cudf/tests/test_version.py new file mode 100644 index 00000000000..e2724e530ba --- /dev/null +++ b/python/dask_cudf/dask_cudf/tests/test_version.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + + +import dask_cudf + + +def test_version_constants_are_populated(): + # __git_commit__ will only be non-empty in a built distribution + assert isinstance(dask_cudf.__git_commit__, str) + + # __version__ should always be non-empty + assert isinstance(dask_cudf.__version__, str) + assert len(dask_cudf.__version__) > 0 diff --git a/python/dask_cudf/pyproject.toml b/python/dask_cudf/pyproject.toml index e353eac06b9..6b5d5ccc412 100644 --- a/python/dask_cudf/pyproject.toml +++ b/python/dask_cudf/pyproject.toml @@ -1,8 +1,9 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION. [build-system] -build-backend = "setuptools.build_meta" +build-backend = "rapids_build_backend.build" requires = [ + "rapids-build-backend>=0.3.0,<0.4.0.dev0", "setuptools", "wheel", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
classifiers = [ "Intended Audience :: Developers", @@ -44,7 +45,7 @@ cudf = "dask_cudf.backends:CudfDXBackendEntrypoint" [project.optional-dependencies] test = [ - "dask-cuda==24.8.*", + "dask-cuda==24.8.*,>=0.0.0a0", "numba>=0.57", "pytest-cov", "pytest-xdist", @@ -54,6 +55,11 @@ test = [ [project.urls] Homepage = "https://github.com/rapidsai/cudf" +[tool.rapids-build-backend] +build-backend = "setuptools.build_meta" +commit-file = "dask_cudf/GIT_COMMIT" +dependencies-file = "../../dependencies.yaml" + [tool.setuptools] license-files = ["LICENSE"] From 5ce95f05eeae469f4d46516b3cf6fe19902623f6 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Thu, 30 May 2024 09:24:58 -0400 Subject: [PATCH 019/340] Update interleave lists column for large strings (#15877) Fixes the `compute_string_sizes_and_interleave_lists_fn` functor to use the `column_device_view::element()` method to access string row contents instead of using the strings offsets. This removes the need to add dedicated offsetalator logic to this function. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Muhammad Haseeb (https://github.com/mhaseeb123) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/15877 --- cpp/src/lists/interleave_columns.cu | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/cpp/src/lists/interleave_columns.cu b/cpp/src/lists/interleave_columns.cu index be8fad62412..45ae3671d4e 100644 --- a/cpp/src/lists/interleave_columns.cu +++ b/cpp/src/lists/interleave_columns.cu @@ -166,8 +166,6 @@ struct compute_string_sizes_and_interleave_lists_fn { lists_col.child(lists_column_view::offsets_column_index).template data<size_type>() + lists_col.offset(); auto const& str_col = lists_col.child(lists_column_view::child_column_index); - auto const str_offsets = - str_col.child(strings_column_view::offsets_column_index).template data<size_type>(); // The range of indices of the strings within the source list. auto const start_str_idx = list_offsets[list_id]; @@ -181,13 +179,13 @@ struct compute_string_sizes_and_interleave_lists_fn { size_type write_idx = dst_list_offsets[idx]; for (auto read_idx = start_str_idx; read_idx < end_str_idx; ++read_idx, ++write_idx) { - auto const offset = str_offsets[read_idx]; - auto const size = str_offsets[read_idx + 1] - offset; - string_index_pair result = {nullptr, size}; - if (str_col.is_valid(read_idx)) { - result.first = size > 0 ? str_col.template head<char>() + offset : ""; + if (str_col.is_null(read_idx)) { + indices[write_idx] = string_index_pair{nullptr, 0}; + continue; } - indices[write_idx] = result; + auto const d_str = str_col.element<string_view>(read_idx); + indices[write_idx] = d_str.empty() ? string_index_pair{"", 0} + : string_index_pair{d_str.data(), d_str.size_bytes()}; } } }; From 3e9cff2e3ee4f744bcbf80c6f7ad3e5ebcdf94f7 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Thu, 30 May 2024 09:33:06 -0400 Subject: [PATCH 020/340] Change thrust::count_if call to raw kernel in strings split APIs (#15762) Fixes calls to `thrust::count_if` in strings split APIs to better handle large strings.
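For reference, the replacement pattern is: each thread accumulates a private count, a block-wide CUB reduction combines the per-thread counts, and a single 64-bit atomic per block folds the result into the global total. A minimal standalone sketch of that pattern follows; the names, the '|' delimiter, and the loop shape are illustrative, not the patch code.

```cpp
#include <cub/block/block_reduce.cuh>

// Illustrative sketch only: count delimiter bytes with one block-wide
// reduction plus a single atomic per block, keeping the running total in
// 64 bits so inputs larger than 2^31 bytes do not overflow the count.
template <int block_size>
__global__ void count_delimiters(char const* data, long long size, unsigned long long* d_total)
{
  long long thread_count = 0;
  // grid-stride loop: each thread inspects many bytes
  for (long long i = blockIdx.x * static_cast<long long>(blockDim.x) + threadIdx.x; i < size;
       i += static_cast<long long>(blockDim.x) * gridDim.x) {
    thread_count += (data[i] == '|');
  }
  using block_reduce = cub::BlockReduce<long long, block_size>;
  __shared__ typename block_reduce::TempStorage temp_storage;
  auto const block_total = block_reduce(temp_storage).Sum(thread_count);
  // one atomic per block instead of one per thread
  if (threadIdx.x == 0 && block_total > 0) {
    atomicAdd(d_total, static_cast<unsigned long long>(block_total));
  }
}
```

Launched as, e.g., `count_delimiters<256><<<num_blocks, 256>>>(data, size, d_total)`, this avoids both the per-element atomic traffic and the 32-bit count that a naive `thrust::count_if` over `size_type` indices would imply.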
Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Karthikeyan (https://github.com/karthikeyann) - Paul Mattione (https://github.com/pmattione-nvidia) URL: https://github.com/rapidsai/cudf/pull/15762 --- cpp/src/strings/split/split.cu | 1 + cpp/src/strings/split/split.cuh | 59 +++++++++++++++---- cpp/tests/CMakeLists.txt | 1 + .../large_strings/split_strings_tests.cpp | 53 +++++++++++++++++ 4 files changed, 103 insertions(+), 11 deletions(-) create mode 100644 cpp/tests/large_strings/split_strings_tests.cpp diff --git a/cpp/src/strings/split/split.cu b/cpp/src/strings/split/split.cu index 2c6a0b2cf22..bc01a46ca6d 100644 --- a/cpp/src/strings/split/split.cu +++ b/cpp/src/strings/split/split.cu @@ -34,6 +34,7 @@ #include #include +#include #include #include #include diff --git a/cpp/src/strings/split/split.cuh b/cpp/src/strings/split/split.cuh index 69a11aabfcd..ae3c0b3aa12 100644 --- a/cpp/src/strings/split/split.cuh +++ b/cpp/src/strings/split/split.cuh @@ -30,12 +30,9 @@ #include #include -#include #include -#include #include #include -#include #include namespace cudf::strings::detail { @@ -297,6 +294,44 @@ std::unique_ptr create_offsets_from_positions(strings_column_view const& rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); +/** + * @brief Count the number of delimiters in a strings column + * + * @tparam Tokenizer Functor containing `is_delimiter` function + * @tparam block_size Number of threads per block + * @tparam bytes_per_thread Number of bytes processed per thread + * + * @param tokenizer For checking delimiters + * @param d_offsets Offsets for the strings column + * @param chars_bytes Number of bytes in the strings column + * @param d_output Result of the count + */ +template +CUDF_KERNEL void count_delimiters_kernel(Tokenizer tokenizer, + cudf::detail::input_offsetalator d_offsets, + int64_t chars_bytes, + int64_t* d_output) +{ + auto const idx = cudf::detail::grid_1d::global_thread_id(); + auto const byte_idx = static_cast(idx) * bytes_per_thread; + auto const lane_idx = static_cast(threadIdx.x); + + using block_reduce = cub::BlockReduce; + __shared__ typename block_reduce::TempStorage temp_storage; + + int64_t count = 0; + // each thread processes multiple bytes + for (auto i = byte_idx; (i < (byte_idx + bytes_per_thread)) && (i < chars_bytes); ++i) { + count += tokenizer.is_delimiter(i, d_offsets, chars_bytes); + } + auto const total = block_reduce(temp_storage).Reduce(count, cub::Sum()); + + if ((lane_idx == 0) && (total > 0)) { + cuda::atomic_ref ref{*d_output}; + ref.fetch_add(total, cuda::std::memory_order_relaxed); + } +} + /** * @brief Helper function used by split/rsplit and split_record/rsplit_record * @@ -326,17 +361,19 @@ std::pair, rmm::device_uvector> split cudf::detail::offsetalator_factory::make_input_iterator(input.offsets(), input.offset()); // count the number of delimiters in the entire column - auto const delimiter_count = - thrust::count_if(rmm::exec_policy(stream), - thrust::counting_iterator(0), - thrust::counting_iterator(chars_bytes), - [tokenizer, d_offsets, chars_bytes] __device__(int64_t idx) { - return tokenizer.is_delimiter(idx, d_offsets, chars_bytes); - }); + rmm::device_scalar d_count(0, stream); + constexpr int64_t block_size = 512; + constexpr size_type bytes_per_thread = 4; + auto const num_blocks = util::div_rounding_up_safe( + util::div_rounding_up_safe(chars_bytes, static_cast(bytes_per_thread)), block_size); + count_delimiters_kernel + <<>>( + tokenizer, d_offsets, chars_bytes, d_count.data()); + // 
Create a vector of every delimiter position in the chars column. // These may include overlapping or otherwise out-of-bounds delimiters which // will be resolved during token processing. - auto delimiter_positions = rmm::device_uvector(delimiter_count, stream); + auto delimiter_positions = rmm::device_uvector(d_count.value(stream), stream); auto d_positions = delimiter_positions.data(); cudf::detail::copy_if_safe( thrust::counting_iterator(0), diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index c6ab8aa021a..2f2c12f265c 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -575,6 +575,7 @@ ConfigureTest( large_strings/merge_tests.cpp large_strings/parquet_tests.cpp large_strings/reshape_tests.cpp + large_strings/split_strings_tests.cpp GPUS 1 PERCENT 100 ) diff --git a/cpp/tests/large_strings/split_strings_tests.cpp b/cpp/tests/large_strings/split_strings_tests.cpp new file mode 100644 index 00000000000..320fb222241 --- /dev/null +++ b/cpp/tests/large_strings/split_strings_tests.cpp @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "large_strings_fixture.hpp" + +#include + +#include +#include +#include +#include +#include +#include + +#include + +struct StringsSplitTest : public cudf::test::StringsLargeTest {}; + +TEST_F(StringsSplitTest, Split) +{ + auto const expected = this->long_column(); + auto const view = cudf::column_view(expected); + auto const multiplier = 10; + auto const separator = cudf::string_scalar("|"); + auto const input = cudf::strings::concatenate( + cudf::table_view(std::vector(multiplier, view)), separator); + + { + auto result = cudf::strings::split(cudf::strings_column_view(input->view()), separator); + for (auto c : result->view()) { + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(c, expected); + } + } + + auto lc = cudf::strings::split_record(cudf::strings_column_view(input->view()), separator); + auto lv = cudf::lists_column_view(lc->view()); + auto sv = cudf::strings_column_view(lv.child()); + EXPECT_EQ(sv.size(), view.size() * multiplier); + EXPECT_EQ(sv.offsets().type(), cudf::data_type{cudf::type_id::INT64}); +} From e95894fc305a2833374933ecbce07be997d4c545 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Thu, 30 May 2024 15:31:20 +0100 Subject: [PATCH 021/340] Executor for polars logical plans (#15504) This builds out the infrastructure for executing polars logical plans using pylibcudf. See `docs/overview.md` in the `cudf_polars` subdirectory for some installation guidance. Deliberately not fully fleshing out packaging and so forth yet. Test coverage is incomplete but growing. I'd like to get this in so other people can build on top of it. 
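For orientation, intended usage looks roughly like the sketch below. This assumes the targeted polars version exposes a `post_opt_callback` keyword on `collect` (see `docs/overview.md` in this PR for the authoritative instructions); the file name and query are made up.

```python
# Hypothetical usage sketch; see docs/overview.md for the real instructions.
from functools import partial

import polars as pl

from cudf_polars.callback import execute_with_cudf

q = pl.scan_csv("data.csv").group_by("key").agg(pl.col("value").sum())

# If the plan can be translated, execution happens on the GPU via pylibcudf;
# with raise_on_fail=False a failed translation silently falls back to polars.
result = q.collect(post_opt_callback=partial(execute_with_cudf, raise_on_fail=True))
```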
Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/15504 --- dependencies.yaml | 2 +- python/cudf/cudf/_lib/pylibcudf/types.pyx | 3 + python/cudf_polars/cudf_polars/callback.py | 56 + .../cudf_polars/containers/__init__.py | 12 + .../cudf_polars/containers/column.py | 119 ++ .../cudf_polars/containers/dataframe.py | 223 ++++ .../cudf_polars/containers/scalar.py | 23 + .../cudf_polars/cudf_polars/dsl/__init__.py | 8 + python/cudf_polars/cudf_polars/dsl/expr.py | 1038 +++++++++++++++++ python/cudf_polars/cudf_polars/dsl/ir.py | 879 ++++++++++++++ .../cudf_polars/cudf_polars/dsl/translate.py | 403 +++++++ .../cudf_polars/testing/__init__.py | 8 + .../cudf_polars/testing/asserts.py | 76 ++ .../cudf_polars/cudf_polars/utils/__init__.py | 8 + .../cudf_polars/cudf_polars/utils/dtypes.py | 89 ++ .../cudf_polars/cudf_polars/utils/sorting.py | 49 + python/cudf_polars/docs/overview.md | 174 +++ python/cudf_polars/pyproject.toml | 10 +- .../cudf_polars/tests/expressions/test_agg.py | 63 + .../tests/expressions/test_filter.py | 20 + .../tests/expressions/test_gather.py | 19 + .../tests/expressions/test_numeric_binops.py | 106 ++ python/cudf_polars/tests/test_distinct.py | 30 + python/cudf_polars/tests/test_extcontext.py | 23 + python/cudf_polars/tests/test_groupby.py | 78 ++ python/cudf_polars/tests/test_hconcat.py | 19 + python/cudf_polars/tests/test_hstack.py | 32 + python/cudf_polars/tests/test_join.py | 57 + python/cudf_polars/tests/test_scan.py | 98 ++ python/cudf_polars/tests/test_select.py | 38 + python/cudf_polars/tests/test_slice.py | 34 + python/cudf_polars/tests/test_sort.py | 42 + python/cudf_polars/tests/test_union.py | 37 + 33 files changed, 3874 insertions(+), 2 deletions(-) create mode 100644 python/cudf_polars/cudf_polars/callback.py create mode 100644 python/cudf_polars/cudf_polars/containers/__init__.py create mode 100644 python/cudf_polars/cudf_polars/containers/column.py create mode 100644 python/cudf_polars/cudf_polars/containers/dataframe.py create mode 100644 python/cudf_polars/cudf_polars/containers/scalar.py create mode 100644 python/cudf_polars/cudf_polars/dsl/__init__.py create mode 100644 python/cudf_polars/cudf_polars/dsl/expr.py create mode 100644 python/cudf_polars/cudf_polars/dsl/ir.py create mode 100644 python/cudf_polars/cudf_polars/dsl/translate.py create mode 100644 python/cudf_polars/cudf_polars/testing/__init__.py create mode 100644 python/cudf_polars/cudf_polars/testing/asserts.py create mode 100644 python/cudf_polars/cudf_polars/utils/__init__.py create mode 100644 python/cudf_polars/cudf_polars/utils/dtypes.py create mode 100644 python/cudf_polars/cudf_polars/utils/sorting.py create mode 100644 python/cudf_polars/docs/overview.md create mode 100644 python/cudf_polars/tests/expressions/test_agg.py create mode 100644 python/cudf_polars/tests/expressions/test_filter.py create mode 100644 python/cudf_polars/tests/expressions/test_gather.py create mode 100644 python/cudf_polars/tests/expressions/test_numeric_binops.py create mode 100644 python/cudf_polars/tests/test_distinct.py create mode 100644 python/cudf_polars/tests/test_extcontext.py create mode 100644 python/cudf_polars/tests/test_groupby.py create mode 100644 python/cudf_polars/tests/test_hconcat.py create mode 100644 python/cudf_polars/tests/test_hstack.py create mode 100644 python/cudf_polars/tests/test_join.py create mode 100644 python/cudf_polars/tests/test_scan.py create mode 100644 
python/cudf_polars/tests/test_select.py create mode 100644 python/cudf_polars/tests/test_slice.py create mode 100644 python/cudf_polars/tests/test_sort.py create mode 100644 python/cudf_polars/tests/test_union.py diff --git a/dependencies.yaml b/dependencies.yaml index 8bfa3190b3d..38ec30a8033 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -603,7 +603,7 @@ dependencies: common: - output_types: [conda, requirements, pyproject] packages: - - polars>=0.20.24 + - polars>=0.20.30 run_dask_cudf: common: - output_types: [conda, requirements, pyproject] diff --git a/python/cudf/cudf/_lib/pylibcudf/types.pyx b/python/cudf/cudf/_lib/pylibcudf/types.pyx index de10196e289..a5248ad0a1f 100644 --- a/python/cudf/cudf/_lib/pylibcudf/types.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/types.pyx @@ -51,6 +51,9 @@ cdef class DataType: self.c_obj == (other).c_obj ) + def __hash__(self): + return hash((self.c_obj.id(), self.c_obj.scale())) + @staticmethod cdef DataType from_libcudf(data_type dt): """Create a DataType from a libcudf data_type. diff --git a/python/cudf_polars/cudf_polars/callback.py b/python/cudf_polars/cudf_polars/callback.py new file mode 100644 index 00000000000..aabb8498ce2 --- /dev/null +++ b/python/cudf_polars/cudf_polars/callback.py @@ -0,0 +1,56 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 + +"""Callback for the polars collect function to execute on device.""" + +from __future__ import annotations + +from functools import partial +from typing import TYPE_CHECKING + +import nvtx + +from cudf_polars.dsl.translate import translate_ir + +if TYPE_CHECKING: + import polars as pl + + from cudf_polars.dsl.ir import IR + +__all__: list[str] = ["execute_with_cudf"] + + +def _callback( + ir: IR, + with_columns: list[str] | None, + pyarrow_predicate: str | None, + n_rows: int | None, +) -> pl.DataFrame: + assert with_columns is None + assert pyarrow_predicate is None + assert n_rows is None + with nvtx.annotate(message="ExecuteIR", domain="cudf_polars"): + return ir.evaluate(cache={}).to_polars() + + +def execute_with_cudf(nt, *, raise_on_fail: bool = False) -> None: + """ + A post optimization callback that attempts to execute the plan with cudf. + + Parameters + ---------- + nt + NodeTraverser + + raise_on_fail + Should conversion raise an exception rather than continuing + without setting a callback. + + The NodeTraverser is mutated if the libcudf executor can handle the plan. + """ + try: + with nvtx.annotate(message="ConvertIR", domain="cudf_polars"): + nt.set_udf(partial(_callback, translate_ir(nt))) + except NotImplementedError: + if raise_on_fail: + raise diff --git a/python/cudf_polars/cudf_polars/containers/__init__.py b/python/cudf_polars/cudf_polars/containers/__init__.py new file mode 100644 index 00000000000..ef9d9ca61b6 --- /dev/null +++ b/python/cudf_polars/cudf_polars/containers/__init__.py @@ -0,0 +1,12 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0 + +"""Containers of concrete data.""" + +from __future__ import annotations + +__all__: list[str] = ["DataFrame", "Column", "Scalar"] + +from cudf_polars.containers.column import Column +from cudf_polars.containers.dataframe import DataFrame +from cudf_polars.containers.scalar import Scalar diff --git a/python/cudf_polars/cudf_polars/containers/column.py b/python/cudf_polars/cudf_polars/containers/column.py new file mode 100644 index 00000000000..49034b5f5c8 --- /dev/null +++ b/python/cudf_polars/cudf_polars/containers/column.py @@ -0,0 +1,119 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 + +"""A column, with some properties.""" + +from __future__ import annotations + +import functools +from typing import TYPE_CHECKING + +import cudf._lib.pylibcudf as plc + +if TYPE_CHECKING: + from typing_extensions import Self + +__all__: list[str] = ["Column"] + + +class Column: + """A column, a name, and sortedness.""" + + obj: plc.Column + name: str + is_sorted: plc.types.Sorted + order: plc.types.Order + null_order: plc.types.NullOrder + + def __init__(self, column: plc.Column, name: str): + self.obj = column + self.name = name + self.is_sorted = plc.types.Sorted.NO + self.order = plc.types.Order.ASCENDING + self.null_order = plc.types.NullOrder.BEFORE + + def sorted_like(self, like: Column, /) -> Self: + """ + Copy sortedness properties from a column onto self. + + Parameters + ---------- + like + The column to copy sortedness metadata from. + + Returns + ------- + Self with metadata set. + + See Also + -------- + set_sorted + """ + return self.set_sorted( + is_sorted=like.is_sorted, order=like.order, null_order=like.null_order + ) + + def set_sorted( + self, + *, + is_sorted: plc.types.Sorted, + order: plc.types.Order, + null_order: plc.types.NullOrder, + ) -> Self: + """ + Modify sortedness metadata in place. + + Parameters + ---------- + is_sorted + Is the column sorted + order + The order if sorted + null_order + Where nulls sort, if sorted + + Returns + ------- + Self with metadata set. + """ + self.is_sorted = is_sorted + self.order = order + self.null_order = null_order + return self + + def copy(self, *, new_name: str | None = None) -> Self: + """ + Return a shallow copy of the column. + + Parameters + ---------- + new_name + Optional new name for the copied column. + + Returns + ------- + New column sharing data with self. + """ + return type(self)( + self.obj, self.name if new_name is None else new_name + ).sorted_like(self) + + def mask_nans(self) -> Self: + """Return a copy of self with nans masked out.""" + if self.nan_count > 0: + raise NotImplementedError + return self.copy() + + @functools.cached_property + def nan_count(self) -> int: + """Return the number of NaN values in the column.""" + if self.obj.type().id() not in (plc.TypeId.FLOAT32, plc.TypeId.FLOAT64): + return 0 + return plc.interop.to_arrow( + plc.reduce.reduce( + plc.unary.is_nan(self.obj), + plc.aggregation.sum(), + # TODO: pylibcudf needs to have a SizeType DataType singleton + plc.DataType(plc.TypeId.INT32), + ) + ).as_py() diff --git a/python/cudf_polars/cudf_polars/containers/dataframe.py b/python/cudf_polars/cudf_polars/containers/dataframe.py new file mode 100644 index 00000000000..de21a280020 --- /dev/null +++ b/python/cudf_polars/cudf_polars/containers/dataframe.py @@ -0,0 +1,223 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0 + +"""A dataframe, with some properties.""" + +from __future__ import annotations + +from functools import cached_property +from typing import TYPE_CHECKING + +import polars as pl + +import cudf._lib.pylibcudf as plc + +from cudf_polars.containers.column import Column + +if TYPE_CHECKING: + from collections.abc import Mapping, Sequence, Set + + from typing_extensions import Self + + import cudf + + from cudf_polars.containers.scalar import Scalar + + +__all__: list[str] = ["DataFrame"] + + +class DataFrame: + """A representation of a dataframe.""" + + columns: list[Column] + scalars: list[Scalar] + table: plc.Table | None + + def __init__(self, columns: Sequence[Column], scalars: Sequence[Scalar]) -> None: + self.columns = list(columns) + self._column_map = {c.name: c for c in self.columns} + self.scalars = list(scalars) + if len(scalars) == 0: + self.table = plc.Table([c.obj for c in columns]) + else: + self.table = None + + def copy(self) -> Self: + """Return a shallow copy of self.""" + return type(self)(self.columns, self.scalars) + + def to_polars(self) -> pl.DataFrame: + """Convert to a polars DataFrame.""" + assert len(self.scalars) == 0 + return pl.from_arrow( + plc.interop.to_arrow( + self.table, + [plc.interop.ColumnMetadata(name=c.name) for c in self.columns], + ) + ) + + @cached_property + def column_names_set(self) -> frozenset[str]: + """Return the column names as a set.""" + return frozenset(c.name for c in self.columns) + + @cached_property + def column_names(self) -> list[str]: + """Return a list of the column names.""" + return [c.name for c in self.columns] + + @cached_property + def num_columns(self) -> int: + """Number of columns.""" + return len(self.columns) + + @cached_property + def num_rows(self) -> int: + """Number of rows.""" + if self.table is None: + raise ValueError("Number of rows of frame with scalars makes no sense") + return self.table.num_rows() + + @classmethod + def from_cudf(cls, df: cudf.DataFrame) -> Self: + """Create from a cudf dataframe.""" + return cls( + [Column(c.to_pylibcudf(mode="read"), name) for name, c in df._data.items()], + [], + ) + + @classmethod + def from_table(cls, table: plc.Table, names: Sequence[str]) -> Self: + """ + Create from a pylibcudf table. + + Parameters + ---------- + table + Pylibcudf table to obtain columns from + names + Names for the columns + + Returns + ------- + New dataframe sharing data with the input table. + + Raises + ------ + ValueError if the number of provided names does not match the + number of columns in the table. + """ + # TODO: strict=True when we drop py39 + if table.num_columns() != len(names): + raise ValueError("Mismatching name and table length.") + return cls([Column(c, name) for c, name in zip(table.columns(), names)], []) + + def sorted_like( + self, like: DataFrame, /, *, subset: Set[str] | None = None + ) -> Self: + """ + Copy sortedness from a dataframe onto self. + + Parameters + ---------- + like + The dataframe to copy from + subset + Optional subset of columns from which to copy data. + + Returns + ------- + Self with metadata set. + + Raises + ------ + ValueError if there is a name mismatch between self and like. 
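+
+        Examples
+        --------
+        A minimal sketch (assuming ``like`` is an identically named
+        frame that was produced by a prior sort on column "a"):
+
+        >>> df = df.sorted_like(like, subset={"a"})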
+ """ + if like.column_names != self.column_names: + raise ValueError("Can only copy from identically named frame") + subset = self.column_names_set if subset is None else subset + self.columns = [ + c.sorted_like(other) if c.name in subset else c + for c, other in zip(self.columns, like.columns) + ] + return self + + def with_columns(self, columns: Sequence[Column]) -> Self: + """ + Return a new dataframe with extra columns. + + Parameters + ---------- + columns + Columns to add + + Returns + ------- + New dataframe + + Notes + ----- + If column names overlap, newer names replace older ones. + """ + return type(self)([*self.columns, *columns], self.scalars) + + def discard_columns(self, names: Set[str]) -> Self: + """Drop columns by name.""" + return type(self)( + [c for c in self.columns if c.name not in names], self.scalars + ) + + def select(self, names: Sequence[str]) -> Self: + """Select columns by name returning DataFrame.""" + want = set(names) + if not want.issubset(self.column_names_set): + raise ValueError("Can't select missing names") + return type(self)([self._column_map[name] for name in names], self.scalars) + + def replace_columns(self, *columns: Column) -> Self: + """Return a new dataframe with columns replaced by name.""" + new = {c.name: c for c in columns} + if not set(new).issubset(self.column_names_set): + raise ValueError("Cannot replace with non-existing names") + return type(self)([new.get(c.name, c) for c in self.columns], self.scalars) + + def rename_columns(self, mapping: Mapping[str, str]) -> Self: + """Rename some columns.""" + return type(self)( + [c.copy(new_name=mapping.get(c.name)) for c in self.columns], self.scalars + ) + + def select_columns(self, names: Set[str]) -> list[Column]: + """Select columns by name.""" + return [c for c in self.columns if c.name in names] + + def filter(self, mask: Column) -> Self: + """Return a filtered table given a mask.""" + table = plc.stream_compaction.apply_boolean_mask(self.table, mask.obj) + return type(self).from_table(table, self.column_names).sorted_like(self) + + def slice(self, zlice: tuple[int, int] | None) -> Self: + """ + Slice a dataframe. + + Parameters + ---------- + zlice + optional, tuple of start and length, negative values of start + treated as for python indexing. If not provided, returns self. + + Returns + ------- + New dataframe (if zlice is not None) other self (if it is) + """ + if zlice is None: + return self + start, length = zlice + if start < 0: + start += self.num_rows + # Polars slice takes an arbitrary positive integer and slice + # to the end of the frame if it is larger. + end = min(start + length, self.num_rows) + (table,) = plc.copying.slice(self.table, [start, end]) + return type(self).from_table(table, self.column_names).sorted_like(self) diff --git a/python/cudf_polars/cudf_polars/containers/scalar.py b/python/cudf_polars/cudf_polars/containers/scalar.py new file mode 100644 index 00000000000..fc97d0fd9c2 --- /dev/null +++ b/python/cudf_polars/cudf_polars/containers/scalar.py @@ -0,0 +1,23 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0 + +"""A scalar, with some properties.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + import cudf._lib.pylibcudf as plc + +__all__: list[str] = ["Scalar"] + + +class Scalar: + """A scalar, and a name.""" + + __slots__ = ("obj", "name") + obj: plc.Scalar + + def __init__(self, scalar: plc.Scalar): + self.obj = scalar diff --git a/python/cudf_polars/cudf_polars/dsl/__init__.py b/python/cudf_polars/cudf_polars/dsl/__init__.py new file mode 100644 index 00000000000..804c5ada566 --- /dev/null +++ b/python/cudf_polars/cudf_polars/dsl/__init__.py @@ -0,0 +1,8 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 + +"""The domain-specific language (DSL) for the polars executor.""" + +from __future__ import annotations + +__all__: list[str] = [] diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py new file mode 100644 index 00000000000..249cc3775f7 --- /dev/null +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -0,0 +1,1038 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 +# TODO: remove need for this +# ruff: noqa: D101 +""" +DSL nodes for the polars expression language. + +An expression node is a function, `DataFrame -> Column` or `DataFrame -> Scalar`. + +The evaluation context is provided by a LogicalPlan node, and can +affect the evaluation rule as well as providing the dataframe input. +In particular, the interpretation of the expression language in a +`GroupBy` node is groupwise, rather than whole frame. +""" + +from __future__ import annotations + +import enum +from enum import IntEnum +from functools import partial, reduce +from typing import TYPE_CHECKING, Any, ClassVar, NamedTuple + +import pyarrow as pa + +from polars.polars import _expr_nodes as pl_expr + +import cudf._lib.pylibcudf as plc + +from cudf_polars.containers import Column, Scalar +from cudf_polars.utils import sorting + +if TYPE_CHECKING: + from collections.abc import Sequence + + import polars.type_aliases as pl_types + + from cudf_polars.containers import DataFrame + +__all__ = [ + "Expr", + "NamedExpr", + "Literal", + "Col", + "BooleanFunction", + "StringFunction", + "Sort", + "SortBy", + "Gather", + "Filter", + "RollingWindow", + "GroupedRollingWindow", + "Cast", + "Agg", + "BinOp", +] + + +class ExecutionContext(IntEnum): + FRAME = enum.auto() + GROUPBY = enum.auto() + ROLLING = enum.auto() + + +class AggInfo(NamedTuple): + requests: list[tuple[Expr | None, plc.aggregation.Aggregation, Expr]] + + +class Expr: + """ + An abstract expression object. + + This contains a (potentially empty) tuple of child expressions, + along with non-child data. For uniform reconstruction and + implementation of hashing and equality schemes, child classes need + to provide a certain amount of metadata when they are defined. + Specifically, the ``_non_child`` attribute must list, in-order, + the names of the slots that are passed to the constructor. The + constructor must take arguments in the order ``(*_non_child, + *children).`` + """ + + __slots__ = ("dtype", "_hash_value", "_repr_value") + dtype: plc.DataType + """Data type of the expression.""" + _hash_value: int + """Caching slot for the hash of the expression.""" + _repr_value: str + """Caching slot for repr of the expression.""" + children: tuple[Expr, ...] 
= ()
+    """Children of the expression."""
+    _non_child: ClassVar[tuple[str, ...]] = ("dtype",)
+    """Names of non-child data (not Exprs) for reconstruction."""
+
+    # Constructor must take arguments in order (*_non_child, *children)
+    def __init__(self, dtype: plc.DataType) -> None:
+        self.dtype = dtype
+
+    def _ctor_arguments(self, children: Sequence[Expr]) -> Sequence:
+        return (*(getattr(self, attr) for attr in self._non_child), *children)
+
+    def get_hash(self) -> int:
+        """
+        Return the hash of this expr.
+
+        Override this in subclasses, rather than __hash__.
+
+        Returns
+        -------
+        The integer hash value.
+        """
+        return hash((type(self), self._ctor_arguments(self.children)))
+
+    def __hash__(self):
+        """Hash of an expression with caching."""
+        try:
+            return self._hash_value
+        except AttributeError:
+            self._hash_value = self.get_hash()
+            return self._hash_value
+
+    def is_equal(self, other: Any) -> bool:
+        """
+        Equality of two expressions.
+
+        Override this in subclasses, rather than __eq__.
+
+        Parameters
+        ----------
+        other
+            object to compare to
+
+        Returns
+        -------
+        True if the two expressions are equal, false otherwise.
+        """
+        if type(self) is not type(other):
+            return False
+        return self._ctor_arguments(self.children) == other._ctor_arguments(
+            other.children
+        )
+
+    def __eq__(self, other):
+        """Equality of expressions."""
+        if type(self) is not type(other) or hash(self) != hash(other):
+            return False
+        else:
+            return self.is_equal(other)
+
+    def __ne__(self, other):
+        """Inequality of expressions."""
+        return not self.__eq__(other)
+
+    def __repr__(self):
+        """String representation of an expression with caching."""
+        try:
+            return self._repr_value
+        except AttributeError:
+            args = ", ".join(f"{arg!r}" for arg in self._ctor_arguments(self.children))
+            self._repr_value = f"{type(self).__name__}({args})"
+            return self._repr_value
+
+    def do_evaluate(
+        self,
+        df: DataFrame,
+        *,
+        context: ExecutionContext = ExecutionContext.FRAME,
+        mapping: dict[Expr, Column] | None = None,
+    ) -> Column:  # TODO: return type is a lie for Literal
+        """
+        Evaluate this expression given a dataframe for context.
+
+        Parameters
+        ----------
+        df
+            DataFrame that will provide columns.
+        context
+            What context are we performing this evaluation in?
+        mapping
+            Substitution mapping from expressions to Columns, used to
+            override the evaluation of a given expression if we're
+            performing a simple rewritten evaluation.
+
+        Notes
+        -----
+        Do not call this function directly, but rather
+        :meth:`evaluate` which handles the mapping lookups.
+
+        The typed return value of :class:`Column` is not true when
+        evaluating :class:`Literal` nodes (which instead produce
+        :class:`Scalar` objects). However, these duck-type to having a
+        pylibcudf container object inside them, and usually they end
+        up appearing in binary expressions which pylibcudf handles
+        appropriately since there are overloads for (column, scalar)
+        pairs. We don't have to handle (scalar, scalar) in binops
+        since the polars optimizer has a constant-folding pass.
+
+        Returns
+        -------
+        Column representing the evaluation of the expression (or maybe
+        a scalar).
+
+        Raises
+        ------
+        NotImplementedError if we couldn't evaluate the expression.
+        Ideally all these are returned during translation to the IR,
+        but for now we are not perfect.
+        """
+        raise NotImplementedError(f"Evaluation of {type(self).__name__}")
+
+    def evaluate(
+        self,
+        df: DataFrame,
+        *,
+        context: ExecutionContext = ExecutionContext.FRAME,
+        mapping: dict[Expr, Column] | None = None,
+    ) -> Column:  # TODO: return type is a lie for Literal
+        """
+        Evaluate this expression given a dataframe for context.
+
+        Parameters
+        ----------
+        df
+            DataFrame that will provide columns.
+        context
+            What context are we performing this evaluation in?
+        mapping
+            Substitution mapping from expressions to Columns, used to
+            override the evaluation of a given expression if we're
+            performing a simple rewritten evaluation.
+
+        Notes
+        -----
+        Individual subclasses should implement :meth:`do_evaluate`;
+        this method provides logic to handle lookups in the
+        substitution mapping.
+
+        Returns
+        -------
+        Column representing the evaluation of the expression (or maybe
+        a scalar, annoying!).
+
+        Raises
+        ------
+        NotImplementedError if we couldn't evaluate the expression.
+        Ideally all these are returned during translation to the IR,
+        but for now we are not perfect.
+        """
+        if mapping is None:
+            return self.do_evaluate(df, context=context, mapping=mapping)
+        try:
+            return mapping[self]
+        except KeyError:
+            return self.do_evaluate(df, context=context, mapping=mapping)
+
+    def collect_agg(self, *, depth: int) -> AggInfo:
+        """
+        Collect information about aggregations in groupbys.
+
+        Parameters
+        ----------
+        depth
+            The depth of aggregating (reduction or sampling)
+            expressions we are currently at.
+
+        Returns
+        -------
+        Aggregation info describing the expression to aggregate in the
+        groupby.
+
+        Raises
+        ------
+        NotImplementedError if we can't currently perform the
+        aggregation request (for example nested aggregations like
+        ``a.max().min()``).
+ """ + raise NotImplementedError( + f"Collecting aggregation info for {type(self).__name__}" + ) + + +class NamedExpr(Expr): + __slots__ = ("name", "children") + _non_child = ("dtype", "name") + + def __init__(self, dtype: plc.DataType, name: str, value: Expr) -> None: + super().__init__(dtype) + self.name = name + self.children = (value,) + + def do_evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: dict[Expr, Column] | None = None, + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + (child,) = self.children + return Column( + child.evaluate(df, context=context, mapping=mapping).obj, self.name + ) + + def collect_agg(self, *, depth: int) -> AggInfo: + """Collect information about aggregations in groupbys.""" + (value,) = self.children + return value.collect_agg(depth=depth) + + +class Literal(Expr): + __slots__ = ("value",) + _non_child = ("dtype", "value") + value: pa.Scalar + + def __init__(self, dtype: plc.DataType, value: Any) -> None: + super().__init__(dtype) + self.value = pa.scalar(value) + + def do_evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: dict[Expr, Column] | None = None, + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + # TODO: obey dtype + obj = plc.interop.from_arrow(self.value) + return Scalar(obj) # type: ignore + + +class Col(Expr): + __slots__ = ("name",) + _non_child = ("dtype", "name") + name: str + + def __init__(self, dtype: plc.DataType, name: str) -> None: + self.dtype = dtype + self.name = name + + def do_evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: dict[Expr, Column] | None = None, + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + return df._column_map[self.name] + + def collect_agg(self, *, depth: int) -> AggInfo: + """Collect information about aggregations in groupbys.""" + return AggInfo([(self, plc.aggregation.collect_list(), self)]) + + +class Len(Expr): + def do_evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: dict[Expr, Column] | None = None, + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + # TODO: type is wrong, and dtype + return df.num_rows # type: ignore + + def collect_agg(self, *, depth: int) -> AggInfo: + """Collect information about aggregations in groupbys.""" + # TODO: polars returns a uint, not an int for count + return AggInfo( + [(None, plc.aggregation.count(plc.types.NullPolicy.INCLUDE), self)] + ) + + +class BooleanFunction(Expr): + __slots__ = ("name", "options", "children") + _non_child = ("dtype", "name", "options") + + def __init__(self, dtype: plc.DataType, name: str, options: tuple, *children: Expr): + super().__init__(dtype) + self.options = options + self.name = name + self.children = children + if ( + self.name in (pl_expr.BooleanFunction.Any, pl_expr.BooleanFunction.All) + and not self.options[0] + ): + # With ignore_nulls == False, polars uses Kleene logic + raise NotImplementedError(f"Kleene logic for {self.name}") + if self.name in ( + pl_expr.BooleanFunction.IsFinite, + pl_expr.BooleanFunction.IsInfinite, + pl_expr.BooleanFunction.IsIn, + ): + raise NotImplementedError(f"{self.name}") + + @staticmethod + def _distinct( + column: Column, + *, + keep: plc.stream_compaction.DuplicateKeepOption, + source_value: plc.Scalar, + target_value: plc.Scalar, + ) -> Column: + table = 
plc.Table([column.obj]) + indices = plc.stream_compaction.distinct_indices( + table, + keep, + # TODO: polars doesn't expose options for these + plc.types.NullEquality.EQUAL, + plc.types.NanEquality.ALL_EQUAL, + ) + return Column( + plc.copying.scatter( + [source_value], + indices, + plc.Table([plc.Column.from_scalar(target_value, table.num_rows())]), + ).columns()[0], + column.name, + ) + + _BETWEEN_OPS: ClassVar[ + dict[ + pl_types.ClosedInterval, + tuple[plc.binaryop.BinaryOperator, plc.binaryop.BinaryOperator], + ] + ] = { + "none": ( + plc.binaryop.BinaryOperator.GREATER, + plc.binaryop.BinaryOperator.LESS, + ), + "left": ( + plc.binaryop.BinaryOperator.GREATER_EQUAL, + plc.binaryop.BinaryOperator.LESS, + ), + "right": ( + plc.binaryop.BinaryOperator.GREATER, + plc.binaryop.BinaryOperator.LESS_EQUAL, + ), + "both": ( + plc.binaryop.BinaryOperator.GREATER_EQUAL, + plc.binaryop.BinaryOperator.LESS_EQUAL, + ), + } + + def do_evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: dict[Expr, Column] | None = None, + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + columns = [ + child.evaluate(df, context=context, mapping=mapping) + for child in self.children + ] + if self.name == pl_expr.BooleanFunction.Any: + (column,) = columns + return plc.Column.from_scalar( + plc.reduce.reduce(column.obj, plc.aggregation.any(), self.dtype), 1 + ) + elif self.name == pl_expr.BooleanFunction.All: + (column,) = columns + return plc.Column.from_scalar( + plc.reduce.reduce(column.obj, plc.aggregation.all(), self.dtype), 1 + ) + if self.name == pl_expr.BooleanFunction.IsNull: + (column,) = columns + return Column(plc.unary.is_null(column.obj), column.name) + elif self.name == pl_expr.BooleanFunction.IsNotNull: + (column,) = columns + return Column(plc.unary.is_valid(column.obj), column.name) + elif self.name == pl_expr.BooleanFunction.IsNan: + # TODO: copy over null mask since is_nan(null) => null in polars + (column,) = columns + return Column(plc.unary.is_nan(column.obj), column.name) + elif self.name == pl_expr.BooleanFunction.IsNotNan: + # TODO: copy over null mask since is_not_nan(null) => null in polars + (column,) = columns + return Column(plc.unary.is_not_nan(column.obj), column.name) + elif self.name == pl_expr.BooleanFunction.IsFirstDistinct: + (column,) = columns + return self._distinct( + column, + keep=plc.stream_compaction.DuplicateKeepOption.KEEP_FIRST, + source_value=plc.interop.from_arrow(pa.scalar(True)), # noqa: FBT003 + target_value=plc.interop.from_arrow(pa.scalar(False)), # noqa: FBT003 + ) + elif self.name == pl_expr.BooleanFunction.IsLastDistinct: + (column,) = columns + return self._distinct( + column, + keep=plc.stream_compaction.DuplicateKeepOption.KEEP_LAST, + source_value=plc.interop.from_arrow(pa.scalar(True)), # noqa: FBT003 + target_value=plc.interop.from_arrow(pa.scalar(False)), # noqa: FBT003 + ) + elif self.name == pl_expr.BooleanFunction.IsUnique: + (column,) = columns + return self._distinct( + column, + keep=plc.stream_compaction.DuplicateKeepOption.KEEP_NONE, + source_value=plc.interop.from_arrow(pa.scalar(True)), # noqa: FBT003 + target_value=plc.interop.from_arrow(pa.scalar(False)), # noqa: FBT003 + ) + elif self.name == pl_expr.BooleanFunction.IsDuplicated: + (column,) = columns + return self._distinct( + column, + keep=plc.stream_compaction.DuplicateKeepOption.KEEP_NONE, + source_value=plc.interop.from_arrow(pa.scalar(False)), # noqa: FBT003 + 
target_value=plc.interop.from_arrow(pa.scalar(True)), # noqa: FBT003 + ) + elif self.name == pl_expr.BooleanFunction.AllHorizontal: + name = columns[0].name + if any(c.obj.null_count() > 0 for c in columns): + raise NotImplementedError("Kleene logic for all_horizontal") + return Column( + reduce( + partial( + plc.binaryop.binary_operation, + op=plc.binaryop.BinaryOperator.BITWISE_AND, + output_type=self.dtype, + ), + (c.obj for c in columns), + ), + name, + ) + elif self.name == pl_expr.BooleanFunction.AnyHorizontal: + name = columns[0].name + if any(c.obj.null_count() > 0 for c in columns): + raise NotImplementedError("Kleene logic for any_horizontal") + return Column( + reduce( + partial( + plc.binaryop.binary_operation, + op=plc.binaryop.BinaryOperator.BITWISE_OR, + output_type=self.dtype, + ), + (c.obj for c in columns), + ), + name, + ) + elif self.name == pl_expr.BooleanFunction.IsBetween: + column, lo, hi = columns + (closed,) = self.options + lop, rop = self._BETWEEN_OPS[closed] + return Column( + plc.binaryop.binary_operation( + plc.binaryop.binary_operation( + column.obj, lo.obj, lop, output_type=self.dtype + ), + plc.binaryop.binary_operation( + column.obj, hi.obj, rop, output_type=self.dtype + ), + plc.binaryop.BinaryOperator.LOGICAL_AND, + self.dtype, + ), + column.name, + ) + else: + raise NotImplementedError(f"BooleanFunction {self.name}") + + +class StringFunction(Expr): + __slots__ = ("name", "options", "children") + _non_child = ("dtype", "name", "options") + + def __init__( + self, + dtype: plc.DataType, + name: pl_expr.StringFunction, + options: tuple, + *children: Expr, + ): + super().__init__(dtype) + self.options = options + self.name = name + self.children = children + if self.name not in ( + pl_expr.StringFunction.Lowercase, + pl_expr.StringFunction.Uppercase, + pl_expr.StringFunction.EndsWith, + pl_expr.StringFunction.StartsWith, + ): + raise NotImplementedError(f"String function {self.name}") + + def do_evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: dict[Expr, Column] | None = None, + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + columns = [ + child.evaluate(df, context=context, mapping=mapping) + for child in self.children + ] + if self.name == pl_expr.StringFunction.Lowercase: + (column,) = columns + return Column(plc.strings.case.to_lower(column.obj), column.name) + elif self.name == pl_expr.StringFunction.Uppercase: + (column,) = columns + return Column(plc.strings.case.to_upper(column.obj), column.name) + elif self.name == pl_expr.StringFunction.EndsWith: + column, suffix = columns + return Column( + plc.strings.find.ends_with(column.obj, suffix.obj), column.name + ) + elif self.name == pl_expr.StringFunction.StartsWith: + column, suffix = columns + return Column( + plc.strings.find.starts_with(column.obj, suffix.obj), column.name + ) + else: + raise NotImplementedError(f"StringFunction {self.name}") + + +class Sort(Expr): + __slots__ = ("options", "children") + _non_child = ("dtype", "options") + + def __init__( + self, dtype: plc.DataType, options: tuple[bool, bool, bool], column: Expr + ): + super().__init__(dtype) + self.options = options + self.children = (column,) + + def do_evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: dict[Expr, Column] | None = None, + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + (child,) = self.children + column = child.evaluate(df, context=context, 
mapping=mapping) + (stable, nulls_last, descending) = self.options + order, null_order = sorting.sort_order( + [descending], nulls_last=nulls_last, num_keys=1 + ) + do_sort = plc.sorting.stable_sort if stable else plc.sorting.sort + table = do_sort(plc.Table([column.obj]), order, null_order) + return Column(table.columns()[0], column.name).set_sorted( + is_sorted=plc.types.Sorted.YES, order=order[0], null_order=null_order[0] + ) + + +class SortBy(Expr): + __slots__ = ("options", "children") + _non_child = ("dtype", "options") + + def __init__( + self, + dtype: plc.DataType, + options: tuple[bool, bool, tuple[bool]], + column: Expr, + *by: Expr, + ): + super().__init__(dtype) + self.options = options + self.children = (column, *by) + + def do_evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: dict[Expr, Column] | None = None, + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + column, *by = ( + child.evaluate(df, context=context, mapping=mapping) + for child in self.children + ) + (stable, nulls_last, descending) = self.options + order, null_order = sorting.sort_order( + descending, nulls_last=nulls_last, num_keys=len(by) + ) + do_sort = plc.sorting.stable_sort_by_key if stable else plc.sorting.sort_by_key + table = do_sort( + plc.Table([column.obj]), plc.Table([c.obj for c in by]), order, null_order + ) + return Column(table.columns()[0], column.name) + + +class Gather(Expr): + __slots__ = ("children",) + _non_child = ("dtype",) + + def __init__(self, dtype: plc.DataType, values: Expr, indices: Expr): + super().__init__(dtype) + self.children = (values, indices) + + def do_evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: dict[Expr, Column] | None = None, + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + values, indices = ( + child.evaluate(df, context=context, mapping=mapping) + for child in self.children + ) + lo, hi = plc.reduce.minmax(indices.obj) + lo = plc.interop.to_arrow(lo).as_py() + hi = plc.interop.to_arrow(hi).as_py() + n = df.num_rows + if hi >= n or lo < -n: + raise ValueError("gather indices are out of bounds") + if indices.obj.null_count(): + bounds_policy = plc.copying.OutOfBoundsPolicy.NULLIFY + obj = plc.replace.replace_nulls( + indices.obj, + plc.interop.from_arrow(pa.scalar(n), data_type=indices.obj.data_type()), + ) + else: + bounds_policy = plc.copying.OutOfBoundsPolicy.DONT_CHECK + obj = indices.obj + table = plc.copying.gather(plc.Table([values.obj]), obj, bounds_policy) + return Column(table.columns()[0], values.name) + + +class Filter(Expr): + __slots__ = ("children",) + _non_child = ("dtype",) + + def __init__(self, dtype: plc.DataType, values: Expr, indices: Expr): + super().__init__(dtype) + self.children = (values, indices) + + def do_evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: dict[Expr, Column] | None = None, + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + values, mask = ( + child.evaluate(df, context=context, mapping=mapping) + for child in self.children + ) + table = plc.stream_compaction.apply_boolean_mask( + plc.Table([values.obj]), mask.obj + ) + return Column(table.columns()[0], values.name).sorted_like(values) + + +class RollingWindow(Expr): + __slots__ = ("options", "children") + _non_child = ("dtype", "options") + + def __init__(self, dtype: plc.DataType, options: Any, agg: Expr): + 
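+        # This node carries ``options`` and the aggregation expression
+        # through translation only; it defines no ``do_evaluate``, so
+        # executing a plan containing it raises NotImplementedError via
+        # the base class.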
super().__init__(dtype) + self.options = options + self.children = (agg,) + + +class GroupedRollingWindow(Expr): + __slots__ = ("options", "children") + _non_child = ("dtype", "options") + + def __init__(self, dtype: plc.DataType, options: Any, agg: Expr, *by: Expr): + super().__init__(dtype) + self.options = options + self.children = (agg, *by) + + +class Cast(Expr): + __slots__ = ("children",) + _non_child = ("dtype",) + + def __init__(self, dtype: plc.DataType, value: Expr): + super().__init__(dtype) + self.children = (value,) + + def do_evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: dict[Expr, Column] | None = None, + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + (child,) = self.children + column = child.evaluate(df, context=context, mapping=mapping) + return Column(plc.unary.cast(column.obj, self.dtype), column.name).sorted_like( + column + ) + + def collect_agg(self, *, depth: int) -> AggInfo: + """Collect information about aggregations in groupbys.""" + # TODO: Could do with sort-based groupby and segmented filter + (child,) = self.children + return child.collect_agg(depth=depth) + + +class Agg(Expr): + __slots__ = ("name", "options", "op", "request", "children") + _non_child = ("dtype", "name", "options") + + def __init__( + self, dtype: plc.DataType, name: str, options: Any, value: Expr + ) -> None: + super().__init__(dtype) + # TODO: fix polars name + if name == "nunique": + name = "n_unique" + self.name = name + self.options = options + self.children = (value,) + if name not in Agg._SUPPORTED: + raise NotImplementedError(f"Unsupported aggregation {name=}") + # TODO: nan handling in groupby case + if name == "min": + req = plc.aggregation.min() + elif name == "max": + req = plc.aggregation.max() + elif name == "median": + req = plc.aggregation.median() + elif name == "n_unique": + # TODO: datatype of result + req = plc.aggregation.nunique(null_handling=plc.types.NullPolicy.INCLUDE) + elif name == "first" or name == "last": + req = None + elif name == "mean": + req = plc.aggregation.mean() + elif name == "sum": + req = plc.aggregation.sum() + elif name == "std": + # TODO: handle nans + req = plc.aggregation.std(ddof=options) + elif name == "var": + # TODO: handle nans + req = plc.aggregation.variance(ddof=options) + elif name == "count": + req = plc.aggregation.count(null_handling=plc.types.NullPolicy.EXCLUDE) + else: + raise NotImplementedError + self.request = req + op = getattr(self, f"_{name}", None) + if op is None: + op = partial(self._reduce, request=req) + elif name in {"min", "max"}: + op = partial(op, propagate_nans=options) + elif name in {"count", "first", "last"}: + pass + else: + raise AssertionError + self.op = op + + _SUPPORTED: ClassVar[frozenset[str]] = frozenset( + [ + "min", + "max", + "median", + "n_unique", + "first", + "last", + "mean", + "sum", + "count", + "std", + "var", + ] + ) + + def collect_agg(self, *, depth: int) -> AggInfo: + """Collect information about aggregations in groupbys.""" + if depth >= 1: + raise NotImplementedError("Nested aggregations in groupby") + (child,) = self.children + ((expr, _, _),) = child.collect_agg(depth=depth + 1).requests + if self.request is None: + raise NotImplementedError(f"Aggregation {self.name} in groupby") + return AggInfo([(expr, self.request, self)]) + + def _reduce( + self, column: Column, *, request: plc.aggregation.Aggregation + ) -> Column: + return Column( + plc.Column.from_scalar( + plc.reduce.reduce(column.obj, 
request, self.dtype), + 1, + ), + column.name, + ) + + def _count(self, column: Column) -> Column: + # TODO: dtype handling + return Column( + plc.Column.from_scalar( + plc.interop.from_arrow( + pa.scalar(column.obj.size() - column.obj.null_count()), + ), + 1, + ), + column.name, + ) + + def _min(self, column: Column, *, propagate_nans: bool) -> Column: + if propagate_nans and column.nan_count > 0: + return Column( + plc.Column.from_scalar( + plc.interop.from_arrow( + pa.scalar(float("nan")), data_type=self.dtype + ), + 1, + ), + column.name, + ) + if column.nan_count > 0: + column = column.mask_nans() + return self._reduce(column, request=plc.aggregation.min()) + + def _max(self, column: Column, *, propagate_nans: bool) -> Column: + if propagate_nans and column.nan_count > 0: + return Column( + plc.Column.from_scalar( + plc.interop.from_arrow( + pa.scalar(float("nan")), data_type=self.dtype + ), + 1, + ), + column.name, + ) + if column.nan_count > 0: + column = column.mask_nans() + return self._reduce(column, request=plc.aggregation.max()) + + def _first(self, column: Column) -> Column: + return Column(plc.copying.slice(column.obj, [0, 1])[0], column.name) + + def _last(self, column: Column) -> Column: + n = column.obj.size() + return Column(plc.copying.slice(column.obj, [n - 1, n])[0], column.name) + + def do_evaluate( + self, + df, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: dict[Expr, Column] | None = None, + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + if context is not ExecutionContext.FRAME: + raise NotImplementedError(f"Agg in context {context}") + (child,) = self.children + return self.op(child.evaluate(df, context=context, mapping=mapping)) + + +class BinOp(Expr): + __slots__ = ("op", "children") + _non_child = ("dtype", "op") + + def __init__( + self, + dtype: plc.DataType, + op: plc.binaryop.BinaryOperator, + left: Expr, + right: Expr, + ) -> None: + super().__init__(dtype) + self.op = op + self.children = (left, right) + + _MAPPING: ClassVar[dict[pl_expr.Operator, plc.binaryop.BinaryOperator]] = { + pl_expr.Operator.Eq: plc.binaryop.BinaryOperator.EQUAL, + pl_expr.Operator.EqValidity: plc.binaryop.BinaryOperator.NULL_EQUALS, + pl_expr.Operator.NotEq: plc.binaryop.BinaryOperator.NOT_EQUAL, + pl_expr.Operator.NotEqValidity: plc.binaryop.BinaryOperator.NULL_NOT_EQUALS, + pl_expr.Operator.Lt: plc.binaryop.BinaryOperator.LESS, + pl_expr.Operator.LtEq: plc.binaryop.BinaryOperator.LESS_EQUAL, + pl_expr.Operator.Gt: plc.binaryop.BinaryOperator.GREATER, + pl_expr.Operator.GtEq: plc.binaryop.BinaryOperator.GREATER_EQUAL, + pl_expr.Operator.Plus: plc.binaryop.BinaryOperator.ADD, + pl_expr.Operator.Minus: plc.binaryop.BinaryOperator.SUB, + pl_expr.Operator.Multiply: plc.binaryop.BinaryOperator.MUL, + pl_expr.Operator.Divide: plc.binaryop.BinaryOperator.DIV, + pl_expr.Operator.TrueDivide: plc.binaryop.BinaryOperator.TRUE_DIV, + pl_expr.Operator.FloorDivide: plc.binaryop.BinaryOperator.FLOOR_DIV, + pl_expr.Operator.Modulus: plc.binaryop.BinaryOperator.PYMOD, + pl_expr.Operator.And: plc.binaryop.BinaryOperator.BITWISE_AND, + pl_expr.Operator.Or: plc.binaryop.BinaryOperator.BITWISE_OR, + pl_expr.Operator.Xor: plc.binaryop.BinaryOperator.BITWISE_XOR, + pl_expr.Operator.LogicalAnd: plc.binaryop.BinaryOperator.LOGICAL_AND, + pl_expr.Operator.LogicalOr: plc.binaryop.BinaryOperator.LOGICAL_OR, + } + + def do_evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: dict[Expr, Column] | None = 
None, + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + left, right = ( + child.evaluate(df, context=context, mapping=mapping) + for child in self.children + ) + return Column( + plc.binaryop.binary_operation(left.obj, right.obj, self.op, self.dtype), + "what", + ) + + def collect_agg(self, *, depth: int) -> AggInfo: + """Collect information about aggregations in groupbys.""" + if depth == 1: + # inside aggregation, need to pre-evaluate, + # groupby construction has checked that we don't have + # nested aggs, so stop the recursion and return ourselves + # for pre-eval + return AggInfo([(self, plc.aggregation.collect_list(), self)]) + else: + left_info, right_info = ( + child.collect_agg(depth=depth) for child in self.children + ) + requests = [*left_info.requests, *right_info.requests] + # TODO: Hack, if there were no reductions inside this + # binary expression then we want to pre-evaluate and + # collect ourselves. Otherwise we want to collect the + # aggregations inside and post-evaluate. This is a bad way + # of checking that we are in case 1. + if all( + agg.kind() == plc.aggregation.Kind.COLLECT_LIST + for _, agg, _ in requests + ): + return AggInfo([(self, plc.aggregation.collect_list(), self)]) + return AggInfo( + [*left_info.requests, *right_info.requests], + ) diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py new file mode 100644 index 00000000000..d630b40f600 --- /dev/null +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -0,0 +1,879 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 +# TODO: remove need for this +# ruff: noqa: D101 +""" +DSL nodes for the LogicalPlan of polars. + +An IR node is either a source, normal, or a sink. Respectively they +can be considered as functions: + +- source: `IO () -> DataFrame` +- normal: `DataFrame -> DataFrame` +- sink: `DataFrame -> IO ()` +""" + +from __future__ import annotations + +import itertools +import types +from dataclasses import dataclass +from functools import cache +from typing import TYPE_CHECKING, Any, Callable, ClassVar + +import pyarrow as pa +from typing_extensions import assert_never + +import polars as pl + +import cudf +import cudf._lib.pylibcudf as plc + +import cudf_polars.dsl.expr as expr +from cudf_polars.containers import Column, DataFrame +from cudf_polars.utils import sorting + +if TYPE_CHECKING: + from typing import Literal + + +__all__ = [ + "IR", + "PythonScan", + "Scan", + "Cache", + "DataFrameScan", + "Select", + "GroupBy", + "Join", + "HStack", + "Distinct", + "Sort", + "Slice", + "Filter", + "Projection", + "MapFunction", + "Union", + "HConcat", + "ExtContext", +] + + +@dataclass(slots=True) +class IR: + """Abstract plan node, representing an unevaluated dataframe.""" + + schema: dict[str, plc.DataType] + """Mapping from column names to their data types.""" + + def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: + """ + Evaluate the node and return a dataframe. + + Parameters + ---------- + cache + Mapping from cached node ids to constructed DataFrames. + Used to implement evaluation of the `Cache` node. + + Returns + ------- + DataFrame (on device) representing the evaluation of this plan + node. + + Raises + ------ + NotImplementedError if we couldn't evaluate things. Ideally + this should not occur, since the translation phase should pick + up things that we cannot handle. 
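+
+        A minimal driver sketch (``node`` is assumed to be the root of
+        a translated plan; the empty dict seeds the cache consumed by
+        :class:`Cache` nodes)::
+
+            result = node.evaluate(cache={})  # DataFrame on device
+            polars_df = result.to_polars()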
+ """ + raise NotImplementedError + + +@dataclass(slots=True) +class PythonScan(IR): + """Representation of input from a python function.""" + + options: Any + """Arbitrary options.""" + predicate: expr.Expr | None + """Filter to apply to the constructed dataframe before returning it.""" + + +@dataclass(slots=True) +class Scan(IR): + """Input from files.""" + + typ: Any + """What type of file are we reading? Parquet, CSV, etc...""" + paths: list[str] + """List of paths to read from.""" + file_options: Any + """Options for reading the file. + + Attributes are: + - ``with_columns: list[str]`` of projected columns to return. + - ``n_rows: int``: Number of rows to read. + - ``row_index: tuple[name, offset] | None``: Add an integer index + column with given name. + """ + predicate: expr.Expr | None + """Mask to apply to the read dataframe.""" + + def __post_init__(self): + """Validate preconditions.""" + if self.file_options.n_rows is not None: + raise NotImplementedError("row limit in scan") + if self.typ not in ("csv", "parquet"): + raise NotImplementedError(f"Unhandled scan type: {self.typ}") + + def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: + """Evaluate and return a dataframe.""" + options = self.file_options + with_columns = options.with_columns + row_index = options.row_index + if self.typ == "csv": + df = DataFrame.from_cudf( + cudf.concat( + [cudf.read_csv(p, usecols=with_columns) for p in self.paths] + ) + ) + elif self.typ == "parquet": + df = DataFrame.from_cudf( + cudf.read_parquet(self.paths, columns=with_columns) + ) + else: + assert_never(self.typ) + if row_index is not None: + name, offset = row_index + # TODO: dtype + step = plc.interop.from_arrow(pa.scalar(1)) + init = plc.interop.from_arrow(pa.scalar(offset)) + index = Column( + plc.filling.sequence(df.num_rows, init, step), name + ).set_sorted( + is_sorted=plc.types.Sorted.YES, + order=plc.types.Order.ASCENDING, + null_order=plc.types.NullOrder.AFTER, + ) + df = DataFrame([index, *df.columns], []) + # TODO: should be true, but not the case until we get + # cudf-classic out of the loop for IO since it converts date32 + # to datetime. + # assert all( + # c.obj.type() == dtype + # for c, dtype in zip(df.columns, self.schema.values()) + # ) + if self.predicate is None: + return df + else: + mask = self.predicate.evaluate(df) + return df.filter(mask) + + +@dataclass(slots=True) +class Cache(IR): + """ + Return a cached plan node. + + Used for CSE at the plan level. + """ + + key: int + """The cache key.""" + value: IR + """The unevaluated node to cache.""" + + def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: + """Evaluate and return a dataframe.""" + try: + return cache[self.key] + except KeyError: + return cache.setdefault(self.key, self.value.evaluate(cache=cache)) + + +@dataclass(slots=True) +class DataFrameScan(IR): + """ + Input from an existing polars DataFrame. 
+ + This typically arises from ``q.collect().lazy()`` + """ + + df: Any + """Polars LazyFrame object.""" + projection: list[str] + """List of columns to project out.""" + predicate: expr.Expr | None + """Mask to apply.""" + + def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: + """Evaluate and return a dataframe.""" + pdf = pl.DataFrame._from_pydf(self.df) + if self.projection is not None: + pdf = pdf.select(self.projection) + # TODO: goes away when libcudf supports large strings + table = pdf.to_arrow() + schema = table.schema + for i, field in enumerate(schema): + if field.type == pa.large_string(): + # TODO: Nested types + schema = schema.set(i, pa.field(field.name, pa.string())) + table = table.cast(schema) + df = DataFrame.from_table( + plc.interop.from_arrow(table), list(self.schema.keys()) + ) + assert all( + c.obj.type() == dtype for c, dtype in zip(df.columns, self.schema.values()) + ) + if self.predicate is not None: + mask = self.predicate.evaluate(df) + return df.filter(mask) + else: + return df + + +@dataclass(slots=True) +class Select(IR): + """Produce a new dataframe selecting given expressions from an input.""" + + df: IR + """Input dataframe.""" + cse: list[expr.Expr] + """ + List of common subexpressions that will appear in the selected expressions. + + These must be evaluated before the returned expressions. + """ + expr: list[expr.Expr] + """List of expressions to evaluate to form the new dataframe.""" + + def evaluate(self, *, cache: dict[int, DataFrame]): + """Evaluate and return a dataframe.""" + df = self.df.evaluate(cache=cache) + df = df.with_columns([e.evaluate(df) for e in self.cse]) + return DataFrame([e.evaluate(df) for e in self.expr], []) + + +@dataclass(slots=True) +class Reduce(IR): + """ + Produce a new dataframe selecting given expressions from an input. + + This is a special case of :class:`Select` where all outputs are a single row. + """ + + df: IR + """Input dataframe.""" + expr: list[expr.Expr] + """List of expressions to evaluate to form the new dataframe.""" + + def evaluate(self, *, cache: dict[int, DataFrame]): + """Evaluate and return a dataframe.""" + df = self.df.evaluate(cache=cache) + return DataFrame([e.evaluate(df) for e in self.expr], []) + + +def placeholder_column(n: int): + """ + Produce a placeholder pylibcudf column with NO BACKING DATA. + + Parameters + ---------- + n + Number of rows the column will advertise + + Returns + ------- + pylibcudf Column that is almost unusable. DO NOT ACCESS THE DATA BUFFER. + + Notes + ----- + This is used to avoid allocating data for count aggregations. + """ + return plc.Column( + plc.DataType(plc.TypeId.INT8), + n, + plc.gpumemoryview( + types.SimpleNamespace(__cuda_array_interface__={"data": (1, True)}) + ), + None, + 0, + 0, + [], + ) + + +@dataclass(slots=False) +class GroupBy(IR): + """Perform a groupby.""" + + df: IR + """Input dataframe.""" + agg_requests: list[expr.Expr] + """List of expressions to evaluate groupwise.""" + keys: list[expr.Expr] + """List of expressions forming the keys.""" + maintain_order: bool + """Should the order of the input dataframe be maintained?""" + options: Any + """Options controlling style of groupby.""" + + @staticmethod + def check_agg(agg: expr.Expr) -> int: + """ + Determine if we can handle an aggregation expression. + + Parameters + ---------- + agg + Expression to check + + Returns + ------- + depth of nesting + + Raises + ------ + NotImplementedError for unsupported expression nodes. 
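+
+        For example (illustrative), requests translated from these
+        polars expressions behave as follows::
+
+            pl.col("a").sum()        # depth 1: accepted
+            pl.col("a").sum().min()  # depth 2: rejected in __post_init__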
+ """ + if isinstance(agg, (expr.NamedExpr, expr.BinOp, expr.Cast)): + return max(GroupBy.check_agg(child) for child in agg.children) + elif isinstance(agg, expr.Agg): + if agg.name == "implode": + raise NotImplementedError("implode in groupby") + return 1 + max(GroupBy.check_agg(child) for child in agg.children) + elif isinstance(agg, (expr.Len, expr.Col, expr.Literal)): + return 0 + else: + raise NotImplementedError(f"No handler for {agg=}") + + def __post_init__(self): + """Check whether all the aggregations are implemented.""" + if self.options.rolling is None and self.maintain_order: + raise NotImplementedError("Maintaining order in groupby") + if self.options.rolling: + raise NotImplementedError("rolling window/groupby") + if any(GroupBy.check_agg(a) > 1 for a in self.agg_requests): + raise NotImplementedError("Nested aggregations in groupby") + self.agg_infos = [req.collect_agg(depth=0) for req in self.agg_requests] + + def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: + """Evaluate and return a dataframe.""" + df = self.df.evaluate(cache=cache) + keys = [k.evaluate(df) for k in self.keys] + # TODO: use sorted information, need to expose column_order + # and null_precedence in pylibcudf groupby constructor + # sorted = ( + # plc.types.Sorted.YES + # if all(k.is_sorted for k in keys) + # else plc.types.Sorted.NO + # ) + grouper = plc.groupby.GroupBy( + plc.Table([k.obj for k in keys]), + null_handling=plc.types.NullPolicy.INCLUDE, + ) + # TODO: uniquify + requests = [] + replacements = [] + for info in self.agg_infos: + for pre_eval, req, rep in info.requests: + if pre_eval is None: + col = placeholder_column(df.num_rows) + else: + col = pre_eval.evaluate(df).obj + requests.append(plc.groupby.GroupByRequest(col, [req])) + replacements.append(rep) + group_keys, raw_tables = grouper.aggregate(requests) + raw_columns = [] + for i, table in enumerate(raw_tables): + (column,) = table.columns() + raw_columns.append(Column(column, f"column{i}")) + mapping = dict(zip(replacements, raw_columns)) + result_keys = [Column(gk, k.name) for gk, k in zip(group_keys.columns(), keys)] + result_subs = DataFrame(raw_columns, []) + results = [ + req.evaluate(result_subs, mapping=mapping) for req in self.agg_requests + ] + return DataFrame([*result_keys, *results], []).slice(self.options.slice) + + +@dataclass(slots=True) +class Join(IR): + """A join of two dataframes.""" + + left: IR + """Left frame.""" + right: IR + """Right frame.""" + left_on: list[expr.Expr] + """List of expressions used as keys in the left frame.""" + right_on: list[expr.Expr] + """List of expressions used as keys in the right frame.""" + options: tuple[ + Literal["inner", "left", "full", "leftsemi", "leftanti"], + bool, + tuple[int, int] | None, + str | None, + bool, + ] + """ + tuple of options: + - how: join type + - join_nulls: do nulls compare equal? + - slice: optional slice to perform after joining. 
+ - suffix: string suffix for right columns if names match + - coalesce: should key columns be coalesced (only makes sense for outer joins) + """ + + def __post_init__(self): + """Validate preconditions.""" + if self.options[0] == "cross": + raise NotImplementedError("cross join not implemented") + + @cache + @staticmethod + def _joiners( + how: Literal["inner", "left", "full", "leftsemi", "leftanti"], + ) -> tuple[ + Callable, plc.copying.OutOfBoundsPolicy, plc.copying.OutOfBoundsPolicy | None + ]: + if how == "inner": + return ( + plc.join.inner_join, + plc.copying.OutOfBoundsPolicy.DONT_CHECK, + plc.copying.OutOfBoundsPolicy.DONT_CHECK, + ) + elif how == "left": + return ( + plc.join.left_join, + plc.copying.OutOfBoundsPolicy.DONT_CHECK, + plc.copying.OutOfBoundsPolicy.NULLIFY, + ) + elif how == "full": + return ( + plc.join.full_join, + plc.copying.OutOfBoundsPolicy.NULLIFY, + plc.copying.OutOfBoundsPolicy.NULLIFY, + ) + elif how == "leftsemi": + return ( + plc.join.left_semi_join, + plc.copying.OutOfBoundsPolicy.DONT_CHECK, + None, + ) + elif how == "leftanti": + return ( + plc.join.left_anti_join, + plc.copying.OutOfBoundsPolicy.DONT_CHECK, + None, + ) + else: + assert_never(how) + + def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: + """Evaluate and return a dataframe.""" + left = self.left.evaluate(cache=cache) + right = self.right.evaluate(cache=cache) + left_on = DataFrame([e.evaluate(left) for e in self.left_on], []) + right_on = DataFrame([e.evaluate(right) for e in self.right_on], []) + how, join_nulls, zlice, suffix, coalesce = self.options + null_equality = ( + plc.types.NullEquality.EQUAL + if join_nulls + else plc.types.NullEquality.UNEQUAL + ) + suffix = "_right" if suffix is None else suffix + join_fn, left_policy, right_policy = Join._joiners(how) + if right_policy is None: + # Semi join + lg = join_fn(left_on.table, right_on.table, null_equality) + left = left.replace_columns(*left_on.columns) + table = plc.copying.gather(left.table, lg, left_policy) + result = DataFrame.from_table(table, left.column_names) + else: + lg, rg = join_fn(left_on.table, right_on.table, null_equality) + left = left.replace_columns(*left_on.columns) + right = right.replace_columns(*right_on.columns) + if coalesce and how == "inner": + right = right.discard_columns(right_on.column_names_set) + left = DataFrame.from_table( + plc.copying.gather(left.table, lg, left_policy), left.column_names + ) + right = DataFrame.from_table( + plc.copying.gather(right.table, rg, right_policy), right.column_names + ) + if coalesce and how != "inner": + left = left.replace_columns( + *( + Column( + plc.replace.replace_nulls(left_col.obj, right_col.obj), + left_col.name, + ) + for left_col, right_col in zip( + left.select_columns(left_on.column_names_set), + right.select_columns(right_on.column_names_set), + ) + ) + ) + right = right.discard_columns(right_on.column_names_set) + right = right.rename_columns( + { + name: f"{name}{suffix}" + for name in right.column_names + if name in left.column_names_set + } + ) + result = left.with_columns(right.columns) + return result.slice(zlice) + + +@dataclass(slots=True) +class HStack(IR): + """Add new columns to a dataframe.""" + + df: IR + """Input dataframe.""" + cse: list[expr.Expr] + """ + List of common subexpressions that will appear in the selected expressions. + + These must be evaluated before the returned expressions. 
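+
+        For example (a hypothetical query), polars may factor the
+        repeated ``pl.col("a") + 1`` in::
+
+            lf.with_columns(
+                (pl.col("a") + 1).alias("x"),
+                ((pl.col("a") + 1) * 2).alias("y"),
+            )
+
+        into a single entry here that both output columns reference.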
+ """ + columns: list[expr.Expr] + """List of expressions to produce new columns.""" + + def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: + """Evaluate and return a dataframe.""" + df = self.df.evaluate(cache=cache) + ctx = df.copy().with_columns([e.evaluate(df) for e in self.cse]) + return df.with_columns([c.evaluate(ctx) for c in self.columns]) + + +@dataclass(slots=True) +class Distinct(IR): + """Produce a new dataframe with distinct rows.""" + + df: IR + """Input dataframe.""" + keep: plc.stream_compaction.DuplicateKeepOption + """Which rows to keep.""" + subset: set[str] | None + """Which columns to inspect when computing distinct rows.""" + zlice: tuple[int, int] | None + """Optional slice to perform after compaction.""" + stable: bool + """Should order be preserved?""" + + _KEEP_MAP: ClassVar[dict[str, plc.stream_compaction.DuplicateKeepOption]] = { + "first": plc.stream_compaction.DuplicateKeepOption.KEEP_FIRST, + "last": plc.stream_compaction.DuplicateKeepOption.KEEP_LAST, + "none": plc.stream_compaction.DuplicateKeepOption.KEEP_NONE, + "any": plc.stream_compaction.DuplicateKeepOption.KEEP_ANY, + } + + def __init__(self, schema: dict, df: IR, options: Any): + self.schema = schema + self.df = df + (keep, subset, maintain_order, zlice) = options + self.keep = Distinct._KEEP_MAP[keep] + self.subset = set(subset) if subset is not None else None + self.stable = maintain_order + self.zlice = zlice + + def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: + """Evaluate and return a dataframe.""" + df = self.df.evaluate(cache=cache) + if self.subset is None: + indices = list(range(df.num_columns)) + else: + indices = [i for i, k in enumerate(df.column_names) if k in self.subset] + keys_sorted = all(df.columns[i].is_sorted for i in indices) + if keys_sorted: + table = plc.stream_compaction.unique( + df.table, + indices, + self.keep, + plc.types.NullEquality.EQUAL, + ) + else: + distinct = ( + plc.stream_compaction.stable_distinct + if self.stable + else plc.stream_compaction.distinct + ) + table = distinct( + df.table, + indices, + self.keep, + plc.types.NullEquality.EQUAL, + plc.types.NanEquality.ALL_EQUAL, + ) + result = DataFrame( + [Column(c, old.name) for c, old in zip(table.columns(), df.columns)], [] + ) + if keys_sorted or self.stable: + result = result.sorted_like(df) + return result.slice(self.zlice) + + +@dataclass(slots=True) +class Sort(IR): + """Sort a dataframe.""" + + df: IR + """Input.""" + by: list[expr.Expr] + """List of expressions to produce sort keys.""" + do_sort: Callable[..., plc.Table] + """pylibcudf sorting function.""" + zlice: tuple[int, int] | None + """Optional slice to apply after sorting.""" + order: list[plc.types.Order] + """Order keys should be sorted in.""" + null_order: list[plc.types.NullOrder] + """Where nulls sort to.""" + + def __init__( + self, + schema: dict, + df: IR, + by: list[expr.Expr], + options: Any, + zlice: tuple[int, int] | None, + ): + self.schema = schema + self.df = df + self.by = by + self.zlice = zlice + stable, nulls_last, descending = options + self.order, self.null_order = sorting.sort_order( + descending, nulls_last=nulls_last, num_keys=len(by) + ) + self.do_sort = ( + plc.sorting.stable_sort_by_key if stable else plc.sorting.sort_by_key + ) + + def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: + """Evaluate and return a dataframe.""" + df = self.df.evaluate(cache=cache) + sort_keys = [k.evaluate(df) for k in self.by] + names = {c.name: i for i, c in enumerate(df.columns)} + # TODO: 
More robust identification here.
+        keys_in_result = {
+            i: k
+            for k, key in enumerate(sort_keys)
+            if (i := names.get(key.name)) is not None
+            and key.obj is df.columns[i].obj
+        }
+        table = self.do_sort(
+            df.table,
+            plc.Table([k.obj for k in sort_keys]),
+            self.order,
+            self.null_order,
+        )
+        columns = [Column(c, old.name) for c, old in zip(table.columns(), df.columns)]
+        # If a sort key is in the result table, set the sortedness property
+        # using the order/null_order of that key's position in the sort.
+        for i, k in keys_in_result.items():
+            columns[i] = columns[i].set_sorted(
+                is_sorted=plc.types.Sorted.YES,
+                order=self.order[k],
+                null_order=self.null_order[k],
+            )
+        return DataFrame(columns, []).slice(self.zlice)
+
+
+@dataclass(slots=True)
+class Slice(IR):
+    """Slice a dataframe."""
+
+    df: IR
+    """Input."""
+    offset: int
+    """Start of the slice."""
+    length: int
+    """Length of the slice."""
+
+    def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame:
+        """Evaluate and return a dataframe."""
+        df = self.df.evaluate(cache=cache)
+        return df.slice((self.offset, self.length))
+
+
+@dataclass(slots=True)
+class Filter(IR):
+    """Filter a dataframe with a boolean mask."""
+
+    df: IR
+    """Input."""
+    mask: expr.Expr
+    """Expression evaluating to a mask."""
+
+    def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame:
+        """Evaluate and return a dataframe."""
+        df = self.df.evaluate(cache=cache)
+        return df.filter(self.mask.evaluate(df))
+
+
+@dataclass(slots=True)
+class Projection(IR):
+    """Select a subset of columns from a dataframe."""
+
+    df: IR
+    """Input."""
+
+    def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame:
+        """Evaluate and return a dataframe."""
+        df = self.df.evaluate(cache=cache)
+        # This can reorder things.
+        return df.select(list(self.schema.keys()))
+
+
+@dataclass(slots=True)
+class MapFunction(IR):
+    """Apply some function to a dataframe."""
+
+    df: IR
+    """Input."""
+    name: str
+    """Function name."""
+    options: Any
+    """Arbitrary options, interpreted per function."""
+
+    _NAMES: ClassVar[frozenset[str]] = frozenset(
+        [
+            "drop_nulls",
+            "rechunk",
+            "merge_sorted",
+            "rename",
+            "explode",
+        ]
+    )
+
+    def __post_init__(self):
+        """Validate preconditions."""
+        if self.name not in MapFunction._NAMES:
+            raise NotImplementedError(f"Unhandled map function {self.name}")
+        if self.name == "explode":
+            (to_explode,) = self.options
+            if len(to_explode) > 1:
+                # TODO: straightforward, but need to error check
+                # polars requires that all to-explode columns have the
+                # same sub-shapes
+                raise NotImplementedError("Explode with more than one column")
+        elif self.name == "merge_sorted":
+            assert isinstance(self.df, Union)
+            (key_column,) = self.options
+            if key_column not in self.df.dfs[0].schema:
+                raise ValueError(f"Key column {key_column} not found")
+
+    def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame:
+        """Evaluate and return a dataframe."""
+        if self.name == "merge_sorted":
+            # merge_sorted operates on Union inputs
+            # but if we evaluate the Union then we can't unpick the
+            # pieces, so we dive inside and evaluate the pieces by hand
+            assert isinstance(self.df, Union)
+            first, *rest = (c.evaluate(cache=cache) for c in self.df.dfs)
+            (key_column,) = self.options
+            if not all(first.column_names == r.column_names for r in rest):
+                raise ValueError("DataFrame shapes/column names don't match")
+            # Already validated that key_column is in column names
+            index = first.column_names.index(key_column)
+            return DataFrame.from_table(
+                plc.merge.merge_sorted(
+                    [first.table, *(df.table for df in rest)],
+                    [index],
+                    [plc.types.Order.ASCENDING],
+                    [plc.types.NullOrder.BEFORE],
+                ),
+                first.column_names,
+            ).sorted_like(first, subset={key_column})
+        elif self.name == "rechunk":
+            # No-op in our data model
+            return self.df.evaluate(cache=cache)
+        elif self.name == "drop_nulls":
+            df = self.df.evaluate(cache=cache)
+            (subset,) = self.options
+            subset = set(subset)
+            indices = [i for i, name in enumerate(df.column_names) if name in subset]
+            return DataFrame.from_table(
+                plc.stream_compaction.drop_nulls(df.table, indices, len(indices)),
+                df.column_names,
+            ).sorted_like(df)
+        elif self.name == "rename":
+            df = self.df.evaluate(cache=cache)
+            # final tag is "swapping" which is useful for the
+            # optimiser (it blocks some pushdown operations)
+            old, new, _ = self.options
+            return df.rename_columns(dict(zip(old, new)))
+        elif self.name == "explode":
+            df = self.df.evaluate(cache=cache)
+            ((to_explode,),) = self.options
+            index = df.column_names.index(to_explode)
+            subset = df.column_names_set - {to_explode}
+            return DataFrame.from_table(
+                plc.lists.explode_outer(df.table, index), df.column_names
+            ).sorted_like(df, subset=subset)
+        else:
+            raise AssertionError("Should never be reached")
+
+
+@dataclass(slots=True)
+class Union(IR):
+    """Concatenate dataframes vertically."""
+
+    dfs: list[IR]
+    """List of inputs."""
+    zlice: tuple[int, int] | None
+    """Optional slice to apply after concatenation."""
+
+    def __post_init__(self):
+        """Validate preconditions."""
+        schema = self.dfs[0].schema
+        if not all(s.schema == schema for s in self.dfs[1:]):
+            raise ValueError("Schema mismatch")
+
+    def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame:
+        """Evaluate and return a dataframe."""
+        # TODO: only evaluate what we need if we have a slice
+        dfs = [df.evaluate(cache=cache) for df in self.dfs]
+        return DataFrame.from_table(
+            plc.concatenate.concatenate([df.table for df in dfs]), dfs[0].column_names
+        ).slice(self.zlice)
+
+
+@dataclass(slots=True)
+class HConcat(IR):
+    """Concatenate dataframes horizontally."""
+
+    dfs: list[IR]
+    """List of inputs."""
+
+    def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame:
+        """Evaluate and return a dataframe."""
+        dfs = [df.evaluate(cache=cache) for df in self.dfs]
+        columns, scalars = zip(*((df.columns, df.scalars) for df in dfs))
+        return DataFrame(
+            list(itertools.chain.from_iterable(columns)),
+            list(itertools.chain.from_iterable(scalars)),
+        )
+
+
+@dataclass(slots=True)
+class ExtContext(IR):
+    """
+    Concatenate dataframes horizontally.
+
+    Prefer HConcat, since this is going to be deprecated on the polars side.
+    """
+
+    df: IR
+    """Input."""
+    extra: list[IR]
+    """List of extra inputs."""
+
+    def __post_init__(self):
+        """Validate preconditions."""
+        raise NotImplementedError(
+            "ExtContext will be deprecated, use horizontal concat instead."
+        )
diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py
new file mode 100644
index 00000000000..b3d0edf183f
--- /dev/null
+++ b/python/cudf_polars/cudf_polars/dsl/translate.py
@@ -0,0 +1,403 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0 + +"""Translate polars IR representation to ours.""" + +from __future__ import annotations + +from contextlib import AbstractContextManager, nullcontext +from functools import singledispatch +from typing import Any + +from polars.polars import _expr_nodes as pl_expr, _ir_nodes as pl_ir + +import cudf._lib.pylibcudf as plc # noqa: TCH002, singledispatch register needs this name defined. + +from cudf_polars.dsl import expr, ir +from cudf_polars.utils import dtypes + +__all__ = ["translate_ir", "translate_expr"] + + +class set_node(AbstractContextManager): + """Run a block with current node set in the visitor.""" + + __slots__ = ("n", "visitor") + + def __init__(self, visitor, n: int): + self.visitor = visitor + self.n = n + + def __enter__(self): + n = self.visitor.get_node() + self.visitor.set_node(self.n) + self.n = n + + def __exit__(self, *args): + self.visitor.set_node(self.n) + + +noop_context: nullcontext = nullcontext() + + +@singledispatch +def _translate_ir(node: Any, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: + raise NotImplementedError(f"Translation for {type(node).__name__}") + + +@_translate_ir.register +def _(node: pl_ir.PythonScan, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: + return ir.PythonScan( + schema, + node.options, + translate_expr(visitor, n=node.predicate) + if node.predicate is not None + else None, + ) + + +@_translate_ir.register +def _(node: pl_ir.Scan, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: + return ir.Scan( + schema, + node.scan_type, + node.paths, + node.file_options, + translate_expr(visitor, n=node.predicate) + if node.predicate is not None + else None, + ) + + +@_translate_ir.register +def _(node: pl_ir.Cache, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: + return ir.Cache(schema, node.id_, translate_ir(visitor, n=node.input)) + + +@_translate_ir.register +def _( + node: pl_ir.DataFrameScan, visitor: Any, schema: dict[str, plc.DataType] +) -> ir.IR: + return ir.DataFrameScan( + schema, + node.df, + node.projection, + translate_expr(visitor, n=node.selection) + if node.selection is not None + else None, + ) + + +@_translate_ir.register +def _(node: pl_ir.Select, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: + with set_node(visitor, node.input): + inp = translate_ir(visitor, n=None) + cse_exprs = [translate_expr(visitor, n=e) for e in node.cse_expr] + exprs = [translate_expr(visitor, n=e) for e in node.expr] + return ir.Select(schema, inp, cse_exprs, exprs) + + +@_translate_ir.register +def _(node: pl_ir.GroupBy, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: + with set_node(visitor, node.input): + inp = translate_ir(visitor, n=None) + aggs = [translate_expr(visitor, n=e) for e in node.aggs] + keys = [translate_expr(visitor, n=e) for e in node.keys] + return ir.GroupBy( + schema, + inp, + aggs, + keys, + node.maintain_order, + node.options, + ) + + +@_translate_ir.register +def _(node: pl_ir.Join, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: + # Join key dtypes are dependent on the schema of the left and + # right inputs, so these must be translated with the relevant + # input active. 
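+    # For example, a key expression like `pl.col("a")` may have a
+    # different dtype on each side, so it must be resolved against the
+    # schema of its own input.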
+ with set_node(visitor, node.input_left): + inp_left = translate_ir(visitor, n=None) + left_on = [translate_expr(visitor, n=e) for e in node.left_on] + with set_node(visitor, node.input_right): + inp_right = translate_ir(visitor, n=None) + right_on = [translate_expr(visitor, n=e) for e in node.right_on] + return ir.Join(schema, inp_left, inp_right, left_on, right_on, node.options) + + +@_translate_ir.register +def _(node: pl_ir.HStack, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: + with set_node(visitor, node.input): + inp = translate_ir(visitor, n=None) + cse_exprs = [translate_expr(visitor, n=e) for e in node.cse_exprs] + exprs = [translate_expr(visitor, n=e) for e in node.exprs] + return ir.HStack(schema, inp, cse_exprs, exprs) + + +@_translate_ir.register +def _(node: pl_ir.Reduce, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: + with set_node(visitor, node.input): + inp = translate_ir(visitor, n=None) + exprs = [translate_expr(visitor, n=e) for e in node.expr] + return ir.Reduce(schema, inp, exprs) + + +@_translate_ir.register +def _(node: pl_ir.Distinct, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: + return ir.Distinct( + schema, + translate_ir(visitor, n=node.input), + node.options, + ) + + +@_translate_ir.register +def _(node: pl_ir.Sort, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: + with set_node(visitor, node.input): + inp = translate_ir(visitor, n=None) + by = [translate_expr(visitor, n=e) for e in node.by_column] + return ir.Sort(schema, inp, by, node.sort_options, node.slice) + + +@_translate_ir.register +def _(node: pl_ir.Slice, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: + return ir.Slice(schema, translate_ir(visitor, n=node.input), node.offset, node.len) + + +@_translate_ir.register +def _(node: pl_ir.Filter, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: + with set_node(visitor, node.input): + inp = translate_ir(visitor, n=None) + mask = translate_expr(visitor, n=node.predicate) + return ir.Filter(schema, inp, mask) + + +@_translate_ir.register +def _( + node: pl_ir.SimpleProjection, visitor: Any, schema: dict[str, plc.DataType] +) -> ir.IR: + return ir.Projection(schema, translate_ir(visitor, n=node.input)) + + +@_translate_ir.register +def _(node: pl_ir.MapFunction, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: + name, *options = node.function + return ir.MapFunction( + schema, + # TODO: merge_sorted breaks this pattern + translate_ir(visitor, n=node.input), + name, + options, + ) + + +@_translate_ir.register +def _(node: pl_ir.Union, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: + return ir.Union( + schema, [translate_ir(visitor, n=n) for n in node.inputs], node.options + ) + + +@_translate_ir.register +def _(node: pl_ir.HConcat, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: + return ir.HConcat(schema, [translate_ir(visitor, n=n) for n in node.inputs]) + + +@_translate_ir.register +def _(node: pl_ir.ExtContext, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: + return ir.ExtContext( + schema, + translate_ir(visitor, n=node.input), + [translate_ir(visitor, n=n) for n in node.contexts], + ) + + +def translate_ir(visitor: Any, *, n: int | None = None) -> ir.IR: + """ + Translate a polars-internal IR node to our representation. + + Parameters + ---------- + visitor + Polars NodeTraverser object + n + Optional node to start traversing from, if not provided uses + current polars-internal node. 
+ + Returns + ------- + Translated IR object + + Raises + ------ + NotImplementedError if we can't translate the nodes due to + unsupported functionality. + """ + ctx: AbstractContextManager = ( + set_node(visitor, n) if n is not None else noop_context + ) + with ctx: + node = visitor.view_current_node() + schema = {k: dtypes.from_polars(v) for k, v in visitor.get_schema().items()} + return _translate_ir(node, visitor, schema) + + +@singledispatch +def _translate_expr(node: Any, visitor: Any, dtype: plc.DataType) -> expr.Expr: + raise NotImplementedError(f"Translation for {type(node).__name__}") + + +@_translate_expr.register +def _(node: pl_expr.PyExprIR, visitor: Any, dtype: plc.DataType) -> expr.Expr: + e = translate_expr(visitor, n=node.node) + return expr.NamedExpr(dtype, node.output_name, e) + + +@_translate_expr.register +def _(node: pl_expr.Function, visitor: Any, dtype: plc.DataType) -> expr.Expr: + name, *options = node.function_data + options = tuple(options) + if isinstance(name, pl_expr.StringFunction): + return expr.StringFunction( + dtype, + name, + options, + *(translate_expr(visitor, n=n) for n in node.input), + ) + elif isinstance(name, pl_expr.BooleanFunction): + return expr.BooleanFunction( + dtype, + name, + options, + *(translate_expr(visitor, n=n) for n in node.input), + ) + else: + raise NotImplementedError(f"No handler for Expr function node with {name=}") + + +@_translate_expr.register +def _(node: pl_expr.Window, visitor: Any, dtype: plc.DataType) -> expr.Expr: + # TODO: raise in groupby? + if node.partition_by is None: + return expr.RollingWindow( + dtype, node.options, translate_expr(visitor, n=node.function) + ) + else: + return expr.GroupedRollingWindow( + dtype, + node.options, + translate_expr(visitor, n=node.function), + *(translate_expr(visitor, n=n) for n in node.partition_by), + ) + + +@_translate_expr.register +def _(node: pl_expr.Literal, visitor: Any, dtype: plc.DataType) -> expr.Expr: + return expr.Literal(dtype, node.value) + + +@_translate_expr.register +def _(node: pl_expr.Sort, visitor: Any, dtype: plc.DataType) -> expr.Expr: + # TODO: raise in groupby + return expr.Sort(dtype, node.options, translate_expr(visitor, n=node.expr)) + + +@_translate_expr.register +def _(node: pl_expr.SortBy, visitor: Any, dtype: plc.DataType) -> expr.Expr: + return expr.SortBy( + dtype, + node.sort_options, + translate_expr(visitor, n=node.expr), + *(translate_expr(visitor, n=n) for n in node.by), + ) + + +@_translate_expr.register +def _(node: pl_expr.Gather, visitor: Any, dtype: plc.DataType) -> expr.Expr: + return expr.Gather( + dtype, + translate_expr(visitor, n=node.expr), + translate_expr(visitor, n=node.idx), + ) + + +@_translate_expr.register +def _(node: pl_expr.Filter, visitor: Any, dtype: plc.DataType) -> expr.Expr: + return expr.Filter( + dtype, + translate_expr(visitor, n=node.input), + translate_expr(visitor, n=node.by), + ) + + +@_translate_expr.register +def _(node: pl_expr.Cast, visitor: Any, dtype: plc.DataType) -> expr.Expr: + inner = translate_expr(visitor, n=node.expr) + # Push casts into literals so we can handle Cast(Literal(Null)) + if isinstance(inner, expr.Literal): + return expr.Literal(dtype, inner.value) + else: + return expr.Cast(dtype, inner) + + +@_translate_expr.register +def _(node: pl_expr.Column, visitor: Any, dtype: plc.DataType) -> expr.Expr: + return expr.Col(dtype, node.name) + + +@_translate_expr.register +def _(node: pl_expr.Agg, visitor: Any, dtype: plc.DataType) -> expr.Expr: + return expr.Agg( + dtype, + node.name, + 
node.options,
+        translate_expr(visitor, n=node.arguments),
+    )
+
+
+@_translate_expr.register
+def _(node: pl_expr.BinaryExpr, visitor: Any, dtype: plc.DataType) -> expr.Expr:
+    return expr.BinOp(
+        dtype,
+        expr.BinOp._MAPPING[node.op],
+        translate_expr(visitor, n=node.left),
+        translate_expr(visitor, n=node.right),
+    )
+
+
+@_translate_expr.register
+def _(node: pl_expr.Len, visitor: Any, dtype: plc.DataType) -> expr.Expr:
+    return expr.Len(dtype)
+
+
+def translate_expr(visitor: Any, *, n: int | pl_expr.PyExprIR) -> expr.Expr:
+    """
+    Translate a polars-internal expression IR into our representation.
+
+    Parameters
+    ----------
+    visitor
+        Polars NodeTraverser object
+    n
+        Node to translate, either an integer referencing a polars
+        internal node, or a named expression node.
+
+    Returns
+    -------
+    Translated IR object.
+
+    Raises
+    ------
+    NotImplementedError if any translation fails due to unsupported functionality.
+    """
+    if isinstance(n, pl_expr.PyExprIR):
+        # TODO: type narrowing doesn't rule out int since PyExprIR is Unknown
+        assert not isinstance(n, int)
+        node = n
+        dtype = dtypes.from_polars(visitor.get_dtype(node.node))
+    else:
+        node = visitor.view_expression(n)
+        dtype = dtypes.from_polars(visitor.get_dtype(n))
+    return _translate_expr(node, visitor, dtype)
diff --git a/python/cudf_polars/cudf_polars/testing/__init__.py b/python/cudf_polars/cudf_polars/testing/__init__.py
new file mode 100644
index 00000000000..d0147e713f9
--- /dev/null
+++ b/python/cudf_polars/cudf_polars/testing/__init__.py
@@ -0,0 +1,8 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Testing utilities for cudf_polars."""
+
+from __future__ import annotations
+
+__all__: list[str] = []
diff --git a/python/cudf_polars/cudf_polars/testing/asserts.py b/python/cudf_polars/cudf_polars/testing/asserts.py
new file mode 100644
index 00000000000..a6e26a6425c
--- /dev/null
+++ b/python/cudf_polars/cudf_polars/testing/asserts.py
@@ -0,0 +1,76 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Device-aware assertions."""
+
+from __future__ import annotations
+
+from functools import partial
+from typing import TYPE_CHECKING
+
+from polars.testing.asserts import assert_frame_equal
+
+from cudf_polars.callback import execute_with_cudf
+
+if TYPE_CHECKING:
+    import polars as pl
+
+__all__: list[str] = ["assert_gpu_result_equal"]
+
+
+def assert_gpu_result_equal(
+    lazydf: pl.LazyFrame,
+    *,
+    check_row_order: bool = True,
+    check_column_order: bool = True,
+    check_dtype: bool = True,
+    check_exact: bool = True,
+    rtol: float = 1e-05,
+    atol: float = 1e-08,
+    categorical_as_str: bool = False,
+):
+    """
+    Assert that collection of a lazyframe on GPU produces correct results.
+
+    Parameters
+    ----------
+    lazydf
+        Frame to collect.
+    check_row_order
+        Expect rows to be in same order
+    check_column_order
+        Expect columns to be in same order
+    check_dtype
+        Expect dtypes to match
+    check_exact
+        Require exact equality for floats, if `False` compare using
+        rtol and atol.
+    rtol
+        Relative tolerance for float comparisons
+    atol
+        Absolute tolerance for float comparisons
+    categorical_as_str
+        Cast categoricals to strings before comparing
+
+    Raises
+    ------
+    AssertionError
+        If the GPU and CPU collection do not match.
+    NotImplementedError
+        If GPU collection failed in some way.
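+
+    Examples
+    --------
+    >>> import polars as pl
+    >>> q = pl.LazyFrame({"a": [1, 2, 3]}).select(pl.col("a").sum())
+    >>> assert_gpu_result_equal(q)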
+ """ + expect = lazydf.collect() + got = lazydf.collect( + post_opt_callback=partial(execute_with_cudf, raise_on_fail=True) + ) + assert_frame_equal( + expect, + got, + check_row_order=check_row_order, + check_column_order=check_column_order, + check_dtype=check_dtype, + check_exact=check_exact, + rtol=rtol, + atol=atol, + categorical_as_str=categorical_as_str, + ) diff --git a/python/cudf_polars/cudf_polars/utils/__init__.py b/python/cudf_polars/cudf_polars/utils/__init__.py new file mode 100644 index 00000000000..6018209e1e8 --- /dev/null +++ b/python/cudf_polars/cudf_polars/utils/__init__.py @@ -0,0 +1,8 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 + +"""Utilities.""" + +from __future__ import annotations + +__all__: list[str] = [] diff --git a/python/cudf_polars/cudf_polars/utils/dtypes.py b/python/cudf_polars/cudf_polars/utils/dtypes.py new file mode 100644 index 00000000000..51379433c03 --- /dev/null +++ b/python/cudf_polars/cudf_polars/utils/dtypes.py @@ -0,0 +1,89 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 + +"""Datatype utilities.""" + +from __future__ import annotations + +from functools import cache + +from typing_extensions import assert_never + +import polars as pl + +import cudf._lib.pylibcudf as plc + + +@cache +def from_polars(dtype: pl.DataType) -> plc.DataType: + """ + Convert a polars datatype to a pylibcudf one. + + Parameters + ---------- + dtype + Polars dtype to convert + + Returns + ------- + Matching pylibcudf DataType object. + + Raises + ------ + NotImplementedError for unsupported conversions. + """ + if isinstance(dtype, pl.Boolean): + return plc.DataType(plc.TypeId.BOOL8) + elif isinstance(dtype, pl.Int8): + return plc.DataType(plc.TypeId.INT8) + elif isinstance(dtype, pl.Int16): + return plc.DataType(plc.TypeId.INT16) + elif isinstance(dtype, pl.Int32): + return plc.DataType(plc.TypeId.INT32) + elif isinstance(dtype, pl.Int64): + return plc.DataType(plc.TypeId.INT64) + if isinstance(dtype, pl.UInt8): + return plc.DataType(plc.TypeId.UINT8) + elif isinstance(dtype, pl.UInt16): + return plc.DataType(plc.TypeId.UINT16) + elif isinstance(dtype, pl.UInt32): + return plc.DataType(plc.TypeId.UINT32) + elif isinstance(dtype, pl.UInt64): + return plc.DataType(plc.TypeId.UINT64) + elif isinstance(dtype, pl.Float32): + return plc.DataType(plc.TypeId.FLOAT32) + elif isinstance(dtype, pl.Float64): + return plc.DataType(plc.TypeId.FLOAT64) + elif isinstance(dtype, pl.Date): + return plc.DataType(plc.TypeId.TIMESTAMP_DAYS) + elif isinstance(dtype, pl.Time): + raise NotImplementedError("Time of day dtype not implemented") + elif isinstance(dtype, pl.Datetime): + if dtype.time_zone is not None: + raise NotImplementedError("Time zone support") + if dtype.time_unit == "ms": + return plc.DataType(plc.TypeId.TIMESTAMP_MILLISECONDS) + elif dtype.time_unit == "us": + return plc.DataType(plc.TypeId.TIMESTAMP_MICROSECONDS) + elif dtype.time_unit == "ns": + return plc.DataType(plc.TypeId.TIMESTAMP_NANOSECONDS) + assert dtype.time_unit is not None + assert_never(dtype.time_unit) + elif isinstance(dtype, pl.Duration): + if dtype.time_unit == "ms": + return plc.DataType(plc.TypeId.DURATION_MILLISECONDS) + elif dtype.time_unit == "us": + return plc.DataType(plc.TypeId.DURATION_MICROSECONDS) + elif dtype.time_unit == "ns": + return plc.DataType(plc.TypeId.DURATION_NANOSECONDS) + assert dtype.time_unit is not None + 
assert_never(dtype.time_unit)
+    elif isinstance(dtype, pl.String):
+        return plc.DataType(plc.TypeId.STRING)
+    elif isinstance(dtype, pl.Null):
+        # TODO: Hopefully
+        return plc.DataType(plc.TypeId.EMPTY)
+    elif isinstance(dtype, pl.List):
+        return plc.DataType(plc.TypeId.LIST)
+    else:
+        raise NotImplementedError(f"{dtype=} conversion not supported")
diff --git a/python/cudf_polars/cudf_polars/utils/sorting.py b/python/cudf_polars/cudf_polars/utils/sorting.py
new file mode 100644
index 00000000000..b3ecfdd3dd4
--- /dev/null
+++ b/python/cudf_polars/cudf_polars/utils/sorting.py
@@ -0,0 +1,49 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Sorting utilities."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+import cudf._lib.pylibcudf as plc
+
+if TYPE_CHECKING:
+    from collections.abc import Sequence
+
+
+def sort_order(
+    descending: Sequence[bool], *, nulls_last: bool, num_keys: int
+) -> tuple[list[plc.types.Order], list[plc.types.NullOrder]]:
+    """
+    Produce sort order arguments.
+
+    Parameters
+    ----------
+    descending
+        List indicating order for each column
+    nulls_last
+        Should nulls sort last or first?
+    num_keys
+        Number of sort keys
+
+    Returns
+    -------
+    tuple of column_order and null_precedence
+    suitable for passing to sort routines
+    """
+    # Mimicking polars broadcast handling of descending
+    if num_keys > (n := len(descending)) and n == 1:
+        descending = [descending[0]] * num_keys
+    column_order = [
+        plc.types.Order.DESCENDING if d else plc.types.Order.ASCENDING
+        for d in descending
+    ]
+    null_precedence = []
+    for asc in column_order:
+        if (asc == plc.types.Order.ASCENDING) ^ (not nulls_last):
+            null_precedence.append(plc.types.NullOrder.AFTER)
+        elif (asc == plc.types.Order.ASCENDING) ^ nulls_last:
+            null_precedence.append(plc.types.NullOrder.BEFORE)
+    return column_order, null_precedence
diff --git a/python/cudf_polars/docs/overview.md b/python/cudf_polars/docs/overview.md
new file mode 100644
index 00000000000..cbf012f5881
--- /dev/null
+++ b/python/cudf_polars/docs/overview.md
@@ -0,0 +1,174 @@
+# Getting started
+
+You will need:
+
+1. Rust development environment. If you use the rapids [combined
+   devcontainer](https://github.com/rapidsai/devcontainers/), add
+   `"./features/src/rust": {"version": "latest", "profile": "default"},` to your
+   preferred configuration. Otherwise, use
+   [rustup](https://www.rust-lang.org/tools/install).
+2. A [cudf development
+   environment](https://github.com/rapidsai/cudf/blob/branch-24.08/CONTRIBUTING.md#setting-up-your-build-environment).
+   The combined devcontainer works, or whatever your favourite approach is.
+
+> [!NOTE]
+> These instructions will get simpler as we merge code in.
+
+## Installing polars
+
+We will need to build polars from source. Until things settle down,
+we live at `HEAD`.
+
+```sh
+git clone https://github.com/pola-rs/polars
+cd polars
+```
+
+We will install build dependencies in the same environment that we created for
+building cudf. Note that polars offers a `make build` command that sets up a
+separate virtual environment, but we don't want to do that right now. So in the
polars clone:
+
+```sh
+# cudf environment (conda or pip) is active
+pip install --upgrade uv
+uv pip install --upgrade -r py-polars/requirements-dev.txt
+```
+
+Now we have the necessary machinery to build polars:
+```sh
+cd py-polars
+# build in debug mode, best option for development/debugging
+maturin develop -m Cargo.toml
+```
+
+For benchmarking purposes, we should build in release mode:
+```sh
+RUSTFLAGS='-C target-cpu=native' maturin develop -m Cargo.toml --release
+```
+
+After any update of the polars code, we need to rerun the `maturin` build
+command.
+
+## Installing the cudf polars executor
+
+The executor for the polars logical plan lives in the cudf repo, in
+`python/cudf_polars`. Build cudf as normal and then install the
+`cudf_polars` package in editable mode:
+
+```sh
+cd cudf/python/cudf_polars
+pip install --no-deps -e .
+```
+
+You should now be able to run the tests in the `cudf_polars` package:
+```sh
+pytest -v tests
+```
+
+# Executor design
+
+The polars `LazyFrame.collect` functionality offers a
+"post-optimization" callback that may be used by a third-party library
+to replace a node (or more, though we only replace a single node) in the
+optimized logical plan with a Python callback that will deliver the
+result of evaluating the plan. This splits the execution of the plan
+into two phases. First, a symbolic phase which translates to our
+internal representation (IR). Second, an execution phase which executes
+using our IR.
+
+The translation phase receives a low-level Rust `NodeTraverser`
+object which delivers Python representations of the plan nodes (and
+expressions) one at a time. During translation, we endeavour to raise
+`NotImplementedError` for any unsupported functionality. This way, if
+we can't execute something, we just don't modify the logical plan at
+all: if we can translate the IR, it is assumed that evaluation will
+later succeed.
+
+The usage of the cudf-based executor is therefore, at present:
+
+```python
+from cudf_polars.callback import execute_with_cudf
+
+result = q.collect(post_opt_callback=execute_with_cudf)
+```
+
+This should either transparently run on the GPU and deliver a polars
+dataframe, or else fail (but be handled) and just run the normal CPU
+execution.
+
+## Adding a handler for a new plan node
+
+Plan node definitions live in `cudf_polars/dsl/ir.py`; these are
+`dataclasses` that inherit from the base `IR` node. The evaluation of
+a plan node is done by implementing the `evaluate` method.
+
+To translate the plan node, add a case handler in `translate_ir` which
+lives in `cudf_polars/dsl/translate.py`.
+
+As well as child nodes that are plans, most plan nodes contain child
+expressions, which should be transformed using the input to the plan as a
+context. The translation of expressions is handled via
+`translate_expr` in `cudf_polars/dsl/translate.py`. So that data-type
+resolution is performed correctly, any expression should be translated
+with the correct plan node "active" in the visitor. For example, when
+translating a `Join` node, the left keys (expressions) should be
+translated with the left input active (and right keys with right
+input). To facilitate this, use the `set_node` context manager.
+
+## Adding a handler for a new expression node
+
+Adding a handler for an expression node is very similar to adding one
+for a plan node. Expressions are all defined in `cudf_polars/dsl/expr.py`
+and inherit from `Expr`; a sketch of the translation half is shown below.
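+
+As a sketch, a translation handler for a hypothetical `Reverse`
+expression (both `pl_expr.Reverse` and `expr.Reverse` are illustrative
+names, not existing API) would register on the same `singledispatch`
+translator used by the real handlers in `cudf_polars/dsl/translate.py`:
+
+```python
+@_translate_expr.register
+def _(node: pl_expr.Reverse, visitor: Any, dtype: plc.DataType) -> expr.Expr:
+    # dtype was resolved by translate_expr before dispatching here;
+    # child expressions are translated recursively.
+    return expr.Reverse(dtype, translate_expr(visitor, n=node.expr))
+```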
Unlike plan nodes, these are not `dataclasses`, since it
+is simpler for us to implement efficient hashing, repr, and equality if we
+can write that ourselves.
+
+Every expression consists of two types of data:
+1. child data (other `Expr`s)
+2. non-child data (anything other than an `Expr`)
+
+The generic implementations of special methods in the base `Expr`
+class require that the subclasses advertise which arguments to the
+constructor are non-child in a `_non_child` class slot. The
+constructor should then take arguments:
+```python
+def __init__(self, *args: Any):
+    # args: the non-child data, in the order given by `_non_child`,
+    # followed by the child expressions
+```
+Read the docstrings in the `Expr` class for more details.
+
+Expressions are evaluated by implementing a `do_evaluate` method that
+takes a `DataFrame` as context (this provides columns) along with an
+`ExecutionContext` parameter (indicating what context we're evaluating
+this expression in, currently unused) and a `mapping` from
+expressions to evaluated `Column`s. This approach enables a simple form
+of expression rewriting during evaluation, which is used, for example,
+when evaluating groupby-aggregations. To perform the
+evaluation, one should use the base class (generic) `evaluate` method
+which handles the boilerplate for looking up in the substitution
+`mapping`.
+
+To simplify state tracking, all columns should be considered immutable
+on construction. This matches the "functional" description coming from
+the logical plan in any case, so is reasonably natural.
+
+# Containers
+
+Containers should be constructed as relatively lightweight objects
+around their pylibcudf counterparts. We have three (in
+`cudf_polars/containers/`):
+
+1. Scalar (a wrapper around a pylibcudf Scalar)
+2. Column (a wrapper around a pylibcudf Column)
+3. DataFrame (a wrapper around a pylibcudf Table)
+
+The interfaces offered by these are somewhat in flux, but broadly
+speaking, a `DataFrame` is just a list of `Column`s which each hold
+data plus a string `name`, along with a collection of `Scalar`s (this
+might go away).
+
+The columns keep track of metadata (for example, whether or not they
+are sorted).
+
+We offer some utility methods for transferring metadata when
+constructing new dataframes and columns: both `DataFrame` and `Column`
+offer a `with_metadata(*, like: Self)` call which copies metadata from
+the template.
diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml
index 86b0ad414fd..49ecd7080b9 100644
--- a/python/cudf_polars/pyproject.toml
+++ b/python/cudf_polars/pyproject.toml
@@ -20,7 +20,7 @@ license = { text = "Apache 2.0" }
 requires-python = ">=3.9"
 dependencies = [
     "cudf==24.8.*,>=0.0.0a0",
-    "polars>=0.20.24",
+    "polars>=0.20.30",
 ]  # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
classifiers = [ "Intended Audience :: Developers", @@ -52,6 +52,9 @@ version = {file = "cudf_polars/VERSION"} [tool.setuptools.packages.find] exclude = ["*tests*"] +[tool.pytest.ini_options] +xfail_strict = true + [tool.ruff] line-length = 88 indent-width = 4 @@ -130,6 +133,9 @@ ignore = [ ] fixable = ["ALL"] +[tool.ruff.lint.per-file-ignores] +"**/tests/**/test_*.py" = ["D", "INP"] + [tool.ruff.lint.flake8-pytest-style] # https://docs.astral.sh/ruff/settings/#lintflake8-pytest-style fixture-parentheses = false @@ -175,3 +181,5 @@ docstring-code-format = true build-backend = "setuptools.build_meta" commit-file = "cudf_polars/GIT_COMMIT" dependencies-file = "../../dependencies.yaml" +# Pure python +disable-cuda = true diff --git a/python/cudf_polars/tests/expressions/test_agg.py b/python/cudf_polars/tests/expressions/test_agg.py new file mode 100644 index 00000000000..c792ae64f74 --- /dev/null +++ b/python/cudf_polars/tests/expressions/test_agg.py @@ -0,0 +1,63 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +import pytest + +import polars as pl + +from cudf_polars.dsl import expr +from cudf_polars.testing.asserts import assert_gpu_result_equal + + +@pytest.fixture(params=sorted(expr.Agg._SUPPORTED)) +def agg(request): + return request.param + + +@pytest.fixture(params=[pl.Int32, pl.Float32, pl.Int16]) +def dtype(request): + return request.param + + +@pytest.fixture(params=[False, True], ids=["no-nulls", "with-nulls"]) +def with_nulls(request): + return request.param + + +@pytest.fixture( + params=[ + False, + pytest.param(True, marks=pytest.mark.xfail(reason="No handler for set_sorted")), + ], + ids=["unsorted", "sorted"], +) +def is_sorted(request): + return request.param + + +@pytest.fixture +def df(dtype, with_nulls, is_sorted): + values = [-10, 4, 5, 2, 3, 6, 8, 9, 4, 4, 5, 2, 3, 7, 3, 6, -10, -11] + if with_nulls: + values = [None if v % 5 == 0 else v for v in values] + + if is_sorted: + values = sorted(values, key=lambda x: -1000 if x is None else x) + + df = pl.LazyFrame({"a": values}, schema={"a": dtype}) + if is_sorted: + return df.set_sorted("a") + return df + + +def test_agg(df, agg): + expr = getattr(pl.col("a"), agg)() + q = df.select(expr) + + # https://github.com/rapidsai/cudf/issues/15852 + check_dtype = agg not in {"count", "n_unique", "median"} + if not check_dtype and q.schema["a"] != pl.Float64: + with pytest.raises(AssertionError): + assert_gpu_result_equal(q) + assert_gpu_result_equal(q, check_dtype=check_dtype, check_exact=False) diff --git a/python/cudf_polars/tests/expressions/test_filter.py b/python/cudf_polars/tests/expressions/test_filter.py new file mode 100644 index 00000000000..783403d764c --- /dev/null +++ b/python/cudf_polars/tests/expressions/test_filter.py @@ -0,0 +1,20 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +import polars as pl + +from cudf_polars.testing.asserts import assert_gpu_result_equal + + +def test_filter(): + ldf = pl.DataFrame( + { + "a": [1, 2, 3, 4, 5, 6, 7], + "b": [1, 1, 1, 1, 1, 1, 1], + } + ).lazy() + + # group-by is just to avoid the filter being pushed into the scan. 
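+    # (if it were pushed down, no Filter IR node would be left for the
+    # executor to exercise)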
+ query = ldf.group_by(pl.col("a")).agg(pl.col("b").sum()).filter(pl.col("b") < 1) + assert_gpu_result_equal(query) diff --git a/python/cudf_polars/tests/expressions/test_gather.py b/python/cudf_polars/tests/expressions/test_gather.py new file mode 100644 index 00000000000..df33e19a0b6 --- /dev/null +++ b/python/cudf_polars/tests/expressions/test_gather.py @@ -0,0 +1,19 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +import polars as pl + +from cudf_polars.testing.asserts import assert_gpu_result_equal + + +def test_gather(): + ldf = pl.LazyFrame( + { + "a": [1, 2, 3, 4, 5, 6, 7], + "b": [0, 3, 1, 5, 6, 1, 0], + } + ) + + query = ldf.select(pl.col("a").gather(pl.col("b"))) + assert_gpu_result_equal(query) diff --git a/python/cudf_polars/tests/expressions/test_numeric_binops.py b/python/cudf_polars/tests/expressions/test_numeric_binops.py new file mode 100644 index 00000000000..548aebf0875 --- /dev/null +++ b/python/cudf_polars/tests/expressions/test_numeric_binops.py @@ -0,0 +1,106 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +import pytest + +import polars as pl + +from cudf_polars.testing.asserts import assert_gpu_result_equal + +dtypes = [ + pl.Int8, + pl.Int16, + pl.Int64, + pl.UInt8, + pl.UInt64, + pl.Float32, + pl.Float64, +] + + +@pytest.fixture(params=dtypes) +def ltype(request): + return request.param + + +@pytest.fixture(params=dtypes) +def rtype(request): + return request.param + + +@pytest.fixture(params=[False, True], ids=["no_nulls", "nulls"]) +def with_nulls(request): + return request.param + + +@pytest.fixture( + params=[ + pl.Expr.eq, + pl.Expr.eq_missing, + pl.Expr.ne, + pl.Expr.ne_missing, + pl.Expr.lt, + pl.Expr.le, + pl.Expr.gt, + pl.Expr.ge, + pl.Expr.add, + pl.Expr.sub, + pl.Expr.mul, + pl.Expr.truediv, + pl.Expr.floordiv, + pl.Expr.mod, + ], + ids=lambda fn: fn.__name__, +) +def binop(request): + return request.param + + +@pytest.fixture +def df(request, ltype, rtype, with_nulls, binop): + a = [1, 2, 3, 5, 8] + if with_nulls: + a[2] = None + a[-1] = None + b = [10, 20, 30, 50, 0] + if with_nulls: + b[1] = None + b[3] = None + b[-1] = None + + lkind = ( + "i" + if ltype.is_signed_integer() + else ("u" if ltype.is_unsigned_integer() else "f") + ) + rkind = ( + "i" + if rtype.is_signed_integer() + else ("u" if rtype.is_unsigned_integer() else "f") + ) + if ( + not with_nulls + and binop.__name__ in {"floordiv", "mod"} + # This catches the case where the result is not promoted to float. + and ( + (lkind == rkind and lkind in {"i", "u"}) + or ({lkind, rkind} == {"i", "u"} and pl.UInt64 not in {ltype, rtype}) + ) + ): + request.applymarker( + pytest.mark.xfail( + reason="Polars nullifies division by zero for integral types" + ) + ) + + return pl.LazyFrame({"a": a, "b": b}, schema={"a": ltype, "b": rtype}) + + +def test_numeric_binop(df, binop): + left = pl.col("a") + right = pl.col("b") + + q = df.select(binop(left, right)) + + assert_gpu_result_equal(q) diff --git a/python/cudf_polars/tests/test_distinct.py b/python/cudf_polars/tests/test_distinct.py new file mode 100644 index 00000000000..d42c4a96f5a --- /dev/null +++ b/python/cudf_polars/tests/test_distinct.py @@ -0,0 +1,30 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +import pytest + +import polars as pl + +from cudf_polars.testing.asserts import assert_gpu_result_equal + + +@pytest.mark.parametrize("subset", [None, ["a"], ["a", "b"], ["b", "c"], ["c", "a"]]) +@pytest.mark.parametrize("keep", ["any", "none", "first", "last"]) +@pytest.mark.parametrize("maintain_order", [False, True], ids=["unstable", "stable"]) +@pytest.mark.parametrize("pre_sorted", [False, True], ids=["unsorted", "sorted"]) +def test_distinct(subset, keep, maintain_order, pre_sorted): + ldf = pl.DataFrame( + { + "a": [1, 2, 1, 3, 5, None, None], + "b": [1.5, 2.5, None, 1.5, 3, float("nan"), 3], + "c": [True, True, True, True, False, False, True], + } + ).lazy() + if pre_sorted: + keys = ["a", "b", "c"] if subset is None else subset + descending = False if len(keys) == 1 else [False, True, True][: len(keys)] + ldf = ldf.sort(*keys, descending=descending) + + query = ldf.unique(subset=subset, keep=keep, maintain_order=maintain_order) + assert_gpu_result_equal(query, check_row_order=maintain_order) diff --git a/python/cudf_polars/tests/test_extcontext.py b/python/cudf_polars/tests/test_extcontext.py new file mode 100644 index 00000000000..9daf88b4338 --- /dev/null +++ b/python/cudf_polars/tests/test_extcontext.py @@ -0,0 +1,23 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +import pytest + +import polars as pl + +from cudf_polars.testing.asserts import assert_gpu_result_equal + + +def test_extcontext(): + ldf = pl.DataFrame( + { + "a": [1, 2, 3, 4, 5, 6, 7], + "b": [1, 1, 1, 1, 1, 1, 1], + } + ).lazy() + ldf2 = ldf.select((pl.col("b") + pl.col("a")).alias("c")) + query = ldf.with_context(ldf2).select(pl.col("b"), pl.col("c")) + with pytest.raises(pl.exceptions.ComputeError): + # ExtContext to be deprecated so we're not implementing it. + assert_gpu_result_equal(query) diff --git a/python/cudf_polars/tests/test_groupby.py b/python/cudf_polars/tests/test_groupby.py new file mode 100644 index 00000000000..d06a7ecf105 --- /dev/null +++ b/python/cudf_polars/tests/test_groupby.py @@ -0,0 +1,78 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0
+from __future__ import annotations
+
+import pytest
+
+import polars as pl
+
+from cudf_polars.testing.asserts import assert_gpu_result_equal
+
+
+@pytest.fixture
+def df():
+    return pl.LazyFrame(
+        {
+            "key1": [1, 1, 1, 2, 3, 1, 4, 6, 7],
+            "key2": [2, 2, 2, 2, 6, 1, 4, 6, 8],
+            "int": [1, 2, 3, 4, 5, 6, 7, 8, 9],
+            "float": [7.0, 1, 2, 3, 4, 5, 6, 7, 8],
+        }
+    )
+
+
+@pytest.fixture(
+    params=[
+        ["key1"],
+        ["key2"],
+        [pl.col("key1") * pl.col("key2")],
+        ["key1", "key2"],
+        [pl.col("key1") == pl.col("key2")],
+        ["key2", pl.col("key1") == pl.lit(1, dtype=pl.Int64)],
+    ],
+    ids=lambda keys: "-".join(map(str, keys)),
+)
+def keys(request):
+    return request.param
+
+
+@pytest.fixture(
+    params=[
+        ["int"],
+        ["float", "int"],
+        [pl.col("float") + pl.col("int")],
+        [pl.col("float").max() - pl.col("int").min()],
+        [pl.col("float").mean(), pl.col("int").std()],
+    ],
+    ids=lambda aggs: "-".join(map(str, aggs)),
+)
+def exprs(request):
+    return request.param
+
+
+@pytest.fixture(
+    params=[
+        False,
+        pytest.param(
+            True,
+            marks=pytest.mark.xfail(
+                reason="Maintaining order in groupby not implemented"
+            ),
+        ),
+    ],
+    ids=["no_maintain_order", "maintain_order"],
+)
+def maintain_order(request):
+    return request.param
+
+
+def test_groupby(df: pl.LazyFrame, maintain_order, keys, exprs):
+    q = df.group_by(*keys, maintain_order=maintain_order).agg(*exprs)
+
+    if not maintain_order:
+        sort_keys = list(q.schema.keys())[: len(keys)]
+        q = q.sort(*sort_keys)
+
+    assert_gpu_result_equal(q, check_exact=False)
diff --git a/python/cudf_polars/tests/test_hconcat.py b/python/cudf_polars/tests/test_hconcat.py
new file mode 100644
index 00000000000..46cbb21b25a
--- /dev/null
+++ b/python/cudf_polars/tests/test_hconcat.py
@@ -0,0 +1,19 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+from __future__ import annotations
+
+import polars as pl
+
+from cudf_polars.testing.asserts import assert_gpu_result_equal
+
+
+def test_hconcat():
+    ldf = pl.DataFrame(
+        {
+            "a": [1, 2, 3, 4, 5, 6, 7],
+            "b": [1, 1, 1, 1, 1, 1, 1],
+        }
+    ).lazy()
+    ldf2 = ldf.select((pl.col("a") + pl.col("b")).alias("c"))
+    query = pl.concat([ldf, ldf2], how="horizontal")
+    assert_gpu_result_equal(query)
diff --git a/python/cudf_polars/tests/test_hstack.py b/python/cudf_polars/tests/test_hstack.py
new file mode 100644
index 00000000000..b8c97f4607f
--- /dev/null
+++ b/python/cudf_polars/tests/test_hstack.py
@@ -0,0 +1,32 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +import polars as pl + +from cudf_polars.testing.asserts import assert_gpu_result_equal + + +def test_hstack(): + ldf = pl.DataFrame( + { + "a": [1, 2, 3, 4, 5, 6, 7], + "b": [1, 1, 1, 1, 1, 1, 1], + } + ).lazy() + + query = ldf.with_columns(pl.col("a") + pl.col("b")) + assert_gpu_result_equal(query) + + +def test_hstack_with_cse(): + ldf = pl.DataFrame( + { + "a": [1, 2, 3, 4, 5, 6, 7], + "b": [1, 1, 1, 1, 1, 1, 1], + } + ).lazy() + + expr = pl.col("a") + pl.col("b") + query = ldf.with_columns(expr.alias("c"), expr.alias("d") * 2) + assert_gpu_result_equal(query) diff --git a/python/cudf_polars/tests/test_join.py b/python/cudf_polars/tests/test_join.py new file mode 100644 index 00000000000..f4a4704f3cc --- /dev/null +++ b/python/cudf_polars/tests/test_join.py @@ -0,0 +1,57 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +import pytest + +import polars as pl + +from cudf_polars.testing.asserts import assert_gpu_result_equal + + +@pytest.mark.parametrize( + "how", + [ + "inner", + "left", + "semi", + "anti", + pytest.param( + "cross", + marks=pytest.mark.xfail(reason="cross join not implemented"), + ), + "full", + ], +) +@pytest.mark.parametrize("coalesce", [False, True]) +@pytest.mark.parametrize( + "join_nulls", [False, True], ids=["nulls_not_equal", "nulls_equal"] +) +@pytest.mark.parametrize( + "join_expr", + [ + pl.col("a"), + pl.col("a") * 2, + [pl.col("a"), pl.col("c") + 1], + ["c", "a"], + ], +) +def test_join(how, coalesce, join_nulls, join_expr): + left = pl.DataFrame( + { + "a": [1, 2, 3, 1, None], + "b": [1, 2, 3, 4, 5], + "c": [2, 3, 4, 5, 6], + } + ).lazy() + right = pl.DataFrame( + { + "a": [1, 4, 3, 7, None, None], + "c": [2, 3, 4, 5, 6, 7], + } + ).lazy() + + query = left.join( + right, on=join_expr, how=how, join_nulls=join_nulls, coalesce=coalesce + ) + assert_gpu_result_equal(query, check_row_order=False) diff --git a/python/cudf_polars/tests/test_scan.py b/python/cudf_polars/tests/test_scan.py new file mode 100644 index 00000000000..b75e1bdef10 --- /dev/null +++ b/python/cudf_polars/tests/test_scan.py @@ -0,0 +1,98 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0
+from __future__ import annotations
+
+import pytest
+
+import polars as pl
+
+from cudf_polars.testing.asserts import assert_gpu_result_equal
+
+
+@pytest.fixture(
+    params=[
+        (None, None),
+        pytest.param(
+            ("row-index", 0),
+            marks=pytest.mark.xfail(reason="Incorrect dtype for row index"),
+        ),
+        pytest.param(
+            ("index", 10),
+            marks=pytest.mark.xfail(reason="Incorrect dtype for row index"),
+        ),
+    ],
+    ids=["no-row-index", "zero-offset-row-index", "offset-row-index"],
+)
+def row_index(request):
+    return request.param
+
+
+@pytest.fixture(
+    params=[
+        (None, 0),
+        pytest.param(
+            (2, 1), marks=pytest.mark.xfail(reason="No handling of row limit in scan")
+        ),
+        pytest.param(
+            (3, 0), marks=pytest.mark.xfail(reason="No handling of row limit in scan")
+        ),
+    ],
+    ids=["all-rows", "n_rows-with-skip", "n_rows-no-skip"],
+)
+def n_rows_skip_rows(request):
+    return request.param
+
+
+@pytest.fixture(params=["csv", "parquet"])
+def df(request, tmp_path, row_index, n_rows_skip_rows):
+    df = pl.DataFrame(
+        {
+            "a": [1, 2, 3, None],
+            "b": ["ẅ", "x", "y", "z"],
+            "c": [None, None, 4, 5],
+        }
+    )
+    name, offset = row_index
+    n_rows, skip_rows = n_rows_skip_rows
+    if request.param == "csv":
+        df.write_csv(tmp_path / "file.csv")
+        return pl.scan_csv(
+            tmp_path / "file.csv",
+            row_index_name=name,
+            row_index_offset=offset,
+            skip_rows_after_header=skip_rows,
+            n_rows=n_rows,
+        )
+    else:
+        df.write_parquet(tmp_path / "file.pq")
+        # parquet doesn't have skip_rows argument
+        return pl.scan_parquet(
+            tmp_path / "file.pq",
+            row_index_name=name,
+            row_index_offset=offset,
+            n_rows=n_rows,
+        )
+
+
+@pytest.fixture(params=[None, ["a"], ["b", "a"]], ids=["all", "subset", "reordered"])
+def columns(request, row_index):
+    name, _ = row_index
+    if name is not None and request.param is not None:
+        return [*request.param, name]
+    return request.param
+
+
+@pytest.fixture(
+    params=[None, pl.col("c").is_not_null()], ids=["no-mask", "c-is-not-null"]
+)
+def mask(request):
+    return request.param
+
+
+def test_scan(df, columns, mask):
+    q = df
+    if mask is not None:
+        q = q.filter(mask)
+    if columns is not None:
+        q = q.select(*columns)
+    assert_gpu_result_equal(q)
diff --git a/python/cudf_polars/tests/test_select.py b/python/cudf_polars/tests/test_select.py
new file mode 100644
index 00000000000..503edef152e
--- /dev/null
+++ b/python/cudf_polars/tests/test_select.py
@@ -0,0 +1,38 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+from __future__ import annotations
+
+import polars as pl
+
+from cudf_polars.testing.asserts import assert_gpu_result_equal
+
+
+def test_select():
+    ldf = pl.DataFrame(
+        {
+            "a": [1, 2, 3, 4, 5, 6, 7],
+            "b": [1, 1, 1, 1, 1, 1, 1],
+        }
+    ).lazy()
+
+    query = ldf.select(
+        pl.col("a") + pl.col("b"), (pl.col("a") * 2 + pl.col("b")).alias("d")
+    )
+
+    assert_gpu_result_equal(query)
+
+
+def test_select_reduce():
+    ldf = pl.DataFrame(
+        {
+            "a": [1, 2, 3, 4, 5, 6, 7],
+            "b": [1, 1, 1, 1, 1, 1, 1],
+        }
+    ).lazy()
+
+    query = ldf.select(
+        (pl.col("a") + pl.col("b")).max(),
+        (pl.col("a") * 2 + pl.col("b")).alias("d").mean(),
+    )
+
+    assert_gpu_result_equal(query)
diff --git a/python/cudf_polars/tests/test_slice.py b/python/cudf_polars/tests/test_slice.py
new file mode 100644
index 00000000000..d27e91302ba
--- /dev/null
+++ b/python/cudf_polars/tests/test_slice.py
@@ -0,0 +1,34 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +import pytest + +import polars as pl + +from cudf_polars.testing.asserts import assert_gpu_result_equal + + +@pytest.mark.parametrize( + "offset", + [0, 1, 2], +) +@pytest.mark.parametrize( + "len", + [0, 2, 12], +) +def test_slice(offset, len): + ldf = pl.DataFrame( + { + "a": [1, 2, 3, 4, 5, 6, 7], + "b": [1, 1, 1, 1, 1, 1, 1], + } + ).lazy() + + query = ( + ldf.group_by(pl.col("a")) + .agg(pl.col("b").sum()) + .sort(by=pl.col("a")) + .slice(offset, len) + ) + assert_gpu_result_equal(query) diff --git a/python/cudf_polars/tests/test_sort.py b/python/cudf_polars/tests/test_sort.py new file mode 100644 index 00000000000..ecc02efd967 --- /dev/null +++ b/python/cudf_polars/tests/test_sort.py @@ -0,0 +1,42 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +import pytest + +import polars as pl + +from cudf_polars.testing.asserts import assert_gpu_result_equal + + +@pytest.mark.parametrize( + "sort_keys", + [ + (pl.col("a"),), + pytest.param( + (pl.col("d").abs(),), + marks=pytest.mark.xfail(reason="abs not yet implemented"), + ), + (pl.col("a"), pl.col("d")), + (pl.col("b"),), + ], +) +@pytest.mark.parametrize("nulls_last", [False, True]) +@pytest.mark.parametrize("maintain_order", [False, True], ids=["unstable", "stable"]) +def test_sort(sort_keys, nulls_last, maintain_order): + ldf = pl.DataFrame( + { + "a": [1, 2, 1, 3, 5, None, None], + "b": [1.5, 2.5, None, 1.5, 3, float("nan"), 3], + "c": [True, True, True, True, False, False, True], + "d": [1, 2, -1, 10, 6, -1, -7], + } + ).lazy() + + query = ldf.sort( + *sort_keys, + descending=True, + nulls_last=nulls_last, + maintain_order=maintain_order, + ) + assert_gpu_result_equal(query, check_row_order=maintain_order) diff --git a/python/cudf_polars/tests/test_union.py b/python/cudf_polars/tests/test_union.py new file mode 100644 index 00000000000..2c85bb15a55 --- /dev/null +++ b/python/cudf_polars/tests/test_union.py @@ -0,0 +1,37 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +import pytest + +import polars as pl + +from cudf_polars.testing.asserts import assert_gpu_result_equal + + +@pytest.mark.xfail(reason="Need handling of null scalars that are cast") +def test_union(): + ldf = pl.DataFrame( + { + "a": [1, 2, 3, 4, 5, 6, 7], + "b": [1, 1, 1, 1, 1, 1, 1], + } + ).lazy() + ldf2 = ldf.select((pl.col("a") + pl.col("b")).alias("c"), pl.col("a")) + query = pl.concat([ldf, ldf2], how="diagonal") + # Plan for this produces a `None`.astype(Int64) which we don't + # handle correctly right now + assert_gpu_result_equal(query) + + +def test_concat_vertical(): + ldf = pl.LazyFrame( + { + "a": [1, 2, 3, 4, 5, 6, 7], + "b": [1, 1, 1, 1, 1, 1, 1], + } + ) + ldf2 = ldf.select(pl.col("a"), pl.col("b") * 2 + pl.col("a")) + q = pl.concat([ldf, ldf2], how="vertical") + + assert_gpu_result_equal(q) From c268fc106169ae4d2fb4a78125cce724d1ee45b6 Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Thu, 30 May 2024 09:58:21 -0500 Subject: [PATCH 022/340] Update `pylibcudf` testing utilities (#15772) Cleans up some testing utilities for pylibcudf as suggested in https://github.com/rapidsai/cudf/pull/15418#discussion_r1603669456. 
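
For example, the equality helper now accepts the pylibcudf column and
the pyarrow array in either order (a sketch; `got` names a pylibcudf
Column produced by the code under test):

```python
expect = pa.array([1, 2, 3], type=pa.int32())
assert_column_eq(expect, got)  # or, equivalently
assert_column_eq(got, expect)
```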
Authors: - https://github.com/brandon-b-miller Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/15772 --- .../cudf/cudf/pylibcudf_tests/common/utils.py | 42 +++++++++++++------ .../test_column_from_device.py | 2 +- .../cudf/cudf/pylibcudf_tests/test_copying.py | 14 +++---- .../cudf/pylibcudf_tests/test_string_case.py | 6 +-- .../cudf/pylibcudf_tests/test_string_find.py | 18 ++++---- 5 files changed, 49 insertions(+), 33 deletions(-) diff --git a/python/cudf/cudf/pylibcudf_tests/common/utils.py b/python/cudf/cudf/pylibcudf_tests/common/utils.py index 596cd2c92ae..0befb3bb3e8 100644 --- a/python/cudf/cudf/pylibcudf_tests/common/utils.py +++ b/python/cudf/cudf/pylibcudf_tests/common/utils.py @@ -1,6 +1,6 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from typing import Optional +from typing import Optional, Union import pyarrow as pa import pytest @@ -24,27 +24,43 @@ def metadata_from_arrow_array( return metadata -def assert_column_eq(plc_column: plc.Column, pa_array: pa.Array) -> None: - """Verify that the pylibcudf array and PyArrow array are equal.""" +def assert_column_eq( + lhs: Union[pa.Array, plc.Column], rhs: Union[pa.Array, plc.Column] +) -> None: + """Verify that a pylibcudf array and PyArrow array are equal.""" # Nested types require children metadata to be passed to the conversion function. - plc_pa = plc.interop.to_arrow( - plc_column, metadata=metadata_from_arrow_array(pa_array) - ) + if isinstance(lhs, (pa.Array, pa.ChunkedArray)) and isinstance( + rhs, plc.Column + ): + rhs = plc.interop.to_arrow( + rhs, metadata=metadata_from_arrow_array(lhs) + ) + elif isinstance(lhs, plc.Column) and isinstance( + rhs, (pa.Array, pa.ChunkedArray) + ): + lhs = plc.interop.to_arrow( + lhs, metadata=metadata_from_arrow_array(rhs) + ) + else: + raise ValueError( + "One of the inputs must be a Column and the other an Array" + ) + + if isinstance(lhs, pa.ChunkedArray): + lhs = lhs.combine_chunks() + if isinstance(rhs, pa.ChunkedArray): + rhs = rhs.combine_chunks() - if isinstance(plc_pa, pa.ChunkedArray): - plc_pa = plc_pa.combine_chunks() - if isinstance(pa_array, pa.ChunkedArray): - pa_array = pa_array.combine_chunks() - assert plc_pa.equals(pa_array) + assert lhs.equals(rhs) def assert_table_eq(plc_table: plc.Table, pa_table: pa.Table) -> None: - """Verify that the pylibcudf array and PyArrow array are equal.""" + """Verify that a pylibcudf table and PyArrow table are equal.""" plc_shape = (plc_table.num_rows(), plc_table.num_columns()) assert plc_shape == pa_table.shape for plc_col, pa_col in zip(plc_table.columns(), pa_table.columns): - assert_column_eq(plc_col, pa_col) + assert_column_eq(pa_col, plc_col) def cudf_raises(expected_exception: BaseException, *args, **kwargs): diff --git a/python/cudf/cudf/pylibcudf_tests/test_column_from_device.py b/python/cudf/cudf/pylibcudf_tests/test_column_from_device.py index 764720d9de1..c4ff7bb43a5 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_column_from_device.py +++ b/python/cudf/cudf/pylibcudf_tests/test_column_from_device.py @@ -48,4 +48,4 @@ def test_from_cuda_array_interface(valid_column): ) expect = valid_column - assert_column_eq(col, expect) + assert_column_eq(expect, col) diff --git a/python/cudf/cudf/pylibcudf_tests/test_copying.py b/python/cudf/cudf/pylibcudf_tests/test_copying.py index 0bf30f98636..ef70869a145 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_copying.py +++ b/python/cudf/cudf/pylibcudf_tests/test_copying.py @@ -409,7 
+409,7 @@ def test_copy_range_in_place( ), pa_target_column, ) - assert_column_eq(mutable_target_column, expected) + assert_column_eq(expected, mutable_target_column) def test_copy_range_in_place_out_of_bounds( @@ -480,7 +480,7 @@ def test_copy_range( ), pa_target_column, ) - assert_column_eq(result, expected) + assert_column_eq(expected, result) else: with pytest.raises(TypeError): plc.copying.copy_range( @@ -528,7 +528,7 @@ def test_shift( expected = pa.concat_arrays( [pa.array([pa_source_scalar] * shift), pa_target_column[:-shift]] ) - assert_column_eq(result, expected) + assert_column_eq(expected, result) else: with pytest.raises(TypeError): plc.copying.shift(target_column, shift, source_scalar) @@ -550,7 +550,7 @@ def test_slice_column(target_column, pa_target_column): lower_bounds = bounds[::2] result = plc.copying.slice(target_column, bounds) for lb, ub, slice_ in zip(lower_bounds, upper_bounds, result): - assert_column_eq(slice_, pa_target_column[lb:ub]) + assert_column_eq(pa_target_column[lb:ub], slice_) def test_slice_column_wrong_length(target_column): @@ -582,7 +582,7 @@ def test_split_column(target_column, pa_target_column): lower_bounds = [0] + upper_bounds[:-1] result = plc.copying.split(target_column, upper_bounds) for lb, ub, split in zip(lower_bounds, upper_bounds, result): - assert_column_eq(split, pa_target_column[lb:ub]) + assert_column_eq(pa_target_column[lb:ub], split) def test_split_column_decreasing(target_column): @@ -622,7 +622,7 @@ def test_copy_if_else_column_column( pa_target_column, pa_other_column, ) - assert_column_eq(result, expected) + assert_column_eq(expected, result) def test_copy_if_else_wrong_type(target_column, mask): @@ -699,7 +699,7 @@ def test_copy_if_else_column_scalar( pa_mask, *pa_args, ) - assert_column_eq(result, expected) + assert_column_eq(expected, result) def test_boolean_mask_scatter_from_table( diff --git a/python/cudf/cudf/pylibcudf_tests/test_string_case.py b/python/cudf/cudf/pylibcudf_tests/test_string_case.py index ae01d953df5..1039859b2cf 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_string_case.py +++ b/python/cudf/cudf/pylibcudf_tests/test_string_case.py @@ -18,18 +18,18 @@ def test_to_upper(string_col): plc_col = plc.interop.from_arrow(string_col) got = plc.strings.case.to_upper(plc_col) expected = pa.compute.utf8_upper(string_col) - assert_column_eq(got, expected) + assert_column_eq(expected, got) def test_to_lower(string_col): plc_col = plc.interop.from_arrow(string_col) got = plc.strings.case.to_lower(plc_col) expected = pa.compute.utf8_lower(string_col) - assert_column_eq(got, expected) + assert_column_eq(expected, got) def test_swapcase(string_col): plc_col = plc.interop.from_arrow(string_col) got = plc.strings.case.swapcase(plc_col) expected = pa.compute.utf8_swapcase(string_col) - assert_column_eq(got, expected) + assert_column_eq(expected, got) diff --git a/python/cudf/cudf/pylibcudf_tests/test_string_find.py b/python/cudf/cudf/pylibcudf_tests/test_string_find.py index f44c4af9bfc..44900044184 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_string_find.py +++ b/python/cudf/cudf/pylibcudf_tests/test_string_find.py @@ -134,7 +134,7 @@ def test_find(pa_data_col, plc_data_col, pa_target_scalar, plc_target_scalar): type=pa.int32(), ) - assert_column_eq(got, expected) + assert_column_eq(expected, got) def colwise_apply(pa_data_col, pa_target_col, operator): @@ -174,7 +174,7 @@ def test_find_column(pa_data_col, pa_target_col, plc_data_col, plc_target_col): ) got = plc.strings.find.find(plc_data_col, plc_target_col, 0) 
-    assert_column_eq(got, expected)
+    assert_column_eq(expected, got)
 
 
 def test_rfind(pa_data_col, plc_data_col, pa_target_scalar, plc_target_scalar):
@@ -192,7 +192,7 @@ def test_rfind(pa_data_col, plc_data_col, pa_target_scalar, plc_target_scalar):
         type=pa.int32(),
     )
 
-    assert_column_eq(got, expected)
+    assert_column_eq(expected, got)
 
 
 def test_contains(
@@ -211,7 +211,7 @@ def test_contains(
         type=pa.bool_(),
     )
 
-    assert_column_eq(got, expected)
+    assert_column_eq(expected, got)
 
 
 def test_contains_column(
@@ -221,7 +221,7 @@ def test_contains_column(
         pa_data_col, pa_target_col, lambda st, target: target in st
     )
     got = plc.strings.find.contains(plc_data_col, plc_target_col)
-    assert_column_eq(got, expected)
+    assert_column_eq(expected, got)
 
 
 def test_starts_with(
@@ -230,7 +230,7 @@ def test_starts_with(
     py_target = pa_target_scalar.as_py()
     got = plc.strings.find.starts_with(plc_data_col, plc_target_scalar)
     expected = pa.compute.starts_with(pa_data_col, py_target)
-    assert_column_eq(got, expected)
+    assert_column_eq(expected, got)
 
 
 def test_starts_with_column(
@@ -240,7 +240,7 @@ def test_starts_with_column(
         pa_data_col, pa_target_col, lambda st, target: st.startswith(target)
     )
     got = plc.strings.find.starts_with(plc_data_col, plc_target_col)
-    assert_column_eq(got, expected)
+    assert_column_eq(expected, got)
 
 
 def test_ends_with(
@@ -249,7 +249,7 @@ def test_ends_with(
     py_target = pa_target_scalar.as_py()
     got = plc.strings.find.ends_with(plc_data_col, plc_target_scalar)
     expected = pa.compute.ends_with(pa_data_col, py_target)
-    assert_column_eq(got, expected)
+    assert_column_eq(expected, got)
 
 
 def test_ends_with_column(
@@ -259,4 +259,4 @@ def test_ends_with_column(
         pa_data_col, pa_target_col, lambda st, target: st.endswith(target)
     )
     got = plc.strings.find.ends_with(plc_data_col, plc_target_col)
-    assert_column_eq(got, expected)
+    assert_column_eq(expected, got)

From 579a167542ce664bb9d28ae6b5419e524ec5288b Mon Sep 17 00:00:00 2001
From: Sebastian Berg
Date: Thu, 30 May 2024 18:37:56 +0200
Subject: [PATCH 023/340] Simple NumPy 2 fixes that are clearly no behavior change (#15876)

I have a branch that works, but some changes may need a bit of thought to
get right, so splitting out the simpler half. (N.B. the only bigger chunk
remaining is to make sure that `uint_series > -1` keeps working at least
as well as before.)

In either case, these are changes that:
* Avoid `copy=False` in `np.array()`
* Are necessary because NumPy now rejects e.g. `uint8(-1)` (only changed
  this where it is test-only)
* Are necessary because NumPy now preserves the scalar dtype, so things
  fail later (the hashing code, and using `float(float32)` to avoid
  overflow).
* The sorting change is the same: using `int8(-1)` effectively gives the
  old promotion (to float) rather than erroring, so we don't implicitly
  promote to float based on the value.

The main noise is that I parametrized that one test since it seemed easy
enough.
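For reference, a rough sketch of the NumPy 2 behavior changes these fixes work around (not part of the patch itself):

```python
import numpy as np

# NumPy 2 turns copy=False into "never copy": np.array(data, copy=False)
# now raises if a copy would be required. np.asanyarray keeps the old
# "copy only when needed" behavior, so it is the drop-in replacement.
ary = np.asanyarray(memoryview(b"abc"))

# NumPy 2 rejects out-of-bounds Python integers for fixed-width scalars:
# np.uint8(-1) now raises OverflowError instead of wrapping to 255.
neg = np.int8(-1)  # an explicit signed scalar sidesteps that in the tests
```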
Authors:
  - Sebastian Berg (https://github.com/seberg)

Approvers:
  - Lawrence Mitchell (https://github.com/wence-)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/15876
---
 python/cudf/cudf/core/buffer/buffer.py        |  4 +-
 .../cudf/cudf/core/buffer/spillable_buffer.py |  4 +-
 python/cudf/cudf/tests/test_hash_vocab.py     |  8 ++-
 python/cudf/cudf/tests/test_numerical.py      |  2 +-
 python/cudf/cudf/tests/test_replace.py        | 51 +++++--------------
 python/cudf/cudf/tests/test_sorting.py        |  3 +-
 python/cudf/cudf/utils/hash_vocab_utils.py    | 25 ++++-----
 7 files changed, 37 insertions(+), 60 deletions(-)

diff --git a/python/cudf/cudf/core/buffer/buffer.py b/python/cudf/cudf/core/buffer/buffer.py
index 5c2d77033b8..bf6f9f1a3c1 100644
--- a/python/cudf/cudf/core/buffer/buffer.py
+++ b/python/cudf/cudf/core/buffer/buffer.py
@@ -191,7 +191,7 @@ def from_host_memory(cls, data: Any) -> Self:
         """Create an owner from a buffer or array like object

         Data must implement `__array_interface__`, the buffer protocol, and/or
-        be convertible to a buffer object using `numpy.array()`
+        be convertible to a buffer object using `numpy.asanyarray()`

         The host memory is copied to a new device allocation.

@@ -209,7 +209,7 @@ def from_host_memory(cls, data: Any) -> Self:
         """
         # Convert to numpy array, this will not copy data in most cases.
-        ary = numpy.array(data, copy=False, subok=True)
+        ary = numpy.asanyarray(data)
         # Extract pointer and size
         ptr, size = get_ptr_and_size(ary.__array_interface__)
         # Copy to device memory
diff --git a/python/cudf/cudf/core/buffer/spillable_buffer.py b/python/cudf/cudf/core/buffer/spillable_buffer.py
index a1af3ba8c9d..49258fea9ab 100644
--- a/python/cudf/cudf/core/buffer/spillable_buffer.py
+++ b/python/cudf/cudf/core/buffer/spillable_buffer.py
@@ -146,7 +146,7 @@ def from_host_memory(cls, data: Any) -> Self:
         """Create a spillable buffer from host memory.

         Data must implement `__array_interface__`, the buffer protocol, and/or
-        be convertible to a buffer object using `numpy.array()`
+        be convertible to a buffer object using `numpy.asanyarray()`

         The new buffer is marked as spilled to host memory already.

@@ -165,7 +165,7 @@ def from_host_memory(cls, data: Any) -> Self:

         # Convert to a memoryview using numpy array, this will not copy data
         # in most cases.
-        data = memoryview(numpy.array(data, copy=False, subok=True))
+        data = memoryview(numpy.asanyarray(data))
         if not data.c_contiguous:
             raise ValueError("Buffer data must be C-contiguous")
         data = data.cast("B")  # Make sure itemsize==1
diff --git a/python/cudf/cudf/tests/test_hash_vocab.py b/python/cudf/cudf/tests/test_hash_vocab.py
index e081119ff89..c98b92f7083 100644
--- a/python/cudf/cudf/tests/test_hash_vocab.py
+++ b/python/cudf/cudf/tests/test_hash_vocab.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2022, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
import filecmp import os import warnings @@ -21,9 +21,7 @@ def test_correct_bert_base_vocab_hash(datadir, tmpdir): groundtruth_path = os.path.join(datadir, "vocab-hash.txt") output_path = tmpdir.join("cudf-vocab-hash.txt") - with warnings.catch_warnings(): - # See https://github.com/rapidsai/cudf/issues/12403 - warnings.simplefilter(action="ignore", category=RuntimeWarning) - hash_vocab(vocab_path, output_path) + warnings.simplefilter(action="ignore", category=RuntimeWarning) + hash_vocab(vocab_path, output_path) assert filecmp.cmp(output_path, groundtruth_path, shallow=False) diff --git a/python/cudf/cudf/tests/test_numerical.py b/python/cudf/cudf/tests/test_numerical.py index 2e3be92dbeb..03081208739 100644 --- a/python/cudf/cudf/tests/test_numerical.py +++ b/python/cudf/cudf/tests/test_numerical.py @@ -44,7 +44,7 @@ def test_can_cast_safely_same_kind(): assert data.can_cast_safely(to_dtype) data = cudf.Series( - [np.finfo("float32").max * 2, 1.0], dtype="float64" + [float(np.finfo("float32").max) * 2, 1.0], dtype="float64" )._column to_dtype = np.dtype("float32") assert not data.can_cast_safely(to_dtype) diff --git a/python/cudf/cudf/tests/test_replace.py b/python/cudf/cudf/tests/test_replace.py index 8992c4d617b..d77ec596271 100644 --- a/python/cudf/cudf/tests/test_replace.py +++ b/python/cudf/cudf/tests/test_replace.py @@ -1,5 +1,6 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. +import operator import re from decimal import Decimal @@ -825,43 +826,23 @@ def test_series_fillna_invalid_dtype(data_dtype): @pytest.mark.parametrize("data_dtype", NUMERIC_TYPES) @pytest.mark.parametrize("fill_value", [100, 100.0, 128.5]) -def test_series_where(data_dtype, fill_value): +@pytest.mark.parametrize("op", [operator.gt, operator.eq, operator.lt]) +def test_series_where(data_dtype, fill_value, op): psr = pd.Series(list(range(10)), dtype=data_dtype) sr = cudf.from_pandas(psr) - if sr.dtype.type(fill_value) != fill_value: - with pytest.raises(TypeError): - sr.where(sr > 0, fill_value) - else: - # Cast back to original dtype as pandas automatically upcasts - expect = psr.where(psr > 0, fill_value) - got = sr.where(sr > 0, fill_value) - # pandas returns 'float16' dtype, which is not supported in cudf - assert_eq( - expect, - got, - check_dtype=expect.dtype.kind not in ("f"), - ) + try: + scalar_fits = sr.dtype.type(fill_value) == fill_value + except OverflowError: + scalar_fits = False - if sr.dtype.type(fill_value) != fill_value: + if not scalar_fits: with pytest.raises(TypeError): - sr.where(sr < 0, fill_value) + sr.where(op(sr, 0), fill_value) else: - expect = psr.where(psr < 0, fill_value) - got = sr.where(sr < 0, fill_value) - # pandas returns 'float16' dtype, which is not supported in cudf - assert_eq( - expect, - got, - check_dtype=expect.dtype.kind not in ("f"), - ) - - if sr.dtype.type(fill_value) != fill_value: - with pytest.raises(TypeError): - sr.where(sr == 0, fill_value) - else: - expect = psr.where(psr == 0, fill_value) - got = sr.where(sr == 0, fill_value) + # Cast back to original dtype as pandas automatically upcasts + expect = psr.where(op(psr, 0), fill_value) + got = sr.where(op(sr, 0), fill_value) # pandas returns 'float16' dtype, which is not supported in cudf assert_eq( expect, @@ -985,12 +966,8 @@ def test_numeric_series_replace_dtype(series_dtype, replacement): psr = pd.Series([0, 1, 2, 3, 4, 5], dtype=series_dtype) sr = cudf.from_pandas(psr) - if sr.dtype.kind in "ui": - can_replace = np.array([replacement])[0].is_integer() and np.can_cast( - int(replacement), sr.dtype - ) - 
else: - can_replace = np.can_cast(replacement, sr.dtype) + numpy_replacement = np.array(replacement).astype(sr.dtype)[()] + can_replace = numpy_replacement == replacement # Both Scalar if not can_replace: diff --git a/python/cudf/cudf/tests/test_sorting.py b/python/cudf/cudf/tests/test_sorting.py index 618c4f30bd9..449f21721f4 100644 --- a/python/cudf/cudf/tests/test_sorting.py +++ b/python/cudf/cudf/tests/test_sorting.py @@ -107,7 +107,8 @@ def test_series_argsort(nelem, dtype, asc): if asc: expected = np.argsort(sr.to_numpy(), kind="mergesort") else: - expected = np.argsort(sr.to_numpy() * -1, kind="mergesort") + # -1 multiply works around missing desc sort (may promote to float64) + expected = np.argsort(sr.to_numpy() * np.int8(-1), kind="mergesort") np.testing.assert_array_equal(expected, res.to_numpy()) diff --git a/python/cudf/cudf/utils/hash_vocab_utils.py b/python/cudf/cudf/utils/hash_vocab_utils.py index ef078ed8c5d..babe4be2715 100644 --- a/python/cudf/cudf/utils/hash_vocab_utils.py +++ b/python/cudf/cudf/utils/hash_vocab_utils.py @@ -7,8 +7,8 @@ # Coefficients ranges for inner hash - This are important to set to be # large so that we have randomness in the bottom bits when modding -A_SECOND_LEVEL_POW = np.uint8(48) -B_SECOND_LEVEL_POW = np.uint8(7) +A_SECOND_LEVEL_POW = np.uint64(48) +B_SECOND_LEVEL_POW = np.uint64(7) A_LBOUND_SECOND_LEVEL_HASH = 2**16 A_HBOUND_SECOND_LEVEL_HASH = 2**A_SECOND_LEVEL_POW @@ -23,11 +23,11 @@ # Shifts for bit packing -A_SECOND_LEVEL_SHIFT_AMT = np.uint8(64 - A_SECOND_LEVEL_POW) -B_SECOND_LEVEL_SHIFT_AMT = np.uint8( +A_SECOND_LEVEL_SHIFT_AMT = np.uint64(64 - A_SECOND_LEVEL_POW) +B_SECOND_LEVEL_SHIFT_AMT = np.uint64( 64 - A_SECOND_LEVEL_POW - B_SECOND_LEVEL_POW ) -BITS_FOR_INNER_TABLE_SIZE = np.uint8(8) +BITS_FOR_INNER_TABLE_SIZE = np.uint64(8) NOT_FOUND = -1 @@ -94,7 +94,8 @@ def _find_hash_for_internal(hash_bin): while True: a = np.random.randint( - A_LBOUND_SECOND_LEVEL_HASH, A_HBOUND_SECOND_LEVEL_HASH + A_LBOUND_SECOND_LEVEL_HASH, + A_HBOUND_SECOND_LEVEL_HASH, ) b = np.random.randint( B_LBOUND_SECOND_LEVEL_HASH, B_HBOUND_SECOND_LEVEL_HASH @@ -130,13 +131,13 @@ def _perfect_hash(integers, max_constant): bin_length = len(internal_table) max_bin_length = max(bin_length, max_bin_length) internal_table_coeffs[i] = ( - coeff_a << A_SECOND_LEVEL_SHIFT_AMT - | coeff_b << B_SECOND_LEVEL_SHIFT_AMT - | bin_length - ) - offset_into_flattened_table[i + 1] = ( - offset_into_flattened_table[i] + bin_length + np.uint64(coeff_a) << A_SECOND_LEVEL_SHIFT_AMT + | np.uint64(coeff_b) << B_SECOND_LEVEL_SHIFT_AMT + | np.uint64(bin_length) ) + offset_into_flattened_table[i + 1] = offset_into_flattened_table[ + i + ] + np.uint64(bin_length) flattened_bins.extend(internal_table) print( From bab0d808bbe6f333b69e7b71a38febdc0e28b773 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 30 May 2024 10:34:07 -0700 Subject: [PATCH 024/340] Fix categorical conversion from chunked arrow arrays (#15886) The current logic for converting arrow dictionary arrays to cudf doesn't properly uniquify categories across chunks of chunked arrays. This PR implements the simplest fix by having arrow combine chunks when this case is encountered. 
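Roughly, the failure case and the fix (a sketch mirroring the regression test added in this PR):

```python
import pyarrow as pa

indices = pa.array([0, 1, 0])
dictionary = pa.array(["foo", "bar", "baz"])
dict_array = pa.DictionaryArray.from_arrays(indices, dictionary)

# A chunked dictionary column where each chunk carries its own dictionary.
table = pa.table({"a": pa.chunked_array([dict_array, dict_array])})

# combine_chunks() unifies the per-chunk dictionaries into a single one,
# which is what DataFrame.from_arrow now relies on before conversion.
table = table.combine_chunks()
```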
Resolves #6828

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/15886
---
 python/cudf/cudf/core/frame.py           |  7 +++++++
 python/cudf/cudf/tests/test_dataframe.py | 12 ++++++++++++
 2 files changed, 19 insertions(+)

diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index 7b561906afb..d60c206ac24 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -897,6 +897,13 @@ def from_arrow(cls, data: pa.Table) -> Self:
         # so handling indices and dictionary as two different columns.
         # This needs to be removed once we have hooked libcudf dictionary32
         # with categorical.
+        if any(
+            isinstance(x.type, pa.DictionaryType)
+            and isinstance(x, pa.ChunkedArray)
+            for x in data
+        ):
+            data = data.combine_chunks()
+
         dict_indices = {}
         dict_dictionaries = {}
         dict_ordered = {}
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index 8b18e53d320..d76d5eb8065 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -1984,6 +1984,18 @@ def test_from_arrow(nelem, data_type):
     np.testing.assert_array_equal(s.to_pandas(), gs.to_numpy())
 
 
+def test_from_arrow_chunked_categories():
+    # Verify that categories are properly deduplicated across chunked arrays.
+    indices = pa.array([0, 1, 0, 1, 2, 0, None, 2])
+    dictionary = pa.array(["foo", "bar", "baz"])
+    dict_array = pa.DictionaryArray.from_arrays(indices, dictionary)
+    chunked_array = pa.chunked_array([dict_array, dict_array])
+    table = pa.table({"a": chunked_array})
+    df = cudf.DataFrame.from_arrow(table)
+    final_dictionary = df["a"].dtype.categories.to_arrow().to_pylist()
+    assert sorted(final_dictionary) == sorted(dictionary.to_pylist())
+
+
 @pytest.mark.parametrize("nelem", [0, 2, 3, 100, 1000])
 @pytest.mark.parametrize("data_type", dtypes)
 def test_to_arrow(nelem, data_type):

From 789cbfdd69648fd7ec553922e64accb763ca3c57 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Thu, 30 May 2024 15:02:37 -0400
Subject: [PATCH 025/340] Use offsetalator in nvtext::tokenize_with_vocabulary (#15878)

Updates the `token_counts_fn` kernel in `nvtext::tokenize_with_vocabulary`
to use the offsetalator instead of a hardcoded `size_type` for accessing
strings offsets.
Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Karthikeyan (https://github.com/karthikeyann)

URL: https://github.com/rapidsai/cudf/pull/15878
---
 cpp/src/text/vocabulary_tokenize.cu | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/cpp/src/text/vocabulary_tokenize.cu b/cpp/src/text/vocabulary_tokenize.cu
index 8913ce22da8..f012f7ce09a 100644
--- a/cpp/src/text/vocabulary_tokenize.cu
+++ b/cpp/src/text/vocabulary_tokenize.cu
@@ -240,10 +240,10 @@ CUDF_KERNEL void token_counts_fn(cudf::column_device_view const d_strings,
     return;
   }
 
-  auto const offsets =
-    d_strings.child(cudf::strings_column_view::offsets_column_index).data();
-  auto const offset = offsets[str_idx + d_strings.offset()] - offsets[d_strings.offset()];
-  auto const chars_begin = d_strings.data() + offsets[d_strings.offset()];
+  auto const offsets = d_strings.child(cudf::strings_column_view::offsets_column_index);
+  auto const offsets_itr = cudf::detail::input_offsetalator(offsets.head(), offsets.type());
+  auto const offset = offsets_itr[str_idx + d_strings.offset()] - offsets_itr[d_strings.offset()];
+  auto const chars_begin = d_strings.data() + offsets_itr[d_strings.offset()];
 
   auto const begin = d_str.data();
   auto const end   = begin + d_str.size_bytes();

From 476db9fbb4a9969ea7406b916cead38990097fb9 Mon Sep 17 00:00:00 2001
From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com>
Date: Thu, 30 May 2024 23:42:51 -0500
Subject: [PATCH 026/340] Fix JSON parsing memory corruption - Fix Mixed types nested children removal (#15798)

Fixes https://github.com/rapidsai/cudf/issues/15750

The references to deleted child columns were not removed, which caused a
segfault and also memory errors (found with valgrind). This fix removes
the references to child columns and deletes them recursively.

Authors:
  - Karthikeyan (https://github.com/karthikeyann)

Approvers:
  - David Wendt (https://github.com/davidwendt)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/15798
---
 cpp/src/io/json/json_column.cu | 17 ++++++++++++++--
 cpp/tests/io/json_test.cpp     | 36 ++++++++++++++++++++++++++++++++++
 2 files changed, 51 insertions(+), 2 deletions(-)

diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu
index 631f8adbd6d..3e587768b11 100644
--- a/cpp/src/io/json/json_column.cu
+++ b/cpp/src/io/json/json_column.cu
@@ -594,8 +594,7 @@ void make_device_json_column(device_span input,
     col.validity =
       cudf::detail::create_null_mask(col.num_rows, cudf::mask_state::ALL_NULL, stream, mr);
     col.type = json_col_t::StringColumn;
-    col.child_columns.clear();  // their references should be deleted too.
-    col.column_order.clear();
+    // destroy references of all child columns after this step, by calling remove_child_columns
   };
 
   path_from_tree tree_path{column_categories,
@@ -628,6 +627,19 @@ void make_device_json_column(device_span input,
   std::vector is_pruned(num_columns, 0);
   columns.try_emplace(parent_node_sentinel, std::ref(root));
 
+  std::function remove_child_columns =
+    [&](NodeIndexT this_col_id, device_json_column& col) {
+      for (auto col_name : col.column_order) {
+        auto child_id                  = mapped_columns[{this_col_id, col_name}];
+        is_mixed_type_column[child_id] = 1;
+        remove_child_columns(child_id, col.child_columns.at(col_name));
+        mapped_columns.erase({this_col_id, col_name});
+        columns.erase(child_id);
+      }
+      col.child_columns.clear();  // their references are deleted above.
+ col.column_order.clear(); + }; + auto name_and_parent_index = [&is_array_of_arrays, &row_array_parent_col_id, &column_parent_ids, @@ -721,6 +733,7 @@ void make_device_json_column(device_span input, auto& col = columns.at(old_col_id).get(); if (col.type == json_col_t::ListColumn or col.type == json_col_t::StructColumn) { reinitialize_as_string(old_col_id, col); + remove_child_columns(old_col_id, col); // all its children (which are already inserted) are ignored later. } col.forced_as_string_column = true; diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index 9d766e80094..5d790e73246 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ -2679,4 +2679,40 @@ TEST_F(JsonReaderTest, JsonNestedDtypeFilter) } } +TEST_F(JsonReaderTest, JSONMixedTypeChildren) +{ + std::string const json_str = R"( +{ "Root": { "Key": [ { "EE": "A" } ] } } +{ "Root": { "Key": { } } } +{ "Root": { "Key": [{ "YY": 1}] } } +)"; + // Column "EE" is created and destroyed + // Column "YY" should not be created + + cudf::io::json_reader_options options = + cudf::io::json_reader_options::builder(cudf::io::source_info{json_str.c_str(), json_str.size()}) + .lines(true) + .recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL) + .normalize_single_quotes(true) + .normalize_whitespace(false) + .mixed_types_as_string(true) + .keep_quotes(true); + + auto result = cudf::io::read_json(options); + + ASSERT_EQ(result.tbl->num_columns(), 1); + ASSERT_EQ(result.metadata.schema_info.size(), 1); + EXPECT_EQ(result.metadata.schema_info[0].name, "Root"); + ASSERT_EQ(result.metadata.schema_info[0].children.size(), 1); + EXPECT_EQ(result.metadata.schema_info[0].children[0].name, "Key"); + ASSERT_EQ(result.metadata.schema_info[0].children[0].children.size(), 2); + EXPECT_EQ(result.metadata.schema_info[0].children[0].children[0].name, "offsets"); + // types + EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::STRUCT); + EXPECT_EQ(result.tbl->get_column(0).child(0).type().id(), cudf::type_id::STRING); + cudf::test::strings_column_wrapper expected({R"([ { "EE": "A" } ])", "{ }", R"([{ "YY": 1}])"}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result.tbl->get_column(0).child(0)); +} + CUDF_TEST_PROGRAM_MAIN() From dec0354b1ac2af981d4e8f13aceb45365838a1d8 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Fri, 31 May 2024 08:38:57 -0400 Subject: [PATCH 027/340] Fix multi-replace target count logic for large strings (#15807) Replaces `thrust::count_if` with raw kernel counter to handle large strings (int64 offsets) and > 2GB strings columns. 
Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Bradley Dice (https://github.com/bdice) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/15807 --- cpp/src/strings/replace/multi.cu | 49 ++++++++++++++++++++++++-------- 1 file changed, 37 insertions(+), 12 deletions(-) diff --git a/cpp/src/strings/replace/multi.cu b/cpp/src/strings/replace/multi.cu index 9025234aa52..f4110707c79 100644 --- a/cpp/src/strings/replace/multi.cu +++ b/cpp/src/strings/replace/multi.cu @@ -30,23 +30,17 @@ #include #include #include -#include #include #include #include #include -#include #include -#include #include #include #include #include -#include -#include -#include #include namespace cudf { @@ -262,6 +256,38 @@ struct replace_multi_parallel_fn { device_span d_replacements; }; +constexpr int64_t block_size = 512; // number of threads per block +constexpr size_type bytes_per_thread = 4; // bytes processed per thread + +/** + * @brief Count the number of targets in a strings column + * + * @param fn Functor containing has_target() function + * @param chars_bytes Number of bytes in the strings column + * @param d_output Result of the count + */ +CUDF_KERNEL void count_targets(replace_multi_parallel_fn fn, int64_t chars_bytes, int64_t* d_output) +{ + auto const idx = cudf::detail::grid_1d::global_thread_id(); + auto const byte_idx = static_cast(idx) * bytes_per_thread; + auto const lane_idx = static_cast(threadIdx.x); + + using block_reduce = cub::BlockReduce; + __shared__ typename block_reduce::TempStorage temp_storage; + + int64_t count = 0; + // each thread processes multiple bytes + for (auto i = byte_idx; (i < (byte_idx + bytes_per_thread)) && (i < chars_bytes); ++i) { + count += fn.has_target(i, chars_bytes); + } + auto const total = block_reduce(temp_storage).Reduce(count, cub::Sum()); + + if ((lane_idx == 0) && (total > 0)) { + cuda::atomic_ref ref{*d_output}; + ref.fetch_add(total, cuda::std::memory_order_relaxed); + } +} + /** * @brief Used by the copy-if function to produce target_pair objects * @@ -308,12 +334,11 @@ std::unique_ptr replace_character_parallel(strings_column_view const& in // Count the number of targets in the entire column. // Note this may over-count in the case where a target spans adjacent strings. - auto target_count = thrust::count_if( - rmm::exec_policy_nosync(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(chars_bytes), - [fn, chars_bytes] __device__(int64_t idx) { return fn.has_target(idx, chars_bytes); }); - + rmm::device_scalar d_count(0, stream); + auto const num_blocks = util::div_rounding_up_safe( + util::div_rounding_up_safe(chars_bytes, static_cast(bytes_per_thread)), block_size); + count_targets<<>>(fn, chars_bytes, d_count.data()); + auto target_count = d_count.value(stream); // Create a vector of every target position in the chars column. // These may also include overlapping targets which will be resolved later. auto targets_positions = rmm::device_uvector(target_count, stream); From e7be142b2bfd4f08c18d0020a959e162f01d819e Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Fri, 31 May 2024 08:14:55 -0700 Subject: [PATCH 028/340] Migrate round to pylibcudf (#15863) xref #15162 Migrate round.pxd to use pylibcudf APIs. 
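A short sketch of the resulting Python API, mirroring the new tests added in this PR:

```python
import pyarrow as pa

import cudf._lib.pylibcudf as plc

col = plc.interop.from_arrow(pa.array([2.5, 1.6, -1.7]))

# Round to one decimal place using half-to-even (banker's) rounding.
got = plc.round.round(col, 1, plc.round.RoundingMethod.HALF_EVEN)
print(plc.interop.to_arrow(got))
```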
Authors: - Thomas Li (https://github.com/lithomas1) Approvers: - https://github.com/brandon-b-miller - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/15863 --- .../user_guide/api_docs/pylibcudf/index.rst | 1 + .../user_guide/api_docs/pylibcudf/round.rst | 6 +++ .../cudf/cudf/_lib/pylibcudf/CMakeLists.txt | 1 + python/cudf/cudf/_lib/pylibcudf/__init__.pxd | 2 + python/cudf/cudf/_lib/pylibcudf/__init__.py | 2 + .../_lib/pylibcudf/libcudf/CMakeLists.txt | 2 +- .../cudf/_lib/pylibcudf/libcudf/round.pxd | 6 +-- .../cudf/_lib/pylibcudf/libcudf/round.pyx | 0 python/cudf/cudf/_lib/pylibcudf/round.pxd | 13 +++++ python/cudf/cudf/_lib/pylibcudf/round.pyx | 54 +++++++++++++++++++ python/cudf/cudf/_lib/round.pyx | 36 +++++-------- .../cudf/cudf/pylibcudf_tests/test_round.py | 38 +++++++++++++ 12 files changed, 134 insertions(+), 27 deletions(-) create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/round.rst create mode 100644 python/cudf/cudf/_lib/pylibcudf/libcudf/round.pyx create mode 100644 python/cudf/cudf/_lib/pylibcudf/round.pxd create mode 100644 python/cudf/cudf/_lib/pylibcudf/round.pyx create mode 100644 python/cudf/cudf/pylibcudf_tests/test_round.py diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst index 1c1b37e2c37..26875ce7d12 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst @@ -22,6 +22,7 @@ This page provides API documentation for pylibcudf. reduce reshape rolling + round scalar search stream_compaction diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/round.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/round.rst new file mode 100644 index 00000000000..c97fda12301 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/round.rst @@ -0,0 +1,6 @@ +===== +round +===== + +.. automodule:: cudf._lib.pylibcudf.round + :members: diff --git a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt index 7d01671e84f..eff14ad549b 100644 --- a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt +++ b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt @@ -29,6 +29,7 @@ set(cython_sources replace.pyx reshape.pyx rolling.pyx + round.pyx scalar.pyx search.pyx stream_compaction.pyx diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd index 91c3fdf5602..4f77f8cbaef 100644 --- a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd @@ -15,6 +15,7 @@ from . 
cimport ( replace, reshape, rolling, + round, search, sorting, stream_compaction, @@ -48,6 +49,7 @@ __all__ = [ "reduce", "replace", "rolling", + "round", "search", "stream_compaction", "strings", diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.py b/python/cudf/cudf/_lib/pylibcudf/__init__.py index fcdc4992f00..048b62b6013 100644 --- a/python/cudf/cudf/_lib/pylibcudf/__init__.py +++ b/python/cudf/cudf/_lib/pylibcudf/__init__.py @@ -15,6 +15,7 @@ replace, reshape, rolling, + round, search, sorting, stream_compaction, @@ -48,6 +49,7 @@ "reduce", "replace", "rolling", + "round", "search", "stream_compaction", "strings", diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/libcudf/CMakeLists.txt index 8a6ce6a5187..ac56d42dda8 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/CMakeLists.txt +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/CMakeLists.txt @@ -12,7 +12,7 @@ # the License. # ============================================================================= -set(cython_sources aggregation.pyx binaryop.pyx copying.pyx replace.pyx reduce.pxd +set(cython_sources aggregation.pyx binaryop.pyx copying.pyx replace.pyx reduce.pxd round.pyx stream_compaction.pyx types.pyx unary.pyx ) diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/round.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/round.pxd index 06ff42485ea..027c4634c9f 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/round.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/round.pxd @@ -9,9 +9,9 @@ from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view cdef extern from "cudf/round.hpp" namespace "cudf" nogil: - ctypedef enum rounding_method "cudf::rounding_method": - HALF_UP "cudf::rounding_method::HALF_UP" - HALF_EVEN "cudf::rounding_method::HALF_EVEN" + cpdef enum class rounding_method(int32_t): + HALF_UP + HALF_EVEN cdef unique_ptr[column] round ( const column_view& input, diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/round.pyx b/python/cudf/cudf/_lib/pylibcudf/libcudf/round.pyx new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/cudf/cudf/_lib/pylibcudf/round.pxd b/python/cudf/cudf/_lib/pylibcudf/round.pxd new file mode 100644 index 00000000000..ccb64fc2847 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/round.pxd @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from libc.stdint cimport int32_t + +from cudf._lib.pylibcudf.libcudf.round cimport rounding_method + +from .column cimport Column + + +cpdef Column round( + Column source, + int32_t decimal_places = *, + rounding_method round_method = * +) diff --git a/python/cudf/cudf/_lib/pylibcudf/round.pyx b/python/cudf/cudf/_lib/pylibcudf/round.pyx new file mode 100644 index 00000000000..cfcc2aafbb8 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/round.pyx @@ -0,0 +1,54 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from libc.stdint cimport int32_t +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move + +from cudf._lib.pylibcudf.libcudf.round cimport ( + round as cpp_round, + rounding_method, +) + +from cudf._lib.pylibcudf.libcudf.round import \ + rounding_method as RoundingMethod # no-cython-lint + +from cudf._lib.pylibcudf.libcudf.column.column cimport column + +from .column cimport Column + + +cpdef Column round( + Column source, + int32_t decimal_places = 0, + rounding_method round_method = rounding_method.HALF_UP +): + """Rounds all the values in a column to the specified number of decimal places. + + For details, see :cpp:func:`round`. 
+ + Parameters + ---------- + source : Column + The Column for which to round values. + decimal_places: int32_t, optional + The number of decimal places to round to (default 0) + round_method: rounding_method, optional + The method by which to round each value. + Can be one of { RoundingMethod.HALF_UP, RoundingMethod.HALF_EVEN } + (default rounding_method.HALF_UP) + + Returns + ------- + pylibcudf.Column + A Column with values rounded + """ + cdef unique_ptr[column] c_result + with nogil: + c_result = move( + cpp_round( + source.view(), + decimal_places, + round_method + ) + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/cudf/cudf/_lib/round.pyx b/python/cudf/cudf/_lib/round.pyx index c1c36dd8854..f8ad57947c8 100644 --- a/python/cudf/cudf/_lib/round.pyx +++ b/python/cudf/cudf/_lib/round.pyx @@ -2,16 +2,10 @@ from cudf.core.buffer import acquire_spill_lock -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - from cudf._lib.column cimport Column -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view -from cudf._lib.pylibcudf.libcudf.round cimport ( - round as cpp_round, - rounding_method as cpp_rounding_method, -) + +import cudf._lib.pylibcudf as plc +from cudf._lib.pylibcudf.round import RoundingMethod @acquire_spill_lock() @@ -31,19 +25,15 @@ def round(Column input_col, int decimal_places=0, how="half_even"): if how not in {"half_even", "half_up"}: raise ValueError("'how' must be either 'half_even' or 'half_up'") - cdef column_view input_col_view = input_col.view() - cdef unique_ptr[column] c_result - cdef cpp_rounding_method c_how = ( - cpp_rounding_method.HALF_EVEN if how == "half_even" - else cpp_rounding_method.HALF_UP + how = ( + RoundingMethod.HALF_EVEN if how == "half_even" + else RoundingMethod.HALF_UP ) - with nogil: - c_result = move( - cpp_round( - input_col_view, - decimal_places, - c_how - ) - ) - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf( + plc.round.round( + input_col.to_pylibcudf(mode="read"), + decimal_places, + how + ) + ) diff --git a/python/cudf/cudf/pylibcudf_tests/test_round.py b/python/cudf/cudf/pylibcudf_tests/test_round.py new file mode 100644 index 00000000000..a234860477f --- /dev/null +++ b/python/cudf/cudf/pylibcudf_tests/test_round.py @@ -0,0 +1,38 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+
+import pyarrow as pa
+import pytest
+from utils import assert_column_eq
+
+import cudf._lib.pylibcudf as plc
+
+
+@pytest.fixture(params=[False, True])
+def nullable(request):
+    return request.param
+
+
+@pytest.fixture(params=["float32", "float64"])
+def column(request, nullable):
+    values = [2.5, 2.49, 1.6, 8, -1.5, -1.7, -0.5, 0.5]
+    typ = {"float32": pa.float32(), "float64": pa.float64()}[request.param]
+    if nullable:
+        values[2] = None
+    return plc.interop.from_arrow(pa.array(values, type=typ))
+
+
+@pytest.mark.parametrize(
+    "round_mode", ["half_towards_infinity", "half_to_even"]
+)
+@pytest.mark.parametrize("decimals", [0, 1, 2, 5])
+def test_round(column, round_mode, decimals):
+    method = {
+        "half_towards_infinity": plc.round.RoundingMethod.HALF_UP,
+        "half_to_even": plc.round.RoundingMethod.HALF_EVEN,
+    }[round_mode]
+    got = plc.round.round(column, decimals, method)
+    expect = pa.compute.round(
+        plc.interop.to_arrow(column), decimals, round_mode
+    )
+
+    assert_column_eq(expect, got)

From 7949a9cf6911066663e2245a4bb624e0f1847b06 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Fri, 31 May 2024 14:54:18 -0400
Subject: [PATCH 029/340] Use offsetalator in orc rowgroup_char_counts_kernel (#15891)

Replaces the hardcoded `size_type` for accessing strings offsets data with
the offsetalator to compute the number of characters in a group in
`cudf::io::orc::gpu::rowgroup_char_counts_kernel`.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Muhammad Haseeb (https://github.com/mhaseeb123)
  - Yunsong Wang (https://github.com/PointKernel)

URL: https://github.com/rapidsai/cudf/pull/15891
---
 cpp/src/io/orc/dict_enc.cu | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/cpp/src/io/orc/dict_enc.cu b/cpp/src/io/orc/dict_enc.cu
index 5971482f80c..5181c4a1c0e 100644
--- a/cpp/src/io/orc/dict_enc.cu
+++ b/cpp/src/io/orc/dict_enc.cu
@@ -16,6 +16,7 @@
 
 #include "orc_gpu.hpp"
 
+#include
 #include
 #include
 #include
@@ -43,11 +44,12 @@ CUDF_KERNEL void rowgroup_char_counts_kernel(device_2dspan char_count
   auto const start_row = rowgroup_bounds[row_group_idx][col_idx].begin + str_col.offset();
   auto const num_rows  = rowgroup_bounds[row_group_idx][col_idx].size();
 
-  auto const& offsets = str_col.child(strings_column_view::offsets_column_index);
+  auto const& offsets    = str_col.child(strings_column_view::offsets_column_index);
+  auto const offsets_itr = cudf::detail::input_offsetalator(offsets.head(), offsets.type());
   char_counts[str_col_idx][row_group_idx] =
     (num_rows == 0)
       ? 0
-      : offsets.element(start_row + num_rows) - offsets.element(start_row);
+      : static_cast(offsets_itr[start_row + num_rows] - offsets_itr[start_row]);
 }
 
 void rowgroup_char_counts(device_2dspan counts,

From 1354abdb7a4f9eb58bfc6e359c49d0baabacb4e1 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Fri, 31 May 2024 16:03:09 -0400
Subject: [PATCH 030/340] Fix url-decode benchmark to use offsetalator (#15871)

Fixes the logic for generating URLs in the url-decode benchmark to use the
offsetalator instead of hardcoding `size_type`. This will allow benchmarking
with large strings columns in the future.
Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Vukasin Milovanovic (https://github.com/vuule)
  - Yunsong Wang (https://github.com/PointKernel)

URL: https://github.com/rapidsai/cudf/pull/15871
---
 cpp/benchmarks/string/url_decode.cu | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/cpp/benchmarks/string/url_decode.cu b/cpp/benchmarks/string/url_decode.cu
index b3aeb69e5ea..7720e585023 100644
--- a/cpp/benchmarks/string/url_decode.cu
+++ b/cpp/benchmarks/string/url_decode.cu
@@ -20,6 +20,7 @@
 #include
 #include
 
+#include
 #include
 #include
 #include
@@ -43,7 +44,7 @@ struct url_string_generator {
   {
   }
 
-  __device__ void operator()(thrust::tuple str_begin_end)
+  __device__ void operator()(thrust::tuple str_begin_end)
   {
     auto begin = thrust::get<0>(str_begin_end);
     auto end   = thrust::get<1>(str_begin_end);
@@ -69,11 +70,11 @@ auto generate_column(cudf::size_type num_rows, cudf::size_type chars_per_row, do
   auto result_col = std::move(table_a->release()[0]);  // string column with num_rows aaa...
   auto chars_data = static_cast(result_col->mutable_view().head());
   auto offset_col = result_col->child(cudf::strings_column_view::offsets_column_index).view();
+  auto offset_itr = cudf::detail::offsetalator_factory::make_input_iterator(offset_col);
 
   auto engine = thrust::default_random_engine{};
   thrust::for_each_n(thrust::device,
-                     thrust::make_zip_iterator(offset_col.begin(),
-                                               offset_col.begin() + 1),
+                     thrust::make_zip_iterator(offset_itr, offset_itr + 1),
                      num_rows,
                      url_string_generator{chars_data, esc_seq_chance, engine});
   return result_col;

From e66f4f50d045da87125430d13e6b862dc845845c Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic
Date: Mon, 3 Jun 2024 10:14:58 -0700
Subject: [PATCH 031/340] Add an option to run cuIO benchmarks with pinned
 buffers as input (#15830)

Adds `io_type::PINNED_BUFFER`, which allows cuIO benchmarks to use a pinned
buffer as an input. The output is still a `std::vector` in this case, same
as with `io_type::HOST_BUFFER`.
Also stops the use of `cudf::io::io_type` in benchmarks, to allow
benchmark-specific IO types, such as this one.

Authors:
  - Vukasin Milovanovic (https://github.com/vuule)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - David Wendt (https://github.com/davidwendt)

URL: https://github.com/rapidsai/cudf/pull/15830
---
 cpp/benchmarks/io/csv/csv_reader_input.cpp    | 16 +++++--------
 cpp/benchmarks/io/csv/csv_writer.cpp          |  8 +++----
 cpp/benchmarks/io/cuio_common.cpp             | 23 ++++++++++++-------
 cpp/benchmarks/io/cuio_common.hpp             | 14 ++++++++---
 cpp/benchmarks/io/json/json_reader_input.cpp  | 14 +++++------
 cpp/benchmarks/io/json/json_writer.cpp        |  9 ++++----
 cpp/benchmarks/io/nvbench_helpers.hpp         | 11 +++++----
 cpp/benchmarks/io/orc/orc_reader_input.cpp    | 16 ++++++-------
 cpp/benchmarks/io/orc/orc_writer.cpp          |  8 +++----
 .../io/parquet/parquet_reader_multithread.cpp | 18 +++++++++++----
 cpp/benchmarks/io/parquet/parquet_writer.cpp  |  8 +++----
 11 files changed, 77 insertions(+), 68 deletions(-)

diff --git a/cpp/benchmarks/io/csv/csv_reader_input.cpp b/cpp/benchmarks/io/csv/csv_reader_input.cpp
index 2ad3bc36f59..a93bc05ac58 100644
--- a/cpp/benchmarks/io/csv/csv_reader_input.cpp
+++ b/cpp/benchmarks/io/csv/csv_reader_input.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -28,9 +28,7 @@ constexpr size_t data_size = 256 << 20; constexpr cudf::size_type num_cols = 64; template -void csv_read_common(DataType const& data_types, - cudf::io::io_type const& source_type, - nvbench::state& state) +void csv_read_common(DataType const& data_types, io_type const& source_type, nvbench::state& state) { auto const tbl = create_random_table(cycle_dtypes(data_types, num_cols), table_size_bytes{data_size}); @@ -66,7 +64,7 @@ void csv_read_common(DataType const& data_types, state.add_buffer_size(source_sink.size(), "encoded_file_size", "encoded_file_size"); } -template +template void BM_csv_read_input(nvbench::state& state, nvbench::type_list, nvbench::enum_type>) { @@ -76,7 +74,7 @@ void BM_csv_read_input(nvbench::state& state, csv_read_common(d_type, source_type, state); } -template +template void BM_csv_read_io(nvbench::state& state, nvbench::type_list>) { auto const d_type = get_type_or_group({static_cast(data_type::INTEGRAL), @@ -97,12 +95,10 @@ using d_type_list = nvbench::enum_type_list; -using io_list = - nvbench::enum_type_list; +using io_list = nvbench::enum_type_list; NVBENCH_BENCH_TYPES(BM_csv_read_input, - NVBENCH_TYPE_AXES(d_type_list, - nvbench::enum_type_list)) + NVBENCH_TYPE_AXES(d_type_list, nvbench::enum_type_list)) .set_name("csv_read_data_type") .set_type_axes_names({"data_type", "io"}) .set_min_samples(4); diff --git a/cpp/benchmarks/io/csv/csv_writer.cpp b/cpp/benchmarks/io/csv/csv_writer.cpp index 8ff07be1531..7ba43850cf2 100644 --- a/cpp/benchmarks/io/csv/csv_writer.cpp +++ b/cpp/benchmarks/io/csv/csv_writer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -28,7 +28,7 @@ constexpr size_t data_size = 256 << 20; constexpr cudf::size_type num_cols = 64; -template +template void BM_csv_write_dtype_io(nvbench::state& state, nvbench::type_list, nvbench::enum_type>) { @@ -112,9 +112,7 @@ using d_type_list = nvbench::enum_type_list; -using io_list = nvbench::enum_type_list; +using io_list = nvbench::enum_type_list; NVBENCH_BENCH_TYPES(BM_csv_write_dtype_io, NVBENCH_TYPE_AXES(d_type_list, io_list)) .set_name("csv_write_dtype_io") diff --git a/cpp/benchmarks/io/cuio_common.cpp b/cpp/benchmarks/io/cuio_common.cpp index 3a61e5f1e7b..37ced8ea703 100644 --- a/cpp/benchmarks/io/cuio_common.cpp +++ b/cpp/benchmarks/io/cuio_common.cpp @@ -52,6 +52,11 @@ cudf::io::source_info cuio_source_sink_pair::make_source_info() switch (type) { case io_type::FILEPATH: return cudf::io::source_info(file_name); case io_type::HOST_BUFFER: return cudf::io::source_info(h_buffer.data(), h_buffer.size()); + case io_type::PINNED_BUFFER: { + pinned_buffer.resize(h_buffer.size()); + std::copy(h_buffer.begin(), h_buffer.end(), pinned_buffer.begin()); + return cudf::io::source_info(pinned_buffer.data(), pinned_buffer.size()); + } case io_type::DEVICE_BUFFER: { // TODO: make cuio_source_sink_pair stream-friendly and avoid implicit use of the default // stream @@ -71,7 +76,8 @@ cudf::io::sink_info cuio_source_sink_pair::make_sink_info() switch (type) { case io_type::VOID: return cudf::io::sink_info(void_sink.get()); case io_type::FILEPATH: return cudf::io::sink_info(file_name); - case io_type::HOST_BUFFER: [[fallthrough]]; + case io_type::HOST_BUFFER: + case io_type::PINNED_BUFFER: case io_type::DEVICE_BUFFER: return cudf::io::sink_info(&h_buffer); default: CUDF_FAIL("invalid output type"); } @@ -84,7 +90,8 @@ size_t cuio_source_sink_pair::size() case io_type::FILEPATH: return static_cast( std::ifstream(file_name, std::ifstream::ate | std::ifstream::binary).tellg()); - case io_type::HOST_BUFFER: [[fallthrough]]; + case io_type::HOST_BUFFER: + case io_type::PINNED_BUFFER: case io_type::DEVICE_BUFFER: return h_buffer.size(); default: CUDF_FAIL("invalid output type"); } @@ -204,13 +211,13 @@ void try_drop_l3_cache() "Failed to execute the drop cache command"); } -cudf::io::io_type retrieve_io_type_enum(std::string_view io_string) +io_type retrieve_io_type_enum(std::string_view io_string) { - if (io_string == "FILEPATH") { return cudf::io::io_type::FILEPATH; } - if (io_string == "HOST_BUFFER") { return cudf::io::io_type::HOST_BUFFER; } - if (io_string == "DEVICE_BUFFER") { return cudf::io::io_type::DEVICE_BUFFER; } - if (io_string == "VOID") { return cudf::io::io_type::VOID; } - if (io_string == "USER_IMPLEMENTED") { return cudf::io::io_type::USER_IMPLEMENTED; } + if (io_string == "FILEPATH") { return io_type::FILEPATH; } + if (io_string == "HOST_BUFFER") { return io_type::HOST_BUFFER; } + if (io_string == "PINNED_BUFFER") { return io_type::PINNED_BUFFER; } + if (io_string == "DEVICE_BUFFER") { return io_type::DEVICE_BUFFER; } + if (io_string == "VOID") { return io_type::VOID; } CUDF_FAIL("Unsupported io_type."); } diff --git a/cpp/benchmarks/io/cuio_common.hpp b/cpp/benchmarks/io/cuio_common.hpp index 6e0b32219ce..d4f39a5f243 100644 --- a/cpp/benchmarks/io/cuio_common.hpp +++ b/cpp/benchmarks/io/cuio_common.hpp @@ -18,13 +18,20 @@ #include +#include #include #include -#include #include -using cudf::io::io_type; +// IO types supported in the benchmarks +enum class io_type { + FILEPATH, // Input/output are both files + HOST_BUFFER, // Input/output are both host buffers (pageable) + 
PINNED_BUFFER, // Input is a pinned host buffer, output is a host buffer (pageable) + DEVICE_BUFFER, // Input is a device buffer, output is a host buffer (pageable) + VOID +}; std::string random_file_in_dir(std::string const& dir_path); @@ -72,6 +79,7 @@ class cuio_source_sink_pair { io_type const type; std::vector h_buffer; + cudf::detail::pinned_host_vector pinned_buffer; rmm::device_uvector d_buffer; std::string const file_name; std::unique_ptr void_sink; @@ -144,7 +152,7 @@ void try_drop_l3_cache(); * * @return The io_type enum value */ -cudf::io::io_type retrieve_io_type_enum(std::string_view io_string); +io_type retrieve_io_type_enum(std::string_view io_string); /** * @brief Convert a string to the corresponding compression_type enum value. diff --git a/cpp/benchmarks/io/json/json_reader_input.cpp b/cpp/benchmarks/io/json/json_reader_input.cpp index aa73dacdbc5..4366790f208 100644 --- a/cpp/benchmarks/io/json/json_reader_input.cpp +++ b/cpp/benchmarks/io/json/json_reader_input.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -70,7 +70,7 @@ cudf::size_type json_write_bm_data(cudf::io::sink_info sink, return view.num_rows(); } -template +template void BM_json_read_io(nvbench::state& state, nvbench::type_list>) { cuio_source_sink_pair source_sink(IO); @@ -87,7 +87,7 @@ void BM_json_read_io(nvbench::state& state, nvbench::type_list +template void BM_json_read_data_type( nvbench::state& state, nvbench::type_list, nvbench::enum_type>) { @@ -107,16 +107,14 @@ using d_type_list = nvbench::enum_type_list; -using io_list = nvbench::enum_type_list; +using io_list = + nvbench::enum_type_list; using compression_list = nvbench::enum_type_list; NVBENCH_BENCH_TYPES(BM_json_read_data_type, - NVBENCH_TYPE_AXES(d_type_list, - nvbench::enum_type_list)) + NVBENCH_TYPE_AXES(d_type_list, nvbench::enum_type_list)) .set_name("json_read_data_type") .set_type_axes_names({"data_type", "io"}) .set_min_samples(4); diff --git a/cpp/benchmarks/io/json/json_writer.cpp b/cpp/benchmarks/io/json/json_writer.cpp index ae6bb81ff93..444457bbf0d 100644 --- a/cpp/benchmarks/io/json/json_writer.cpp +++ b/cpp/benchmarks/io/json/json_writer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -52,7 +52,7 @@ void json_write_common(cudf::io::json_writer_options const& write_opts, state.add_buffer_size(source_sink.size(), "encoded_file_size", "encoded_file_size"); } -template +template void BM_json_write_io(nvbench::state& state, nvbench::type_list>) { auto const d_type = get_type_or_group({static_cast(data_type::INTEGRAL), @@ -114,9 +114,8 @@ void BM_json_writer_options(nvbench::state& state) json_write_common(write_opts, source_sink, data_size, state); } -using io_list = nvbench::enum_type_list; +using io_list = + nvbench::enum_type_list; NVBENCH_BENCH_TYPES(BM_json_write_io, NVBENCH_TYPE_AXES(io_list)) .set_name("json_write_io") diff --git a/cpp/benchmarks/io/nvbench_helpers.hpp b/cpp/benchmarks/io/nvbench_helpers.hpp index 8b79912c7ee..1e3ab2b7b4f 100644 --- a/cpp/benchmarks/io/nvbench_helpers.hpp +++ b/cpp/benchmarks/io/nvbench_helpers.hpp @@ -56,13 +56,14 @@ NVBENCH_DECLARE_ENUM_TYPE_STRINGS( [](auto) { return std::string{}; }) NVBENCH_DECLARE_ENUM_TYPE_STRINGS( - cudf::io::io_type, + io_type, [](auto value) { switch (value) { - case cudf::io::io_type::FILEPATH: return "FILEPATH"; - case cudf::io::io_type::HOST_BUFFER: return "HOST_BUFFER"; - case cudf::io::io_type::DEVICE_BUFFER: return "DEVICE_BUFFER"; - case cudf::io::io_type::VOID: return "VOID"; + case io_type::FILEPATH: return "FILEPATH"; + case io_type::HOST_BUFFER: return "HOST_BUFFER"; + case io_type::PINNED_BUFFER: return "PINNED_BUFFER"; + case io_type::DEVICE_BUFFER: return "DEVICE_BUFFER"; + case io_type::VOID: return "VOID"; default: return "Unknown"; } }, diff --git a/cpp/benchmarks/io/orc/orc_reader_input.cpp b/cpp/benchmarks/io/orc/orc_reader_input.cpp index b7c214a8374..cafd3cc5c39 100644 --- a/cpp/benchmarks/io/orc/orc_reader_input.cpp +++ b/cpp/benchmarks/io/orc/orc_reader_input.cpp @@ -87,7 +87,7 @@ void orc_read_common(cudf::size_type num_rows_to_read, } // namespace -template +template void BM_orc_read_data(nvbench::state& state, nvbench::type_list, nvbench::enum_type>) { @@ -112,7 +112,7 @@ void BM_orc_read_data(nvbench::state& state, orc_read_common(num_rows_written, source_sink, state); } -template +template void orc_read_io_compression(nvbench::state& state) { auto const d_type = get_type_or_group({static_cast(data_type::INTEGRAL_SIGNED), @@ -150,7 +150,7 @@ void orc_read_io_compression(nvbench::state& state) orc_read_common(num_rows_written, source_sink, state); } -template +template void BM_orc_read_io_compression( nvbench::state& state, nvbench::type_list, nvbench::enum_type>) @@ -163,7 +163,7 @@ void BM_orc_chunked_read_io_compression(nvbench::state& state, nvbench::type_list>) { // Only run benchmark using HOST_BUFFER IO. - return orc_read_io_compression(state); + return orc_read_io_compression(state); } using d_type_list = nvbench::enum_type_list; -using io_list = nvbench::enum_type_list; +using io_list = + nvbench::enum_type_list; using compression_list = nvbench::enum_type_list; NVBENCH_BENCH_TYPES(BM_orc_read_data, - NVBENCH_TYPE_AXES(d_type_list, - nvbench::enum_type_list)) + NVBENCH_TYPE_AXES(d_type_list, nvbench::enum_type_list)) .set_name("orc_read_decode") .set_type_axes_names({"data_type", "io"}) .set_min_samples(4) diff --git a/cpp/benchmarks/io/orc/orc_writer.cpp b/cpp/benchmarks/io/orc/orc_writer.cpp index bb373297222..b795f3e3164 100644 --- a/cpp/benchmarks/io/orc/orc_writer.cpp +++ b/cpp/benchmarks/io/orc/orc_writer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -82,7 +82,7 @@ void BM_orc_write_encode(nvbench::state& state, nvbench::type_list +template void BM_orc_write_io_compression( nvbench::state& state, nvbench::type_list, nvbench::enum_type>) @@ -183,9 +183,7 @@ using d_type_list = nvbench::enum_type_list; -using io_list = nvbench::enum_type_list; +using io_list = nvbench::enum_type_list; using compression_list = nvbench::enum_type_list; diff --git a/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp b/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp index bd80c4e0e88..a67d1932951 100644 --- a/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp +++ b/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp @@ -62,7 +62,7 @@ std::tuple, size_t, size_t> write_file_data( size_t total_file_size = 0; for (size_t i = 0; i < num_files; ++i) { - cuio_source_sink_pair source_sink{cudf::io::io_type::HOST_BUFFER}; + cuio_source_sink_pair source_sink{io_type::HOST_BUFFER}; auto const tbl = create_random_table( cycle_dtypes(d_types, num_cols), @@ -96,6 +96,11 @@ void BM_parquet_multithreaded_read_common(nvbench::state& state, cudf::detail::thread_pool threads(num_threads); auto [source_sink_vector, total_file_size, num_files] = write_file_data(state, d_types); + std::vector source_info_vector; + std::transform(source_sink_vector.begin(), + source_sink_vector.end(), + std::back_inserter(source_info_vector), + [](auto& source_sink) { return source_sink.make_source_info(); }); auto mem_stats_logger = cudf::memory_stats_logger(); @@ -104,9 +109,8 @@ void BM_parquet_multithreaded_read_common(nvbench::state& state, [&](nvbench::launch& launch, auto& timer) { auto read_func = [&](int index) { auto const stream = streams[index % num_threads]; - auto& source_sink = source_sink_vector[index]; cudf::io::parquet_reader_options read_opts = - cudf::io::parquet_reader_options::builder(source_sink.make_source_info()); + cudf::io::parquet_reader_options::builder(source_info_vector[index]); cudf::io::read_parquet(read_opts, stream, rmm::mr::get_current_device_resource()); }; @@ -174,6 +178,11 @@ void BM_parquet_multithreaded_read_chunked_common(nvbench::state& state, auto streams = cudf::detail::fork_streams(cudf::get_default_stream(), num_threads); cudf::detail::thread_pool threads(num_threads); auto [source_sink_vector, total_file_size, num_files] = write_file_data(state, d_types); + std::vector source_info_vector; + std::transform(source_sink_vector.begin(), + source_sink_vector.end(), + std::back_inserter(source_info_vector), + [](auto& source_sink) { return source_sink.make_source_info(); }); auto mem_stats_logger = cudf::memory_stats_logger(); @@ -183,9 +192,8 @@ void BM_parquet_multithreaded_read_chunked_common(nvbench::state& state, [&](nvbench::launch& launch, auto& timer) { auto read_func = [&](int index) { auto const stream = streams[index % num_threads]; - auto& source_sink = source_sink_vector[index]; cudf::io::parquet_reader_options read_opts = - cudf::io::parquet_reader_options::builder(source_sink.make_source_info()); + cudf::io::parquet_reader_options::builder(source_info_vector[index]); // divide chunk limits by number of threads so the number of chunks produced is the // same for all cases. this seems better than the alternative, which is to keep the // limits the same. 
if we do that, as the number of threads goes up, the number of diff --git a/cpp/benchmarks/io/parquet/parquet_writer.cpp b/cpp/benchmarks/io/parquet/parquet_writer.cpp index 13b396ea267..46d2927a92b 100644 --- a/cpp/benchmarks/io/parquet/parquet_writer.cpp +++ b/cpp/benchmarks/io/parquet/parquet_writer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -82,7 +82,7 @@ void BM_parq_write_encode(nvbench::state& state, nvbench::type_list +template void BM_parq_write_io_compression( nvbench::state& state, nvbench::type_list, nvbench::enum_type>) @@ -188,9 +188,7 @@ using d_type_list = nvbench::enum_type_list; -using io_list = nvbench::enum_type_list; +using io_list = nvbench::enum_type_list; using compression_list = nvbench::enum_type_list; From ba1299dfc03e87f11cf021a67d01531ed6afd7f7 Mon Sep 17 00:00:00 2001 From: Brian Tepera Date: Mon, 3 Jun 2024 13:45:09 -0400 Subject: [PATCH 032/340] Implement day_name and month_name to match pandas (#15479) This PR implements the `month_name` and `day_name` datetime methods, matching the equivalent [month_name](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.dt.month_name.html) and [day_name](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.dt.day_name.html) methods from pandas. Currently this is implemented just for English locale, though it could be expanded to include additional languages in the future. Closes #12407 Authors: - Brian Tepera (https://github.com/btepera) - GALI PREM SAGAR (https://github.com/galipremsagar) - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/15479 --- python/cudf/cudf/core/column/datetime.py | 29 ++++++++++ python/cudf/cudf/core/index.py | 39 +++++++++++++ python/cudf/cudf/core/series.py | 72 ++++++++++++++++++++++++ python/cudf/cudf/tests/test_datetime.py | 39 +++++++++++++ 4 files changed, 179 insertions(+) diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index d92a3a00641..27f31c8f500 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -2,6 +2,7 @@ from __future__ import annotations +import calendar import datetime import functools import locale @@ -339,6 +340,34 @@ def element_indexing(self, index: int): def get_dt_field(self, field: str) -> ColumnBase: return libcudf.datetime.extract_datetime_component(self, field) + def _get_field_names( + self, + field: Literal["month", "weekday"], + labels: list[str], + locale: str | None = None, + ) -> ColumnBase: + if locale is not None: + raise NotImplementedError( + "Setting a locale is currently not supported. " + "Results will be returned in your current locale." 
+ ) + col_labels = as_column(labels) + indices = self.get_dt_field(field) + has_nulls = indices.has_nulls() + if has_nulls: + indices = indices.fillna(len(col_labels)) + return col_labels.take(indices, nullify=True, check_bounds=has_nulls) + + def get_day_names(self, locale: str | None = None) -> ColumnBase: + return self._get_field_names( + "weekday", list(calendar.day_name), locale=locale + ) + + def get_month_names(self, locale: str | None = None) -> ColumnBase: + return self._get_field_names( + "month", list(calendar.month_name), locale=locale + ) + def ceil(self, freq: str) -> ColumnBase: return libcudf.datetime.ceil_datetime(self, freq) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 49bfb150f60..2a75b374a1e 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -2120,6 +2120,45 @@ def quarter(self): res = extract_quarter(self._values) return Index(res, dtype="int8") + @_cudf_nvtx_annotate + def day_name(self, locale: str | None = None) -> Index: + """ + Return the day names. Currently supports English locale only. + + Examples + -------- + >>> import cudf + >>> datetime_index = cudf.date_range("2016-12-31", "2017-01-08", freq="D") + >>> datetime_index + DatetimeIndex(['2016-12-31', '2017-01-01', '2017-01-02', '2017-01-03', + '2017-01-04', '2017-01-05', '2017-01-06', '2017-01-07'], + dtype='datetime64[ns]', freq='D') + >>> datetime_index.day_name() + Index(['Saturday', 'Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', + 'Friday', 'Saturday'], dtype='object') + """ + day_names = self._column.get_day_names(locale) + return Index._from_data({self.name: day_names}) + + @_cudf_nvtx_annotate + def month_name(self, locale: str | None = None) -> Index: + """ + Return the month names. Currently supports English locale only. + + Examples + -------- + >>> import cudf + >>> datetime_index = cudf.date_range("2017-12-30", periods=6, freq='W') + >>> datetime_index + DatetimeIndex(['2017-12-30', '2018-01-06', '2018-01-13', '2018-01-20', + '2018-01-27', '2018-02-03'], + dtype='datetime64[ns]', freq='7D') + >>> datetime_index.month_name() + Index(['December', 'January', 'January', 'January', 'January', 'February'], dtype='object') + """ + month_names = self._column.get_month_names(locale) + return Index._from_data({self.name: month_names}) + @_cudf_nvtx_annotate def isocalendar(self): """ diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 908347e389b..a5b204ef346 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -4201,6 +4201,78 @@ def quarter(self): name=self.series.name, ) + @_cudf_nvtx_annotate + def day_name(self, locale=None): + """ + Return the day names. Currently supports English locale only. + + Examples + -------- + >>> import cudf + >>> datetime_series = cudf.Series(cudf.date_range('2016-12-31', + ... '2017-01-08', freq='D')) + >>> datetime_series + 0 2016-12-31 + 1 2017-01-01 + 2 2017-01-02 + 3 2017-01-03 + 4 2017-01-04 + 5 2017-01-05 + 6 2017-01-06 + 7 2017-01-07 + 8 2017-01-08 + dtype: datetime64[ns] + >>> datetime_series.dt.day_name() + 0 Saturday + 1 Sunday + 2 Monday + 3 Tuesday + 4 Wednesday + 5 Thursday + 6 Friday + 7 Saturday + dtype: object + """ + day_names = self.series._column.get_day_names(locale) + return Series._from_data( + ColumnAccessor({None: day_names}), + index=self.series.index, + name=self.series.name, + ) + + @_cudf_nvtx_annotate + def month_name(self, locale: str | None = None) -> Series: + """ + Return the month names. 
Currently supports English locale only. + + Examples + -------- + >>> import cudf + >>> datetime_series = cudf.Series(cudf.date_range("2017-12-30", periods=6, freq='W')) + >>> datetime_series + 0 2017-12-30 + 1 2018-01-06 + 2 2018-01-13 + 3 2018-01-20 + 4 2018-01-27 + 5 2018-02-03 + dtype: datetime64[ns] + >>> datetime_series.dt.month_name() + 0 December + 1 January + 2 January + 3 January + 4 January + 5 February + dtype: object + """ + month_names = self.series._column.get_month_names(locale) + return Series._from_data( + ColumnAccessor({None: month_names}), + index=self.series.index, + name=self.series.name, + ) + @_cudf_nvtx_annotate def isocalendar(self): """ diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index 46a0dcd315d..4186fff038a 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -2419,3 +2419,42 @@ def test_date_range_tz(): result = pd.date_range("2020-01-01", "2020-01-02", periods=2, tz="UTC") expected = cudf.date_range("2020-01-01", "2020-01-02", periods=2, tz="UTC") assert_eq(result, expected) + + +@pytest.mark.parametrize("meth", ["day_name", "month_name"]) +@pytest.mark.parametrize("klass", [pd.Series, pd.DatetimeIndex]) +def test_day_month_name(meth, klass): + data = [ + "2020-05-31 08:00:00", + None, + "1999-12-31 18:40:00", + "2000-12-31 04:00:00", + None, + "1900-02-28 07:00:00", + "1800-03-14 07:30:00", + "2100-03-14 07:30:00", + "1970-01-01 00:00:00", + "1969-12-31 12:59:00", + ] + + p_obj = klass(data, dtype="datetime64[s]") + g_obj = cudf.from_pandas(p_obj) + + if klass is pd.Series: + p_obj = p_obj.dt + g_obj = g_obj.dt + + expect = getattr(p_obj, meth)() + got = getattr(g_obj, meth)() + + assert_eq(expect, got) + + +@pytest.mark.parametrize("meth", ["day_name", "month_name"]) +@pytest.mark.parametrize("klass", [cudf.Series, cudf.DatetimeIndex]) +def test_day_month_name_locale_not_implemented(meth, klass): + obj = klass(cudf.date_range("2020-01-01", periods=7)) + if klass is cudf.Series: + obj = obj.dt + with pytest.raises(NotImplementedError): + getattr(obj, meth)(locale="pt_BR.utf8") From 7d5561a8c0aeb8531913d7767faca55a5ab31fa5 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Mon, 3 Jun 2024 15:29:39 -0400 Subject: [PATCH 033/340] Fix debug assert in rowgroup_char_counts_kernel (#15902) Fixes assert triggered by `OrcWriterTest.EmptyChildStringColumn` in a Debug build. ``` $ gtests/ORC_TEST --gtest_filter=OrcWriterTest.EmptyChildStringColumn Note: Google Test filter = OrcWriterTest.EmptyChildStringColumn [==========] Running 1 test from 1 test suite. [----------] Global test environment set-up. [----------] 1 test from OrcWriterTest [ RUN ] OrcWriterTest.EmptyChildStringColumn /cudf/cpp/include/cudf/detail/offsets_iterator.cuh:79: cudf::detail::input_offsetalator::input_offsetalator(const void *, cudf::data_type, int): block: [0,0,0], thread: [0,0,0] Assertion `(dtype.id() == type_id::INT32 || dtype.id() == type_id::INT64) && "Unexpected offsets type"` failed. CUDA Error detected. cudaErrorAssert device-side assert triggered ORC_TEST: /conda/envs/rapids/include/rmm/mr/device/detail/stream_ordered_memory_resource.hpp:248: void rmm::mr::detail::stream_ordered_memory_resource::do_deallocate(void*, std::size_t, rmm::cuda_stream_view) [with PoolResource = rmm::mr::pool_memory_resource; FreeListType = rmm::mr::detail::coalescing_free_list; std::size_t = long unsigned int]: Assertion `status__ == cudaSuccess' failed. 
Aborted (core dumped) ``` Error introduced in #15891 where offsetalator wraps an offsets column in the `cudf::io::orc::gpu::rowgroup_char_counts_kernel`. But when `num_rows==0` the offsets column is `EMPTY` causing the assert to trigger. Checking the `num_rows` before accessing the offsets column fixes the issue. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Bradley Dice (https://github.com/bdice) - MithunR (https://github.com/mythrocks) URL: https://github.com/rapidsai/cudf/pull/15902 --- cpp/src/io/orc/dict_enc.cu | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/cpp/src/io/orc/dict_enc.cu b/cpp/src/io/orc/dict_enc.cu index 5181c4a1c0e..5be75350951 100644 --- a/cpp/src/io/orc/dict_enc.cu +++ b/cpp/src/io/orc/dict_enc.cu @@ -44,12 +44,13 @@ CUDF_KERNEL void rowgroup_char_counts_kernel(device_2dspan char_count auto const start_row = rowgroup_bounds[row_group_idx][col_idx].begin + str_col.offset(); auto const num_rows = rowgroup_bounds[row_group_idx][col_idx].size(); - auto const& offsets = str_col.child(strings_column_view::offsets_column_index); - auto const offsets_itr = cudf::detail::input_offsetalator(offsets.head(), offsets.type()); - char_counts[str_col_idx][row_group_idx] = - (num_rows == 0) - ? 0 - : static_cast(offsets_itr[start_row + num_rows] - offsets_itr[start_row]); + size_type char_count = 0; + if (num_rows > 0) { + auto const& offsets = str_col.child(strings_column_view::offsets_column_index); + auto const offsets_itr = cudf::detail::input_offsetalator(offsets.head(), offsets.type()); + char_count = static_cast(offsets_itr[start_row + num_rows] - offsets_itr[start_row]); + } + char_counts[str_col_idx][row_group_idx] = char_count; } void rowgroup_char_counts(device_2dspan counts, From 4a17c451719a5d1e144b21703650bd323990e892 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Mon, 3 Jun 2024 15:32:12 -0400 Subject: [PATCH 034/340] Rename strings multiple target replace API (#15898) Renames the multi-target overload of `cudf::strings::replace()` to `cudf::strings::replace_multiple()`. This helps with some Cython issues involving fused types and overloaded functions with the same number of arguments. Reference: https://github.com/rapidsai/cudf/issues/15855#issuecomment-2129980298 This change deprecates the old name to be removed in a future release. Also added some additional error unit tests. 
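As a quick illustration for reviewers, a minimal sketch of a call site after the
rename (the input data here is illustrative only):

```cpp
#include <cudf/strings/replace.hpp>
#include <cudf_test/column_wrapper.hpp>

// Replace every occurrence of "fox" or "dog" with the paired replacement.
auto const input   = cudf::test::strings_column_wrapper({"the fox", "the dog"});
auto const targets = cudf::test::strings_column_wrapper({"fox", "dog"});
auto const repls   = cudf::test::strings_column_wrapper({"cat", "cat"});

// Previously spelled cudf::strings::replace(...); that overload is now deprecated.
auto const result = cudf::strings::replace_multiple(cudf::strings_column_view(input),
                                                    cudf::strings_column_view(targets),
                                                    cudf::strings_column_view(repls));
// result is a std::unique_ptr<cudf::column> holding {"the cat", "the cat"}.
```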
Closes #15855

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Shruti Shivakumar (https://github.com/shrshi)
  - Nghia Truong (https://github.com/ttnghia)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/15898
---
 cpp/benchmarks/string/replace.cpp           |  2 +-
 cpp/include/cudf/strings/detail/replace.hpp | 12 +++----
 cpp/include/cudf/strings/replace.hpp        | 14 +++++++-
 cpp/src/strings/replace/multi.cu            | 23 +++++++++----
 cpp/tests/json/json_tests.cpp               |  2 +-
 cpp/tests/streams/strings/replace_test.cpp  |  4 +--
 cpp/tests/strings/replace_tests.cpp         | 33 +++++++++++++++----
 java/src/main/native/src/ColumnViewJni.cpp  |  2 +-
 .../pylibcudf/libcudf/strings/replace.pxd   |  2 +-
 python/cudf/cudf/_lib/strings/replace.pyx   |  3 +-
 10 files changed, 71 insertions(+), 26 deletions(-)

diff --git a/cpp/benchmarks/string/replace.cpp b/cpp/benchmarks/string/replace.cpp
index c8f26142193..3d9d51bfd6d 100644
--- a/cpp/benchmarks/string/replace.cpp
+++ b/cpp/benchmarks/string/replace.cpp
@@ -52,7 +52,7 @@ static void BM_replace(benchmark::State& state, replace_type rt)
     case scalar: cudf::strings::replace(input, target, repl); break;
     case slice: cudf::strings::replace_slice(input, repl, 1, 10); break;
     case multi:
-      cudf::strings::replace(
+      cudf::strings::replace_multiple(
         input, cudf::strings_column_view(targets), cudf::strings_column_view(repls));
       break;
   }
diff --git a/cpp/include/cudf/strings/detail/replace.hpp b/cpp/include/cudf/strings/detail/replace.hpp
index aad89beb47e..481d00f1bce 100644
--- a/cpp/include/cudf/strings/detail/replace.hpp
+++ b/cpp/include/cudf/strings/detail/replace.hpp
@@ -39,14 +39,14 @@ std::unique_ptr<column> replace(strings_column_view const& strings,
                                 rmm::device_async_resource_ref mr);
 
 /**
- * @copydoc cudf::strings::replace(strings_column_view const&, strings_column_view const&,
+ * @copydoc cudf::strings::replace_multiple(strings_column_view const&, strings_column_view const&,
  * strings_column_view const&, rmm::cuda_stream_view, rmm::device_async_resource_ref)
 */
-std::unique_ptr<column> replace(strings_column_view const& strings,
-                                strings_column_view const& targets,
-                                strings_column_view const& repls,
-                                rmm::cuda_stream_view stream,
-                                rmm::device_async_resource_ref mr);
+std::unique_ptr<column> replace_multiple(strings_column_view const& strings,
+                                         strings_column_view const& targets,
+                                         strings_column_view const& repls,
+                                         rmm::cuda_stream_view stream,
+                                         rmm::device_async_resource_ref mr);
 
 /**
 * @brief Replaces any null string entries with the given string.
diff --git a/cpp/include/cudf/strings/replace.hpp b/cpp/include/cudf/strings/replace.hpp
index 9525db44b69..a19aa9be0c0 100644
--- a/cpp/include/cudf/strings/replace.hpp
+++ b/cpp/include/cudf/strings/replace.hpp
@@ -153,7 +153,19 @@ std::unique_ptr<column> replace_slice(
  * @param mr Device memory resource used to allocate the returned column's device memory
  * @return New strings column
  */
-std::unique_ptr<column> replace(
+std::unique_ptr<column> replace_multiple(
+  strings_column_view const& input,
+  strings_column_view const& targets,
+  strings_column_view const& repls,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+
+/**
+ * @copydoc cudf::strings::replace_multiple
+ *
+ * @deprecated since 24.08
+ */
+[[deprecated]] std::unique_ptr<column> replace(
   strings_column_view const& input,
   strings_column_view const& targets,
   strings_column_view const& repls,
diff --git a/cpp/src/strings/replace/multi.cu b/cpp/src/strings/replace/multi.cu
index f4110707c79..8e5c5cf60b8 100644
--- a/cpp/src/strings/replace/multi.cu
+++ b/cpp/src/strings/replace/multi.cu
@@ -499,11 +499,11 @@ std::unique_ptr<column> replace_string_parallel(strings_column_view const& input
 
 }  // namespace
 
-std::unique_ptr<column> replace(strings_column_view const& input,
-                                strings_column_view const& targets,
-                                strings_column_view const& repls,
-                                rmm::cuda_stream_view stream,
-                                rmm::device_async_resource_ref mr)
+std::unique_ptr<column> replace_multiple(strings_column_view const& input,
+                                         strings_column_view const& targets,
+                                         strings_column_view const& repls,
+                                         rmm::cuda_stream_view stream,
+                                         rmm::device_async_resource_ref mr)
 {
   if (input.is_empty()) { return make_empty_column(type_id::STRING); }
   CUDF_EXPECTS(((targets.size() > 0) && (targets.null_count() == 0)),
@@ -524,6 +524,17 @@ std::unique_ptr<column> replace(strings_column_view const& input,
 
 // external API
 
+std::unique_ptr<column> replace_multiple(strings_column_view const& strings,
+                                         strings_column_view const& targets,
+                                         strings_column_view const& repls,
+                                         rmm::cuda_stream_view stream,
+                                         rmm::device_async_resource_ref mr)
+{
+  CUDF_FUNC_RANGE();
+  return detail::replace_multiple(strings, targets, repls, stream, mr);
+}
+
+// deprecated in 24.08
 std::unique_ptr<column> replace(strings_column_view const& strings,
                                 strings_column_view const& targets,
                                 strings_column_view const& repls,
@@ -531,7 +542,7 @@ std::unique_ptr<column> replace(strings_column_view const& strings,
                                 rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::replace(strings, targets, repls, stream, mr);
+  return detail::replace_multiple(strings, targets, repls, stream, mr);
 }
 
 }  // namespace strings
diff --git a/cpp/tests/json/json_tests.cpp b/cpp/tests/json/json_tests.cpp
index 6c9050becc1..e38ca6628f3 100644
--- a/cpp/tests/json/json_tests.cpp
+++ b/cpp/tests/json/json_tests.cpp
@@ -76,7 +76,7 @@ std::unique_ptr<cudf::column> drop_whitespace(cudf::column_view const& col)
   cudf::strings_column_view strings(col);
   cudf::strings_column_view targets(whitespace);
   cudf::strings_column_view replacements(repl);
-  return cudf::strings::replace(strings, targets, replacements);
+  return cudf::strings::replace_multiple(strings, targets, replacements);
 }
 
 struct JsonPathTests : public cudf::test::BaseFixture {};
diff --git a/cpp/tests/streams/strings/replace_test.cpp b/cpp/tests/streams/strings/replace_test.cpp
index fc87460b706..95c1209b5db 100644
--- a/cpp/tests/streams/strings/replace_test.cpp
+++ b/cpp/tests/streams/strings/replace_test.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -34,7 +34,7 @@ TEST_F(StringsReplaceTest, Replace) auto const target = cudf::string_scalar("é", true, cudf::test::get_default_stream()); auto const repl = cudf::string_scalar(" ", true, cudf::test::get_default_stream()); cudf::strings::replace(view, target, repl, -1, cudf::test::get_default_stream()); - cudf::strings::replace(view, view, view, cudf::test::get_default_stream()); + cudf::strings::replace_multiple(view, view, view, cudf::test::get_default_stream()); cudf::strings::replace_slice(view, repl, 1, 2, cudf::test::get_default_stream()); auto const pattern = std::string("[a-z]"); diff --git a/cpp/tests/strings/replace_tests.cpp b/cpp/tests/strings/replace_tests.cpp index 726d9f95c7d..ef4f3bc2b2a 100644 --- a/cpp/tests/strings/replace_tests.cpp +++ b/cpp/tests/strings/replace_tests.cpp @@ -277,6 +277,23 @@ TEST_F(StringsReplaceTest, ReplaceErrors) EXPECT_THROW(cudf::strings::replace(sv, target, null_input), cudf::logic_error); EXPECT_THROW(cudf::strings::replace(sv, null_input, replacement), cudf::logic_error); EXPECT_THROW(cudf::strings::replace(sv, empty_input, replacement), cudf::logic_error); + + auto const empty = cudf::test::strings_column_wrapper(); + auto const ev = cudf::strings_column_view(empty); + auto const targets = cudf::test::strings_column_wrapper({"x"}); + auto const tv = cudf::strings_column_view(targets); + auto const target_null = cudf::test::strings_column_wrapper({""}, {0}); + auto const tv_null = cudf::strings_column_view(target_null); + auto const repls = cudf::test::strings_column_wrapper({"y", "z"}); + auto const rv = cudf::strings_column_view(repls); + auto const repl_null = cudf::test::strings_column_wrapper({""}, {0}); + auto const rv_null = cudf::strings_column_view(repl_null); + + EXPECT_THROW(cudf::strings::replace_multiple(sv, ev, rv), cudf::logic_error); + EXPECT_THROW(cudf::strings::replace_multiple(sv, tv_null, rv), cudf::logic_error); + EXPECT_THROW(cudf::strings::replace_multiple(sv, tv, ev), cudf::logic_error); + EXPECT_THROW(cudf::strings::replace_multiple(sv, tv, rv_null), cudf::logic_error); + EXPECT_THROW(cudf::strings::replace_multiple(sv, tv, rv), cudf::logic_error); } TEST_F(StringsReplaceTest, ReplaceSlice) @@ -341,7 +358,7 @@ TEST_F(StringsReplaceTest, ReplaceMulti) cudf::test::strings_column_wrapper repls({"_ ", "A ", "2 "}); auto repls_view = cudf::strings_column_view(repls); - auto results = cudf::strings::replace(strings_view, targets_view, repls_view); + auto results = cudf::strings::replace_multiple(strings_view, targets_view, repls_view); std::vector h_expected{"_ quick brown fox jumps over _ lazy dog", "_ fat cat lays next 2 _ other accénted cat", @@ -361,7 +378,7 @@ TEST_F(StringsReplaceTest, ReplaceMulti) cudf::test::strings_column_wrapper repls({"* "}); auto repls_view = cudf::strings_column_view(repls); - auto results = cudf::strings::replace(strings_view, targets_view, repls_view); + auto results = cudf::strings::replace_multiple(strings_view, targets_view, repls_view); std::vector h_expected{"* quick brown fox jumps over * lazy dog", "* fat cat lays next * * other accénted cat", @@ -422,7 +439,7 @@ TEST_F(StringsReplaceTest, ReplaceMultiLong) cudf::test::strings_column_wrapper repls({"x", "PEAR", "avocado", "$$"}); auto repls_view = cudf::strings_column_view(repls); - auto results = cudf::strings::replace(strings_view, targets_view, 
repls_view); + auto results = cudf::strings::replace_multiple(strings_view, targets_view, repls_view); cudf::test::strings_column_wrapper expected( {"This string needs to be very long to trigger the long-replace internal functions. " @@ -454,7 +471,7 @@ TEST_F(StringsReplaceTest, ReplaceMultiLong) cudf::test::strings_column_wrapper repls({"*"}); auto repls_view = cudf::strings_column_view(repls); - auto results = cudf::strings::replace(strings_view, targets_view, repls_view); + auto results = cudf::strings::replace_multiple(strings_view, targets_view, repls_view); cudf::test::strings_column_wrapper expected( {"This string needs to be very long to trigger the long-replace internal functions. " @@ -494,7 +511,7 @@ TEST_F(StringsReplaceTest, ReplaceMultiLong) auto repls = cudf::test::strings_column_wrapper({""}); auto repls_view = cudf::strings_column_view(repls); - auto results = cudf::strings::replace(strings_view, targets_view, repls_view); + auto results = cudf::strings::replace_multiple(strings_view, targets_view, repls_view); cudf::test::strings_column_wrapper expected( {"This string needs to be very long to trigger the long-replace internal functions. " @@ -522,6 +539,10 @@ TEST_F(StringsReplaceTest, EmptyStringsColumn) auto strings_view = cudf::strings_column_view(zero_size_strings_column); auto results = cudf::strings::replace( strings_view, cudf::string_scalar("not"), cudf::string_scalar("pertinent")); - auto view = results->view(); + cudf::test::expect_column_empty(results->view()); + + auto const target = cudf::test::strings_column_wrapper({"x"}); + auto const target_view = cudf::strings_column_view(target); + results = cudf::strings::replace_multiple(strings_view, target_view, target_view); cudf::test::expect_column_empty(results->view()); } diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp index 086d4672788..8487fb6dc91 100644 --- a/java/src/main/native/src/ColumnViewJni.cpp +++ b/java/src/main/native/src/ColumnViewJni.cpp @@ -1755,7 +1755,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_stringReplaceMulti( cudf::strings_column_view scvtargets(*cvtargets); cudf::column_view* cvrepls = reinterpret_cast(repls_cv); cudf::strings_column_view scvrepls(*cvrepls); - return release_as_jlong(cudf::strings::replace(scv, scvtargets, scvrepls)); + return release_as_jlong(cudf::strings::replace_multiple(scv, scvtargets, scvrepls)); } CATCH_STD(env, 0); } diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/replace.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/replace.pxd index 92e142b33fc..34e03eec638 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/replace.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/replace.pxd @@ -23,7 +23,7 @@ cdef extern from "cudf/strings/replace.hpp" namespace "cudf::strings" nogil: string_scalar repl, int32_t maxrepl) except + - cdef unique_ptr[column] replace( + cdef unique_ptr[column] replace_multiple( column_view source_strings, column_view target_strings, column_view repl_strings) except + diff --git a/python/cudf/cudf/_lib/strings/replace.pyx b/python/cudf/cudf/_lib/strings/replace.pyx index 880201e65a2..2d9330a8a24 100644 --- a/python/cudf/cudf/_lib/strings/replace.pyx +++ b/python/cudf/cudf/_lib/strings/replace.pyx @@ -12,6 +12,7 @@ from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar from cudf._lib.pylibcudf.libcudf.strings.replace cimport ( replace as cpp_replace, + 
replace_multiple as cpp_replace_multiple, replace_slice as cpp_replace_slice, ) from cudf._lib.pylibcudf.libcudf.types cimport size_type @@ -126,7 +127,7 @@ def replace_multi(Column source_strings, cdef column_view repl_view = repl_strings.view() with nogil: - c_result = move(cpp_replace( + c_result = move(cpp_replace_multiple( source_view, target_view, repl_view From f30ea0a7d12625a755bb5726e7514dfdf12094d6 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Mon, 3 Jun 2024 17:37:56 -0400 Subject: [PATCH 035/340] Use offsetalator in strings shift functor (#15870) Replaces hardcoded `size_type` used for offset values in the `shift_chars_fn` functor with offsetalator. Follow on to #15630 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Paul Mattione (https://github.com/pmattione-nvidia) - Yunsong Wang (https://github.com/PointKernel) URL: https://github.com/rapidsai/cudf/pull/15870 --- cpp/src/strings/copying/shift.cu | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/cpp/src/strings/copying/shift.cu b/cpp/src/strings/copying/shift.cu index 5bba4855390..b386c0860d1 100644 --- a/cpp/src/strings/copying/shift.cu +++ b/cpp/src/strings/copying/shift.cu @@ -67,9 +67,9 @@ struct shift_chars_fn { if (offset < 0) { auto const last_index = -offset; if (idx < last_index) { - auto const first_index = - offset + d_column.child(strings_column_view::offsets_column_index) - .element(d_column.offset() + d_column.size()); + auto const offsets = d_column.child(strings_column_view::offsets_column_index); + auto const off_itr = cudf::detail::input_offsetalator(offsets.head(), offsets.type()); + auto const first_index = offset + off_itr[d_column.offset() + d_column.size()]; return d_column.head()[idx + first_index]; } else { auto const char_index = idx - last_index; @@ -79,9 +79,9 @@ struct shift_chars_fn { if (idx < offset) { return d_filler.data()[idx % d_filler.size_bytes()]; } else { - return d_column.head()[idx - offset + - d_column.child(strings_column_view::offsets_column_index) - .element(d_column.offset())]; + auto const offsets = d_column.child(strings_column_view::offsets_column_index); + auto const off_itr = cudf::detail::input_offsetalator(offsets.head(), offsets.type()); + return d_column.head()[idx - offset + off_itr[d_column.offset()]]; } } } From 90b3094f8a5a12b029a156cf484b673b589d2fec Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Mon, 3 Jun 2024 14:52:46 -0700 Subject: [PATCH 036/340] Clean up pylibcudf test assertations (#15892) Swap the order of result,expected to expected, result for assert_table_eq too Fix a few places where result,expected was swapped for assert_column_eq Authors: - Thomas Li (https://github.com/lithomas1) Approvers: - Matthew Roeschke (https://github.com/mroeschke) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/15892 --- python/cudf/cudf/pylibcudf_tests/common/utils.py | 2 +- python/cudf/cudf/pylibcudf_tests/test_copying.py | 14 +++++++------- python/cudf/cudf/pylibcudf_tests/test_reshape.py | 4 ++-- .../cudf/pylibcudf_tests/test_string_capitalize.py | 6 +++--- 4 files changed, 13 insertions(+), 13 deletions(-) diff --git a/python/cudf/cudf/pylibcudf_tests/common/utils.py b/python/cudf/cudf/pylibcudf_tests/common/utils.py index 0befb3bb3e8..e00053529a8 100644 --- a/python/cudf/cudf/pylibcudf_tests/common/utils.py +++ b/python/cudf/cudf/pylibcudf_tests/common/utils.py @@ -54,7 +54,7 @@ def 
assert_column_eq( assert lhs.equals(rhs) -def assert_table_eq(plc_table: plc.Table, pa_table: pa.Table) -> None: +def assert_table_eq(pa_table: pa.Table, plc_table: plc.Table) -> None: """Verify that a pylibcudf table and PyArrow table are equal.""" plc_shape = (plc_table.num_rows(), plc_table.num_columns()) assert plc_shape == pa_table.shape diff --git a/python/cudf/cudf/pylibcudf_tests/test_copying.py b/python/cudf/cudf/pylibcudf_tests/test_copying.py index ef70869a145..cd70ce4abf5 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_copying.py +++ b/python/cudf/cudf/pylibcudf_tests/test_copying.py @@ -138,7 +138,7 @@ def test_gather(target_table, pa_target_table, index_column, pa_index_column): plc.copying.OutOfBoundsPolicy.DONT_CHECK, ) expected = pa_target_table.take(pa_index_column) - assert_table_eq(result, expected) + assert_table_eq(expected, result) def test_gather_map_has_nulls(target_table): @@ -240,7 +240,7 @@ def test_scatter_table( pa_target_table, ) - assert_table_eq(result, expected) + assert_table_eq(expected, result) def test_scatter_table_num_col_mismatch( @@ -315,7 +315,7 @@ def test_scatter_scalars( pa_target_table, ) - assert_table_eq(result, expected) + assert_table_eq(expected, result) def test_scatter_scalars_num_scalars_mismatch( @@ -574,7 +574,7 @@ def test_slice_table(target_table, pa_target_table): lower_bounds = bounds[::2] result = plc.copying.slice(target_table, bounds) for lb, ub, slice_ in zip(lower_bounds, upper_bounds, result): - assert_table_eq(slice_, pa_target_table[lb:ub]) + assert_table_eq(pa_target_table[lb:ub], slice_) def test_split_column(target_column, pa_target_column): @@ -600,7 +600,7 @@ def test_split_table(target_table, pa_target_table): lower_bounds = [0] + upper_bounds[:-1] result = plc.copying.split(target_table, upper_bounds) for lb, ub, split in zip(lower_bounds, upper_bounds, result): - assert_table_eq(split, pa_target_table[lb:ub]) + assert_table_eq(pa_target_table[lb:ub], split) def test_copy_if_else_column_column( @@ -753,7 +753,7 @@ def test_boolean_mask_scatter_from_table( pa_source_table, pa_mask, pa_target_table ) - assert_table_eq(result, expected) + assert_table_eq(expected, result) def test_boolean_mask_scatter_from_wrong_num_cols(source_table, target_table): @@ -828,7 +828,7 @@ def test_boolean_mask_scatter_from_scalars( pa_target_table, ) - assert_table_eq(result, expected) + assert_table_eq(expected, result) def test_get_element(input_column, pa_input_column): diff --git a/python/cudf/cudf/pylibcudf_tests/test_reshape.py b/python/cudf/cudf/pylibcudf_tests/test_reshape.py index b8b914f3f09..32d79257f4f 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_reshape.py +++ b/python/cudf/cudf/pylibcudf_tests/test_reshape.py @@ -27,7 +27,7 @@ def test_interleave_columns(reshape_data, reshape_plc_tbl): expect = pa.concat_arrays(interleaved_data) - assert_column_eq(res, expect) + assert_column_eq(expect, res) @pytest.mark.parametrize("cnt", [0, 1, 3]) @@ -40,4 +40,4 @@ def test_tile(reshape_data, reshape_plc_tbl, cnt): tiled_data, schema=plc.interop.to_arrow(reshape_plc_tbl).schema ) - assert_table_eq(res, expect) + assert_table_eq(expect, res) diff --git a/python/cudf/cudf/pylibcudf_tests/test_string_capitalize.py b/python/cudf/cudf/pylibcudf_tests/test_string_capitalize.py index dd7e96e871b..818d6e6e72a 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_string_capitalize.py +++ b/python/cudf/cudf/pylibcudf_tests/test_string_capitalize.py @@ -37,7 +37,7 @@ def plc_data(pa_data): def test_capitalize(plc_data, pa_data): got = 
plc.strings.capitalize.capitalize(plc_data) expected = pa.compute.utf8_capitalize(pa_data) - assert_column_eq(got, expected) + assert_column_eq(expected, got) def test_title(plc_data, pa_data): @@ -45,10 +45,10 @@ def test_title(plc_data, pa_data): plc_data, plc.strings.char_types.StringCharacterTypes.CASE_TYPES ) expected = pa.compute.utf8_title(pa_data) - assert_column_eq(got, expected) + assert_column_eq(expected, got) def test_is_title(plc_data, pa_data): got = plc.strings.capitalize.is_title(plc_data) expected = pa.compute.utf8_is_title(pa_data) - assert_column_eq(got, expected) + assert_column_eq(expected, got) From 6176776e1f88718d802b317f506e2b56635fa31a Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Mon, 3 Jun 2024 15:06:39 -0700 Subject: [PATCH 037/340] Improve options docs (#15888) Recently I have answered a few user questions about how to use cudf options for display. We were missing documentation that explained that display options are inherited from pandas. I also found a broken link in the docs. This PR fixes both of those doc-related issues. Authors: - Bradley Dice (https://github.com/bdice) Approvers: - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/15888 --- docs/cudf/source/cudf_pandas/how-it-works.md | 5 ++--- docs/cudf/source/user_guide/api_docs/options.rst | 13 +++++++++++++ docs/cudf/source/user_guide/options.md | 2 +- 3 files changed, 16 insertions(+), 4 deletions(-) diff --git a/docs/cudf/source/cudf_pandas/how-it-works.md b/docs/cudf/source/cudf_pandas/how-it-works.md index ee856c84b78..75f57742ac9 100644 --- a/docs/cudf/source/cudf_pandas/how-it-works.md +++ b/docs/cudf/source/cudf_pandas/how-it-works.md @@ -34,6 +34,5 @@ correct result. Data is automatically transferred from host to device transfers. When using `cudf.pandas`, cuDF's [pandas compatibility -mode](https://docs.rapids.ai/api/cudf/stable/api_docs/options/#available-options) -is automatically enabled, ensuring consistency with pandas-specific -semantics like default sort ordering. +mode](api.options) is automatically enabled, ensuring consistency with +pandas-specific semantics like default sort ordering. diff --git a/docs/cudf/source/user_guide/api_docs/options.rst b/docs/cudf/source/user_guide/api_docs/options.rst index b3a4004e2d9..4c0f6684b76 100644 --- a/docs/cudf/source/user_guide/api_docs/options.rst +++ b/docs/cudf/source/user_guide/api_docs/options.rst @@ -12,6 +12,19 @@ Options and settings cudf.describe_option cudf.option_context +Display options are controlled by pandas +---------------------------------------- + +Options for display are inherited from pandas. This includes commonly accessed options such as: + +- ``display.max_columns`` +- ``display.max_info_rows`` +- ``display.max_rows`` +- ``display.max_seq_items`` + +For example, to show all rows of a DataFrame or Series in a Jupyter notebook, call ``pandas.set_option("display.max_rows", None)``. + +See also the :ref:`full list of pandas display options `. Available options ----------------- diff --git a/docs/cudf/source/user_guide/options.md b/docs/cudf/source/user_guide/options.md index 245d3fd1974..997681212fb 100644 --- a/docs/cudf/source/user_guide/options.md +++ b/docs/cudf/source/user_guide/options.md @@ -11,4 +11,4 @@ When no argument is provided, all options are printed. To set value to a option, use {py:func}`cudf.set_option`. -See the [API reference](api.options) for more details. +See the [options API reference](api.options) for descriptions of the available options. 
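A short sketch of the inherited-options behavior documented above (the series
contents are illustrative only):

```python
import pandas as pd
import cudf

# cudf reads display options from pandas, so configuring pandas also
# changes how cudf objects are rendered.
pd.set_option("display.max_rows", None)  # show every row instead of truncating

s = cudf.Series(range(100))
print(s)  # prints all 100 rows
```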
From 4a0b59133ed56c043fc73d24785f24be0b4fbe69 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 3 Jun 2024 15:08:31 -0700 Subject: [PATCH 038/340] Update Python labels and remove unnecessary ones (#15893) This PR leverages some of the new labels we have for organizing our issues and removes labels that aren't really used at the moment. If reviewers feel strongly I can keep the ci label, but AFAICT that doesn't really get used for anything at the moment and we'll benefit more from leveraging future labels to help direct tasks to the build/infra team vs cudf devs. Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Bradley Dice (https://github.com/bdice) - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/15893 --- .github/labeler.yml | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/.github/labeler.yml b/.github/labeler.yml index d14344384d1..48967417af3 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -1,9 +1,19 @@ # Documentation for config - https://github.com/actions/labeler#common-examples -cuDF (Python): +Python: - 'python/**' - 'notebooks/**' +cudf.pandas: + - 'python/cudf/cudf/pandas/**' + - 'python/cudf/cudf_pandas_tests/**' + +cudf.polars: + - 'python/cudf_polars/**' + +pylibcudf: + - 'python/cudf/cudf/_lib/pylibcudf/**' + libcudf: - 'cpp/**' @@ -12,11 +22,5 @@ CMake: - '**/cmake/**' - '**/*.cmake' -cuDF (Java): +Java: - 'java/**' - -ci: - - 'ci/**' - -conda: - - 'conda/**' From 382de32e8137a3a59a0800f46ef8a1de62b1a6e5 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 3 Jun 2024 15:14:52 -0700 Subject: [PATCH 039/340] Add support for additional metaclasses of proxies and use for ExcelWriter (#15399) The ExcelWriter supports the abstract os.PathLike interface, but we would also like that support to be reflected in the class's MRO. Doing so is slightly complicated because os.PathLike is an ABC, and as such has a different metaclass. Therefore, in order to add os.PathLike as a base class, we must also generate a suitable combined metaclass for our ExcelWriter wrapper. This change ensures the `isinstance(pd.ExcelWriter(...), os.PathLike)` returns `True` when using cudf.pandas. Authors: - Vyas Ramasubramani (https://github.com/vyasr) - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/15399 --- python/cudf/cudf/pandas/_wrappers/pandas.py | 11 +++++-- python/cudf/cudf/pandas/fast_slow_proxy.py | 30 +++++++------------ .../cudf_pandas_tests/test_cudf_pandas.py | 5 ++++ 3 files changed, 25 insertions(+), 21 deletions(-) diff --git a/python/cudf/cudf/pandas/_wrappers/pandas.py b/python/cudf/cudf/pandas/_wrappers/pandas.py index 2e3880e14f6..698dd946022 100644 --- a/python/cudf/cudf/pandas/_wrappers/pandas.py +++ b/python/cudf/cudf/pandas/_wrappers/pandas.py @@ -1,8 +1,10 @@ # SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. # All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 +import abc import copyreg import importlib +import os import pickle import sys @@ -857,7 +859,12 @@ def Index__new__(cls, *args, **kwargs): pd.ExcelWriter, fast_to_slow=_Unusable(), slow_to_fast=_Unusable(), - additional_attributes={"__hash__": _FastSlowAttribute("__hash__")}, + additional_attributes={ + "__hash__": _FastSlowAttribute("__hash__"), + "__fspath__": _FastSlowAttribute("__fspath__"), + }, + bases=(os.PathLike,), + metaclasses=(abc.ABCMeta,), ) try: @@ -1032,7 +1039,7 @@ def holiday_calendar_factory_wrapper(*args, **kwargs): fast_to_slow=_Unusable(), slow_to_fast=_Unusable(), additional_attributes={"__hash__": _FastSlowAttribute("__hash__")}, - meta_class=pd_HolidayCalendarMetaClass, + metaclasses=(pd_HolidayCalendarMetaClass,), ) Holiday = make_final_proxy_type( diff --git a/python/cudf/cudf/pandas/fast_slow_proxy.py b/python/cudf/cudf/pandas/fast_slow_proxy.py index 94caec1ce6c..169dd80e132 100644 --- a/python/cudf/cudf/pandas/fast_slow_proxy.py +++ b/python/cudf/cudf/pandas/fast_slow_proxy.py @@ -106,19 +106,6 @@ def __call__(self): _DELETE = object() -def create_composite_metaclass(base_meta, additional_meta): - """ - Dynamically creates a composite metaclass that inherits from both provided metaclasses. - This ensures that the metaclass behaviors of both base_meta and additional_meta are preserved. - """ - - class CompositeMeta(base_meta, additional_meta): - def __new__(cls, name, bases, namespace): - return super().__new__(cls, name, bases, namespace) - - return CompositeMeta - - def make_final_proxy_type( name: str, fast_type: type, @@ -130,7 +117,7 @@ def make_final_proxy_type( additional_attributes: Mapping[str, Any] | None = None, postprocess: Callable[[_FinalProxy, Any, Any], Any] | None = None, bases: Tuple = (), - meta_class=None, + metaclasses: Tuple = (), ) -> Type[_FinalProxy]: """ Defines a fast-slow proxy type for a pair of "final" fast and slow @@ -161,6 +148,8 @@ def make_final_proxy_type( construct said unwrapped object. See also `_maybe_wrap_result`. bases Optional tuple of base classes to insert into the mro. + metaclasses + Optional tuple of metaclasses to unify with the base proxy metaclass. 
Notes ----- @@ -241,15 +230,18 @@ def _fsproxy_state(self) -> _State: cls_dict[slow_name] = _FastSlowAttribute( slow_name, private=slow_name.startswith("_") ) - if meta_class is None: - meta_class = _FastSlowProxyMeta - else: - meta_class = create_composite_metaclass(_FastSlowProxyMeta, meta_class) + metaclass = _FastSlowProxyMeta + if metaclasses: + metaclass = types.new_class( # type: ignore + f"{name}_Meta", + metaclasses + (_FastSlowProxyMeta,), + {}, + ) cls = types.new_class( name, (*bases, _FinalProxy), - {"metaclass": meta_class}, + {"metaclass": metaclass}, lambda ns: ns.update(cls_dict), ) functools.update_wrapper( diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py index 75bceea3034..fef829b17fc 100644 --- a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py +++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py @@ -6,6 +6,7 @@ import copy import datetime import operator +import os import pathlib import pickle import tempfile @@ -1421,3 +1422,7 @@ def test_holidays_within_dates(holiday, start, expected): utc.localize(xpd.Timestamp(start)), ) ) == [utc.localize(dt) for dt in expected] + + +def test_excelwriter_pathlike(): + assert isinstance(pd.ExcelWriter("foo.xlsx"), os.PathLike) From eb460169786665b1624cb6c4f9b502b800810b37 Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Tue, 4 Jun 2024 06:32:49 -0500 Subject: [PATCH 040/340] Migrate column factories to pylibcudf (#15257) This PR implements `column_factories.hpp` using `pylibcudf` and migrates the cuDF cython to use them cc @vyasr Authors: - https://github.com/brandon-b-miller - Lawrence Mitchell (https://github.com/wence-) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/15257 --- cpp/src/column/column_factories.cpp | 17 +- cpp/tests/column/factories_test.cpp | 4 +- cpp/tests/fixed_point/fixed_point_tests.cpp | 2 +- .../api_docs/pylibcudf/column_factories.rst | 6 + .../user_guide/api_docs/pylibcudf/index.rst | 1 + python/cudf/cudf/_lib/column.pyx | 21 +- .../cudf/cudf/_lib/pylibcudf/CMakeLists.txt | 1 + python/cudf/cudf/_lib/pylibcudf/__init__.pxd | 2 + python/cudf/cudf/_lib/pylibcudf/__init__.py | 4 +- .../cudf/_lib/pylibcudf/column_factories.pxd | 52 ++++ .../cudf/_lib/pylibcudf/column_factories.pyx | 205 ++++++++++++++ python/cudf/cudf/_lib/pylibcudf/interop.pyx | 82 ++++++ .../libcudf/column/column_factories.pxd | 73 ++++- python/cudf/cudf/_lib/pylibcudf/types.pxd | 1 + python/cudf/cudf/_lib/pylibcudf/types.pyx | 3 +- .../pylibcudf_tests/test_column_factories.py | 253 ++++++++++++++++++ .../cudf/cudf/pylibcudf_tests/test_interop.py | 69 +++++ 17 files changed, 767 insertions(+), 29 deletions(-) create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/column_factories.rst create mode 100644 python/cudf/cudf/_lib/pylibcudf/column_factories.pxd create mode 100644 python/cudf/cudf/_lib/pylibcudf/column_factories.pyx create mode 100644 python/cudf/cudf/pylibcudf_tests/test_column_factories.py create mode 100644 python/cudf/cudf/pylibcudf_tests/test_interop.py diff --git a/cpp/src/column/column_factories.cpp b/cpp/src/column/column_factories.cpp index e40056fc8a1..0260068d4db 100644 --- a/cpp/src/column/column_factories.cpp +++ b/cpp/src/column/column_factories.cpp @@ -65,7 +65,8 @@ std::size_t size_of(data_type element_type) std::unique_ptr make_empty_column(data_type type) { CUDF_EXPECTS(type.id() 
== type_id::EMPTY || !cudf::is_nested(type), - "make_empty_column is invalid to call on nested types"); + "make_empty_column is invalid to call on nested types", + cudf::data_type_error); return std::make_unique(type, 0, rmm::device_buffer{}, rmm::device_buffer{}, 0); } @@ -80,7 +81,9 @@ std::unique_ptr make_numeric_column(data_type type, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - CUDF_EXPECTS(is_numeric(type), "Invalid, non-numeric type."); + CUDF_EXPECTS(type.id() != type_id::EMPTY && is_numeric(type), + "Invalid, non-numeric type.", + cudf::data_type_error); CUDF_EXPECTS(size >= 0, "Column size cannot be negative."); return std::make_unique( @@ -100,7 +103,7 @@ std::unique_ptr make_fixed_point_column(data_type type, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - CUDF_EXPECTS(is_fixed_point(type), "Invalid, non-fixed_point type."); + CUDF_EXPECTS(is_fixed_point(type), "Invalid, non-fixed_point type.", cudf::data_type_error); CUDF_EXPECTS(size >= 0, "Column size cannot be negative."); return std::make_unique( @@ -120,7 +123,7 @@ std::unique_ptr make_timestamp_column(data_type type, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - CUDF_EXPECTS(is_timestamp(type), "Invalid, non-timestamp type."); + CUDF_EXPECTS(is_timestamp(type), "Invalid, non-timestamp type.", cudf::data_type_error); CUDF_EXPECTS(size >= 0, "Column size cannot be negative."); return std::make_unique( @@ -140,7 +143,7 @@ std::unique_ptr make_duration_column(data_type type, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - CUDF_EXPECTS(is_duration(type), "Invalid, non-duration type."); + CUDF_EXPECTS(is_duration(type), "Invalid, non-duration type.", cudf::data_type_error); CUDF_EXPECTS(size >= 0, "Column size cannot be negative."); return std::make_unique( @@ -160,7 +163,9 @@ std::unique_ptr make_fixed_width_column(data_type type, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - CUDF_EXPECTS(is_fixed_width(type), "Invalid, non-fixed-width type."); + CUDF_EXPECTS(type.id() != type_id::EMPTY && is_fixed_width(type), + "Invalid, non-fixed-width type.", + cudf::data_type_error); // clang-format off if (is_timestamp (type)) return make_timestamp_column (type, size, state, stream, mr); diff --git a/cpp/tests/column/factories_test.cpp b/cpp/tests/column/factories_test.cpp index afebc91dd73..dca36eaa4e7 100644 --- a/cpp/tests/column/factories_test.cpp +++ b/cpp/tests/column/factories_test.cpp @@ -164,7 +164,7 @@ TEST_P(NonNumericFactoryTest, NonNumericThrow) auto column = cudf::make_numeric_column( cudf::data_type{GetParam()}, this->size(), cudf::mask_state::UNALLOCATED); }; - EXPECT_THROW(construct(), cudf::logic_error); + EXPECT_THROW(construct(), cudf::data_type_error); } INSTANTIATE_TEST_CASE_P(NonNumeric, @@ -307,7 +307,7 @@ TEST_P(NonFixedWidthFactoryTest, NonFixedWidthThrow) auto column = cudf::make_fixed_width_column( cudf::data_type{GetParam()}, this->size(), cudf::mask_state::UNALLOCATED); }; - EXPECT_THROW(construct(), cudf::logic_error); + EXPECT_THROW(construct(), cudf::data_type_error); } INSTANTIATE_TEST_CASE_P(NonFixedWidth, diff --git a/cpp/tests/fixed_point/fixed_point_tests.cpp b/cpp/tests/fixed_point/fixed_point_tests.cpp index 73de1fbaa68..ab7984d4b03 100644 --- a/cpp/tests/fixed_point/fixed_point_tests.cpp +++ b/cpp/tests/fixed_point/fixed_point_tests.cpp @@ -498,7 +498,7 @@ TYPED_TEST(FixedPointTestAllReps, FixedPointColumnWrapper) TYPED_TEST(FixedPointTestAllReps, NoScaleOrWrongTypeID) { 
EXPECT_THROW(cudf::make_fixed_point_column(cudf::data_type{cudf::type_id::INT32}, 0), - cudf::logic_error); + cudf::data_type_error); } TYPED_TEST(FixedPointTestAllReps, SimpleFixedPointColumnWrapper) diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/column_factories.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/column_factories.rst new file mode 100644 index 00000000000..c858135b6ce --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/column_factories.rst @@ -0,0 +1,6 @@ +================ +column_factories +================ + +.. automodule:: cudf._lib.pylibcudf.column_factories + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst index 26875ce7d12..58fea77adaa 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst @@ -11,6 +11,7 @@ This page provides API documentation for pylibcudf. aggregation binaryop column + column_factories concatenate copying filling diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index f33e121241d..7155017b7af 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -39,14 +39,10 @@ from cudf._lib.types cimport ( from cudf._lib.null_mask import bitmask_allocation_size_bytes from cudf._lib.types import dtype_from_pylibcudf_column -# TODO: We currently need this for "casting" empty pylibcudf columns in -# from_pylibcudf by instead creating an empty numeric column. We will be able -# to remove this once column factories are exposed to pylibcudf. cimport cudf._lib.pylibcudf.libcudf.copying as cpp_copying cimport cudf._lib.pylibcudf.libcudf.types as libcudf_types cimport cudf._lib.pylibcudf.libcudf.unary as libcudf_unary -from cudf._lib.pylibcudf cimport Column as plc_Column from cudf._lib.pylibcudf.libcudf.column.column cimport column, column_contents from cudf._lib.pylibcudf.libcudf.column.column_factories cimport ( make_column_from_scalar as cpp_make_column_from_scalar, @@ -623,22 +619,17 @@ cdef class Column: pylibcudf.Column A new pylibcudf.Column referencing the same data. """ - cdef libcudf_types.data_type new_dtype if col.type().id() == pylibcudf.TypeId.TIMESTAMP_DAYS: col = pylibcudf.unary.cast( col, pylibcudf.DataType(pylibcudf.TypeId.TIMESTAMP_SECONDS) ) elif col.type().id() == pylibcudf.TypeId.EMPTY: - new_dtype = libcudf_types.data_type(libcudf_types.type_id.INT8) - # TODO: This function call is what requires cimporting pylibcudf. - # We can remove the cimport once we can directly do - # pylibcudf.column_factories.make_numeric_column or equivalent. 
- col = plc_Column.from_libcudf( - move( - make_numeric_column( - new_dtype, col.size(), libcudf_types.mask_state.ALL_NULL - ) - ) + new_dtype = pylibcudf.DataType(pylibcudf.TypeId.INT8) + + col = pylibcudf.column_factories.make_numeric_column( + new_dtype, + col.size(), + pylibcudf.column_factories.MaskState.ALL_NULL ) dtype = dtype_from_pylibcudf_column(col) diff --git a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt index eff14ad549b..7d0676f6def 100644 --- a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt +++ b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt @@ -16,6 +16,7 @@ set(cython_sources aggregation.pyx binaryop.pyx column.pyx + column_factories.pyx concatenate.pyx copying.pyx filling.pyx diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd index 4f77f8cbaef..b289d112a90 100644 --- a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd @@ -4,6 +4,7 @@ from . cimport ( aggregation, binaryop, + column_factories, concatenate, copying, filling, @@ -40,6 +41,7 @@ __all__ = [ "binaryop", "concatenate", "copying", + "column_factories", "filling", "gpumemoryview", "groupby", diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.py b/python/cudf/cudf/_lib/pylibcudf/__init__.py index 048b62b6013..2565332f3ed 100644 --- a/python/cudf/cudf/_lib/pylibcudf/__init__.py +++ b/python/cudf/cudf/_lib/pylibcudf/__init__.py @@ -3,6 +3,7 @@ from . import ( aggregation, binaryop, + column_factories, concatenate, copying, filling, @@ -27,7 +28,7 @@ from .gpumemoryview import gpumemoryview from .scalar import Scalar from .table import Table -from .types import DataType, TypeId +from .types import DataType, MaskState, TypeId __all__ = [ "Column", @@ -39,6 +40,7 @@ "binaryop", "concatenate", "copying", + "column_factories", "filling", "gpumemoryview", "groupby", diff --git a/python/cudf/cudf/_lib/pylibcudf/column_factories.pxd b/python/cudf/cudf/_lib/pylibcudf/column_factories.pxd new file mode 100644 index 00000000000..9dbd74ab16c --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/column_factories.pxd @@ -0,0 +1,52 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move + +from cudf._lib.pylibcudf.libcudf.types cimport mask_state, size_type + +from .column cimport Column +from .types cimport DataType, size_type, type_id + +ctypedef fused MakeEmptyColumnOperand: + DataType + type_id + object + +ctypedef fused MaskArg: + mask_state + object + + +cpdef Column make_empty_column( + MakeEmptyColumnOperand type_or_id +) + +cpdef Column make_numeric_column( + DataType type_, + size_type size, + MaskArg mask, +) + +cpdef Column make_fixed_point_column( + DataType type_, + size_type size, + MaskArg mask, +) + +cpdef Column make_timestamp_column( + DataType type_, + size_type size, + MaskArg mask, +) + +cpdef Column make_duration_column( + DataType type_, + size_type size, + MaskArg mask, +) + +cpdef Column make_fixed_width_column( + DataType type_, + size_type size, + MaskArg mask, +) diff --git a/python/cudf/cudf/_lib/pylibcudf/column_factories.pyx b/python/cudf/cudf/_lib/pylibcudf/column_factories.pyx new file mode 100644 index 00000000000..ef7f512f0e5 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/column_factories.pyx @@ -0,0 +1,205 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move + +from cudf._lib.pylibcudf.libcudf.column.column cimport column +from cudf._lib.pylibcudf.libcudf.column.column_factories cimport ( + make_duration_column as cpp_make_duration_column, + make_empty_column as cpp_make_empty_column, + make_fixed_point_column as cpp_make_fixed_point_column, + make_fixed_width_column as cpp_make_fixed_width_column, + make_numeric_column as cpp_make_numeric_column, + make_timestamp_column as cpp_make_timestamp_column, +) +from cudf._lib.pylibcudf.libcudf.types cimport mask_state, size_type + +from .types cimport DataType, type_id + +from .types import MaskState, TypeId + + +cpdef Column make_empty_column(MakeEmptyColumnOperand type_or_id): + cdef unique_ptr[column] result + cdef type_id id + + if MakeEmptyColumnOperand is object: + if isinstance(type_or_id, TypeId): + id = type_or_id + with nogil: + result = move( + cpp_make_empty_column( + id + ) + ) + else: + raise TypeError( + "Must pass a TypeId or DataType" + ) + elif MakeEmptyColumnOperand is DataType: + with nogil: + result = move( + cpp_make_empty_column( + type_or_id.c_obj + ) + ) + elif MakeEmptyColumnOperand is type_id: + with nogil: + result = move( + cpp_make_empty_column( + type_or_id + ) + ) + else: + raise TypeError( + "Must pass a TypeId or DataType" + ) + return Column.from_libcudf(move(result)) + + +cpdef Column make_numeric_column( + DataType type_, + size_type size, + MaskArg mstate +): + + cdef unique_ptr[column] result + cdef mask_state state + + if MaskArg is object: + if isinstance(mstate, MaskState): + state = mstate + else: + raise TypeError("Invalid mask argument") + elif MaskArg is mask_state: + state = mstate + else: + raise TypeError("Invalid mask argument") + with nogil: + result = move( + cpp_make_numeric_column( + type_.c_obj, + size, + state + ) + ) + + return Column.from_libcudf(move(result)) + +cpdef Column make_fixed_point_column( + DataType type_, + size_type size, + MaskArg mstate +): + + cdef unique_ptr[column] result + cdef mask_state state + + if MaskArg is object: + if isinstance(mstate, MaskState): + state = mstate + else: + raise TypeError("Invalid mask argument") + elif MaskArg is mask_state: + state = mstate + else: + raise TypeError("Invalid mask argument") + with nogil: + result = move( + cpp_make_fixed_point_column( + type_.c_obj, + size, + state + ) + ) + + return Column.from_libcudf(move(result)) + + +cpdef Column make_timestamp_column( + DataType type_, + size_type size, + MaskArg mstate +): + + cdef unique_ptr[column] result + cdef mask_state state + + if MaskArg is object: + if isinstance(mstate, MaskState): + state = mstate + else: + raise TypeError("Invalid mask argument") + elif MaskArg is mask_state: + state = mstate + else: + raise TypeError("Invalid mask argument") + with nogil: + result = move( + cpp_make_timestamp_column( + type_.c_obj, + size, + state + ) + ) + + return Column.from_libcudf(move(result)) + + +cpdef Column make_duration_column( + DataType type_, + size_type size, + MaskArg mstate +): + + cdef unique_ptr[column] result + cdef mask_state state + + if MaskArg is object: + if isinstance(mstate, MaskState): + state = mstate + else: + raise TypeError("Invalid mask argument") + elif MaskArg is mask_state: + state = mstate + else: + raise TypeError("Invalid mask argument") + with nogil: + result = move( + cpp_make_duration_column( + type_.c_obj, + size, + state + ) + ) + + return Column.from_libcudf(move(result)) + + +cpdef Column make_fixed_width_column( + DataType 
type_, + size_type size, + MaskArg mstate +): + + cdef unique_ptr[column] result + cdef mask_state state + + if MaskArg is object: + if isinstance(mstate, MaskState): + state = mstate + else: + raise TypeError("Invalid mask argument") + elif MaskArg is mask_state: + state = mstate + else: + raise TypeError("Invalid mask argument") + with nogil: + result = move( + cpp_make_fixed_width_column( + type_.c_obj, + size, + state + ) + ) + + return Column.from_libcudf(move(result)) diff --git a/python/cudf/cudf/_lib/pylibcudf/interop.pyx b/python/cudf/cudf/_lib/pylibcudf/interop.pyx index f172080cece..1e4102e4b64 100644 --- a/python/cudf/cudf/_lib/pylibcudf/interop.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/interop.pyx @@ -33,6 +33,33 @@ from .scalar cimport Scalar from .table cimport Table from .types cimport DataType, type_id +ARROW_TO_PYLIBCUDF_TYPES = { + pa.int8(): type_id.INT8, + pa.int16(): type_id.INT16, + pa.int32(): type_id.INT32, + pa.int64(): type_id.INT64, + pa.uint8(): type_id.UINT8, + pa.uint16(): type_id.UINT16, + pa.uint32(): type_id.UINT32, + pa.uint64(): type_id.UINT64, + pa.float32(): type_id.FLOAT32, + pa.float64(): type_id.FLOAT64, + pa.bool_(): type_id.BOOL8, + pa.string(): type_id.STRING, + pa.duration('s'): type_id.DURATION_SECONDS, + pa.duration('ms'): type_id.DURATION_MILLISECONDS, + pa.duration('us'): type_id.DURATION_MICROSECONDS, + pa.duration('ns'): type_id.DURATION_NANOSECONDS, + pa.timestamp('s'): type_id.TIMESTAMP_SECONDS, + pa.timestamp('ms'): type_id.TIMESTAMP_MILLISECONDS, + pa.timestamp('us'): type_id.TIMESTAMP_MICROSECONDS, + pa.timestamp('ns'): type_id.TIMESTAMP_NANOSECONDS, + pa.date32(): type_id.TIMESTAMP_DAYS, +} + +LIBCUDF_TO_ARROW_TYPES = { + v: k for k, v in ARROW_TO_PYLIBCUDF_TYPES.items() +} cdef column_metadata _metadata_to_libcudf(metadata): """Convert a ColumnMetadata object to C++ column_metadata. @@ -77,6 +104,21 @@ def from_arrow(pyarrow_object, *, DataType data_type=None): raise TypeError("from_arrow only accepts Table and Scalar objects") +@from_arrow.register(pa.DataType) +def _from_arrow_datatype(pyarrow_object): + if isinstance(pyarrow_object, pa.Decimal128Type): + return DataType(type_id.DECIMAL128, scale=-pyarrow_object.scale) + elif isinstance(pyarrow_object, pa.StructType): + return DataType(type_id.STRUCT) + elif isinstance(pyarrow_object, pa.ListType): + return DataType(type_id.LIST) + else: + try: + return DataType(ARROW_TO_PYLIBCUDF_TYPES[pyarrow_object]) + except KeyError: + raise TypeError(f"Unable to convert {pyarrow_object} to cudf datatype") + + @from_arrow.register(pa.Table) def _from_arrow_table(pyarrow_object, *, DataType data_type=None): if data_type is not None: @@ -170,6 +212,46 @@ def to_arrow(cudf_object, metadata=None): raise TypeError("to_arrow only accepts Table and Scalar objects") +@to_arrow.register(DataType) +def _to_arrow_datatype(cudf_object, **kwargs): + """ + Convert a datatype to arrow. + + Translation of some types requires extra information as a keyword + argument. 
Specifically: + + - When translating a decimal type, provide ``precision`` + - When translating a struct type, provide ``fields`` + - When translating a list type, provide the wrapped ``value_type`` + """ + if cudf_object.id() in {type_id.DECIMAL32, type_id.DECIMAL64, type_id.DECIMAL128}: + if not (precision := kwargs.get("precision")): + raise ValueError( + "Precision must be provided for decimal types" + ) + # no pa.decimal32 or pa.decimal64 + return pa.decimal128(precision, -cudf_object.scale()) + elif cudf_object.id() == type_id.STRUCT: + if not (fields := kwargs.get("fields")): + raise ValueError( + "Fields must be provided for struct types" + ) + return pa.struct(fields) + elif cudf_object.id() == type_id.LIST: + if not (value_type := kwargs.get("value_type")): + raise ValueError( + "Value type must be provided for list types" + ) + return pa.list_(value_type) + else: + try: + return ARROW_TO_PYLIBCUDF_TYPES[cudf_object.id()] + except KeyError: + raise TypeError( + f"Unable to convert {cudf_object.id()} to arrow datatype" + ) + + @to_arrow.register(Table) def _to_arrow_table(cudf_object, metadata=None): if metadata is None: diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/column/column_factories.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/column/column_factories.pxd index fd22d92cb30..2faff21a77b 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/column/column_factories.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/column/column_factories.pxd @@ -2,9 +2,17 @@ from libcpp.memory cimport unique_ptr +from rmm._lib.device_buffer cimport device_buffer + from cudf._lib.pylibcudf.libcudf.column.column cimport column from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar -from cudf._lib.pylibcudf.libcudf.types cimport data_type, mask_state, size_type +from cudf._lib.pylibcudf.libcudf.types cimport ( + bitmask_type, + data_type, + mask_state, + size_type, + type_id, +) cdef extern from "cudf/column/column_factories.hpp" namespace "cudf" nogil: @@ -12,5 +20,64 @@ cdef extern from "cudf/column/column_factories.hpp" namespace "cudf" nogil: size_type size, mask_state state) except + - cdef unique_ptr[column] make_column_from_scalar (const scalar & s, - size_type size) except + + cdef unique_ptr[column] make_numeric_column(data_type type, + size_type size, + device_buffer mask, + size_type null_count) except + + + cdef unique_ptr[column] make_fixed_point_column( + data_type type, + size_type size, + mask_state state) except + + + cdef unique_ptr[column] make_fixed_point_column( + data_type type, + size_type size, + device_buffer mask, + size_type null_count) except + + + cdef unique_ptr[column] make_timestamp_column( + data_type type, + size_type size, + mask_state state) except + + + cdef unique_ptr[column] make_timestamp_column( + data_type type, + size_type size, + device_buffer mask, + size_type null_count) except + + + cdef unique_ptr[column] make_duration_column( + data_type type, + size_type size, + mask_state state) except + + + cdef unique_ptr[column] make_duration_column( + data_type type, + size_type size, + device_buffer mask, + size_type null_count) except + + + cdef unique_ptr[column] make_fixed_width_column( + data_type type, + size_type size, + mask_state state) except + + + cdef unique_ptr[column] make_fixed_width_column( + data_type type, + size_type size, + device_buffer mask, + size_type null_count) except + + + cdef unique_ptr[column] make_column_from_scalar(const scalar& s, + size_type size) except + + + cdef unique_ptr[column] 
make_dictionary_from_scalar(const scalar& s, + size_type size) except + + + cdef unique_ptr[column] make_empty_column(type_id id) except + + cdef unique_ptr[column] make_empty_column(data_type type_) except + + + cdef unique_ptr[column] make_dictionary_column( + unique_ptr[column] keys_column, + unique_ptr[column] indices_column) except + diff --git a/python/cudf/cudf/_lib/pylibcudf/types.pxd b/python/cudf/cudf/_lib/pylibcudf/types.pxd index e54a259819e..7d3ddca14a1 100644 --- a/python/cudf/cudf/_lib/pylibcudf/types.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/types.pxd @@ -13,6 +13,7 @@ from cudf._lib.pylibcudf.libcudf.types cimport ( null_order, null_policy, order, + size_type, sorted, type_id, ) diff --git a/python/cudf/cudf/_lib/pylibcudf/types.pyx b/python/cudf/cudf/_lib/pylibcudf/types.pyx index a5248ad0a1f..6dbb287f3c4 100644 --- a/python/cudf/cudf/_lib/pylibcudf/types.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/types.pyx @@ -8,6 +8,7 @@ from cudf._lib.pylibcudf.libcudf.types import type_id as TypeId # no-cython-lin from cudf._lib.pylibcudf.libcudf.types import nan_policy as NanPolicy # no-cython-lint, isort:skip from cudf._lib.pylibcudf.libcudf.types import null_policy as NullPolicy # no-cython-lint, isort:skip from cudf._lib.pylibcudf.libcudf.types import interpolation as Interpolation # no-cython-lint, isort:skip +from cudf._lib.pylibcudf.libcudf.types import mask_state as MaskState # no-cython-lint, isort:skip from cudf._lib.pylibcudf.libcudf.types import nan_equality as NanEquality # no-cython-lint, isort:skip from cudf._lib.pylibcudf.libcudf.types import null_equality as NullEquality # no-cython-lint, isort:skip from cudf._lib.pylibcudf.libcudf.types import null_order as NullOrder # no-cython-lint, isort:skip @@ -22,7 +23,7 @@ cdef class DataType: Parameters ---------- - id : TypeId + id : type_id The type's identifier scale : int The scale associated with the data. Only used for decimal data types. diff --git a/python/cudf/cudf/pylibcudf_tests/test_column_factories.py b/python/cudf/cudf/pylibcudf_tests/test_column_factories.py new file mode 100644 index 00000000000..4c05770a41f --- /dev/null +++ b/python/cudf/cudf/pylibcudf_tests/test_column_factories.py @@ -0,0 +1,253 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +import pyarrow as pa +import pytest +from utils import DEFAULT_STRUCT_TESTING_TYPE, assert_column_eq + +from cudf._lib import pylibcudf as plc + +EMPTY_COL_SIZE = 3 + +NUMERIC_TYPES = [ + pa.uint8(), + pa.uint16(), + pa.uint32(), + pa.uint64(), + pa.int8(), + pa.int16(), + pa.int32(), + pa.int64(), + pa.float32(), + pa.float64(), + pa.bool_(), +] + +TIMESTAMP_TYPES = [ + pa.timestamp("s"), + pa.timestamp("ms"), + pa.timestamp("us"), + pa.timestamp("ns"), +] + +DURATION_TYPES = [ + pa.duration("s"), + pa.duration("ms"), + pa.duration("us"), + pa.duration("ns"), +] + +DECIMAL_TYPES = [pa.decimal128(38, 2)] + +STRING_TYPES = [pa.string()] +STRUCT_TYPES = [DEFAULT_STRUCT_TESTING_TYPE] +LIST_TYPES = [pa.list_(pa.int64())] + +ALL_TYPES = ( + NUMERIC_TYPES + + TIMESTAMP_TYPES + + DURATION_TYPES + + STRING_TYPES + + DECIMAL_TYPES + + STRUCT_TYPES + + LIST_TYPES +) + + +@pytest.fixture(scope="module", params=NUMERIC_TYPES, ids=repr) +def numeric_pa_type(request): + return request.param + + +@pytest.fixture( + scope="module", + params=DECIMAL_TYPES, + ids=repr, +) +def fixed_point_pa_type(request): + return request.param + + +@pytest.fixture( + scope="module", + params=TIMESTAMP_TYPES, + ids=repr, +) +def timestamp_pa_type(request): + return request.param + + +@pytest.fixture( + scope="module", + params=DURATION_TYPES, + ids=repr, +) +def duration_pa_type(request): + return request.param + + +@pytest.fixture( + scope="module", + params=[ + plc.MaskState.UNALLOCATED, + plc.MaskState.ALL_VALID, + plc.MaskState.ALL_NULL, + plc.MaskState.UNINITIALIZED, + ], + ids=["unallocated", "all_valid", "all_null", "uninitialized"], +) +def mask_state(request): + return request.param + + +def test_make_empty_column_dtype(pa_type): + pa_col = pa.array([], type=pa_type) + + plc_type = plc.interop.from_arrow(pa_col).type() + + if isinstance(pa_type, (pa.ListType, pa.StructType)): + with pytest.raises(ValueError): + plc.column_factories.make_empty_column(plc_type) + return + + cudf_col = plc.column_factories.make_empty_column(plc_type) + assert_column_eq(cudf_col, pa_col) + + +def test_make_empty_column_typeid(pa_type): + pa_col = pa.array([], type=pa_type) + + tid = plc.interop.from_arrow(pa_col).type().id() + + if isinstance(pa_type, (pa.ListType, pa.StructType)): + with pytest.raises(ValueError): + plc.column_factories.make_empty_column(tid) + return + + cudf_col = plc.column_factories.make_empty_column(tid) + assert_column_eq(cudf_col, pa_col) + + +def validate_empty_column(col, mask_state, dtype): + assert col.size() == EMPTY_COL_SIZE + + if mask_state == plc.types.MaskState.UNALLOCATED: + assert col.null_count() == 0 + elif mask_state == plc.types.MaskState.ALL_VALID: + assert col.null_count() == 0 + elif mask_state == plc.types.MaskState.ALL_NULL: + assert col.null_count() == EMPTY_COL_SIZE + + assert plc.interop.to_arrow(col).type == dtype + + +def test_make_numeric_column(numeric_pa_type, mask_state): + plc_type = plc.interop.from_arrow(numeric_pa_type) + + got = plc.column_factories.make_numeric_column( + plc_type, EMPTY_COL_SIZE, mask_state + ) + validate_empty_column(got, mask_state, numeric_pa_type) + + +@pytest.mark.parametrize( + "non_numeric_pa_type", [t for t in ALL_TYPES if t not in NUMERIC_TYPES] +) +def test_make_numeric_column_dtype_err(non_numeric_pa_type): + plc_type = plc.interop.from_arrow(non_numeric_pa_type) + with pytest.raises(ValueError): + plc.column_factories.make_numeric_column( + plc_type, 3, plc.types.MaskState.UNALLOCATED + ) + + +def 
test_make_numeric_column_negative_size_err(numeric_pa_type): + plc_type = plc.interop.from_arrow(numeric_pa_type) + with pytest.raises(RuntimeError): + plc.column_factories.make_numeric_column( + plc_type, -1, plc.types.MaskState.UNALLOCATED + ) + + +def test_make_fixed_point_column(fixed_point_pa_type, mask_state): + plc_type = plc.interop.from_arrow(fixed_point_pa_type) + + got = plc.column_factories.make_fixed_point_column( + plc_type, EMPTY_COL_SIZE, mask_state + ) + + validate_empty_column(got, mask_state, fixed_point_pa_type) + + +@pytest.mark.parametrize( + "non_fixed_point_pa_type", [t for t in ALL_TYPES if t not in DECIMAL_TYPES] +) +def test_make_fixed_point_column_dtype_err(non_fixed_point_pa_type): + plc_type = plc.interop.from_arrow(non_fixed_point_pa_type) + with pytest.raises(ValueError): + plc.column_factories.make_fixed_point_column( + plc_type, 3, plc.types.MaskState.UNALLOCATED + ) + + +def test_make_fixed_point_column_negative_size_err(fixed_point_pa_type): + plc_type = plc.interop.from_arrow(fixed_point_pa_type) + with pytest.raises(RuntimeError): + plc.column_factories.make_fixed_point_column( + plc_type, -1, plc.types.MaskState.UNALLOCATED + ) + + +def test_make_timestamp_column(timestamp_pa_type, mask_state): + plc_type = plc.interop.from_arrow(timestamp_pa_type) + + got = plc.column_factories.make_timestamp_column( + plc_type, EMPTY_COL_SIZE, mask_state + ) + validate_empty_column(got, mask_state, timestamp_pa_type) + + +@pytest.mark.parametrize( + "non_timestamp_pa_type", [t for t in ALL_TYPES if t not in TIMESTAMP_TYPES] +) +def test_make_timestamp_column_dtype_err(non_timestamp_pa_type): + plc_type = plc.interop.from_arrow(non_timestamp_pa_type) + with pytest.raises(ValueError): + plc.column_factories.make_timestamp_column( + plc_type, 3, plc.types.MaskState.UNALLOCATED + ) + + +def test_make_timestamp_column_negative_size_err(timestamp_pa_type): + plc_type = plc.interop.from_arrow(timestamp_pa_type) + with pytest.raises(RuntimeError): + plc.column_factories.make_timestamp_column( + plc_type, -1, plc.types.MaskState.UNALLOCATED + ) + + +def test_make_duration_column(duration_pa_type, mask_state): + plc_type = plc.interop.from_arrow(duration_pa_type) + + got = plc.column_factories.make_duration_column( + plc_type, EMPTY_COL_SIZE, mask_state + ) + validate_empty_column(got, mask_state, duration_pa_type) + + +@pytest.mark.parametrize( + "non_duration_pa_type", [t for t in ALL_TYPES if t not in DURATION_TYPES] +) +def test_make_duration_column_dtype_err(non_duration_pa_type): + plc_type = plc.interop.from_arrow(non_duration_pa_type) + with pytest.raises(ValueError): + plc.column_factories.make_duration_column( + plc_type, 3, plc.types.MaskState.UNALLOCATED + ) + + +def test_make_duration_column_negative_size_err(duration_pa_type): + plc_type = plc.interop.from_arrow(duration_pa_type) + with pytest.raises(RuntimeError): + plc.column_factories.make_duration_column( + plc_type, -1, plc.types.MaskState.UNALLOCATED + ) diff --git a/python/cudf/cudf/pylibcudf_tests/test_interop.py b/python/cudf/cudf/pylibcudf_tests/test_interop.py new file mode 100644 index 00000000000..5c05f460e28 --- /dev/null +++ b/python/cudf/cudf/pylibcudf_tests/test_interop.py @@ -0,0 +1,69 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +import pyarrow as pa +import pytest + +import cudf._lib.pylibcudf as plc + + +def test_list_dtype_roundtrip(): + list_type = pa.list_(pa.int32()) + plc_type = plc.interop.from_arrow(list_type) + + assert plc_type == plc.types.DataType(plc.types.TypeId.LIST) + + with pytest.raises(ValueError): + plc.interop.to_arrow(plc_type) + + arrow_type = plc.interop.to_arrow( + plc_type, value_type=list_type.value_type + ) + assert arrow_type == list_type + + +def test_struct_dtype_roundtrip(): + struct_type = pa.struct([("a", pa.int32()), ("b", pa.string())]) + plc_type = plc.interop.from_arrow(struct_type) + + assert plc_type == plc.types.DataType(plc.types.TypeId.STRUCT) + + with pytest.raises(ValueError): + plc.interop.to_arrow(plc_type) + + arrow_type = plc.interop.to_arrow( + plc_type, + fields=[struct_type.field(i) for i in range(struct_type.num_fields)], + ) + assert arrow_type == struct_type + + +def test_decimal128_roundtrip(): + decimal_type = pa.decimal128(10, 2) + plc_type = plc.interop.from_arrow(decimal_type) + + assert plc_type.id() == plc.types.TypeId.DECIMAL128 + + with pytest.raises(ValueError): + plc.interop.to_arrow(plc_type) + + arrow_type = plc.interop.to_arrow( + plc_type, precision=decimal_type.precision + ) + assert arrow_type == decimal_type + + +@pytest.mark.parametrize( + "data_type", + [ + plc.types.DataType(plc.types.TypeId.DECIMAL32), + plc.types.DataType(plc.types.TypeId.DECIMAL64), + ], +) +def test_decimal_other(data_type): + precision = 3 + + with pytest.raises(ValueError): + plc.interop.to_arrow(data_type) + + arrow_type = plc.interop.to_arrow(data_type, precision=precision) + assert arrow_type == pa.decimal128(precision, 0) From fc31aa3c4f99d6348e7c32a3e3c52c68b26ca700 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Tue, 4 Jun 2024 10:19:30 -0400 Subject: [PATCH 041/340] Add overflow check when converting large strings to lists columns (#15887) Fixes a couple places where strings columns are converted to lists column as binary -- chars are represented as INT8. Since lists columns only support `size_type` offsets type, this change will throw an error if the size of the chars exceeds max `size_type` values. 
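
For intuition, here is a rough Python restatement of the new guard
(illustrative only -- libcudf's `size_type` is a 32-bit signed integer,
and the helper name below is hypothetical, not a cudf API):

```python
import numpy as np

# libcudf's size_type is int32, so list offsets cannot address more
# characters than this.
SIZE_TYPE_MAX = np.iinfo(np.int32).max


def check_chars_fit_size_type(char_size: int) -> None:
    # Hypothetical mirror of the CUDF_EXPECTS guard added in this patch:
    # refuse the conversion once the flattened chars child would overflow
    # the offsets type of the resulting lists column.
    if char_size >= SIZE_TYPE_MAX:
        raise OverflowError(
            "Cannot convert strings column to lists column "
            "due to size_type limit"
        )
```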
Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Mike Wilson (https://github.com/hyperbolic2346)
  - MithunR (https://github.com/mythrocks)

URL: https://github.com/rapidsai/cudf/pull/15887
---
 cpp/src/io/utilities/column_buffer.cpp |  4 ++++
 cpp/src/reshape/byte_cast.cu           | 11 ++++++++---
 2 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/cpp/src/io/utilities/column_buffer.cpp b/cpp/src/io/utilities/column_buffer.cpp
index e5d4e1a360f..27fc53fbc9e 100644
--- a/cpp/src/io/utilities/column_buffer.cpp
+++ b/cpp/src/io/utilities/column_buffer.cpp
@@ -191,6 +191,10 @@ std::unique_ptr<column> make_column(column_buffer_base<string_policy>& buffer,
       auto data      = col_content.data.release();
       auto char_size = data->size();
 
+      CUDF_EXPECTS(char_size < static_cast<std::size_t>(std::numeric_limits<size_type>::max()),
+                   "Cannot convert strings column to lists column due to size_type limit",
+                   std::overflow_error);
+
       auto uint8_col = std::make_unique<column>(
         data_type{type_id::UINT8}, char_size, std::move(*data), rmm::device_buffer{}, 0);
 
diff --git a/cpp/src/reshape/byte_cast.cu b/cpp/src/reshape/byte_cast.cu
index 1b05a9744fa..3dfa0b65814 100644
--- a/cpp/src/reshape/byte_cast.cu
+++ b/cpp/src/reshape/byte_cast.cu
@@ -135,9 +135,14 @@ struct byte_list_conversion_fn
-    auto col_content = std::make_unique<column>(input, stream, mr)->release();
-    auto const num_chars = col_content.data->size();
-    auto uint8_col = std::make_unique<column>(
+    auto const num_chars = strings_column_view(input).chars_size(stream);
+    CUDF_EXPECTS(num_chars < static_cast<int64_t>(std::numeric_limits<size_type>::max()),
+                 "Cannot convert strings column to lists column due to size_type limit",
+                 std::overflow_error);
+
+    auto col_content = std::make_unique<column>(input, stream, mr)->release();
+
+    auto uint8_col = std::make_unique<column>(
       output_type, num_chars, std::move(*(col_content.data)), rmm::device_buffer{}, 0);
 
     auto result = make_lists_column(

From 54d49fcea4e7ad73df21f0dbfe99097c635b1023 Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell
Date: Tue, 4 Jun 2024 16:17:25 +0100
Subject: [PATCH 042/340] Ensure literals have correct dtype (#15890)

The polars schema tells us the dtype for any literals, but previously we
were relying on pyarrow inference. Add pylibcudf to pyarrow datatype
conversion utilities and use the resulting datatypes explicitly.
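
A minimal sketch of the pattern this enables (assuming an environment
where `cudf._lib.pylibcudf` is importable; the dtype chosen here is
arbitrary):

```python
import pyarrow as pa

import cudf._lib.pylibcudf as plc

# Build the literal with the dtype dictated by the schema instead of
# letting pyarrow infer one from the Python value.
dtype = plc.DataType(plc.TypeId.INT32)
typed_scalar = pa.scalar(1, type=plc.interop.to_arrow(dtype))
assert typed_scalar.type == pa.int32()
```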
Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - https://github.com/brandon-b-miller - James Lamb (https://github.com/jameslamb) URL: https://github.com/rapidsai/cudf/pull/15890 --- python/cudf/cudf/_lib/pylibcudf/interop.pyx | 3 +- python/cudf_polars/cudf_polars/dsl/expr.py | 46 +++++++++++++------ python/cudf_polars/cudf_polars/dsl/ir.py | 10 ++-- .../cudf_polars/cudf_polars/dsl/translate.py | 9 ++-- .../cudf_polars/cudf_polars/utils/dtypes.py | 3 ++ python/cudf_polars/pyproject.toml | 2 +- python/cudf_polars/tests/__init__.py | 6 +++ .../cudf_polars/tests/expressions/__init__.py | 6 +++ .../cudf_polars/tests/expressions/test_agg.py | 2 +- .../tests/expressions/test_distinct.py | 36 +++++++++++++++ python/cudf_polars/tests/test_scan.py | 12 +---- 11 files changed, 102 insertions(+), 33 deletions(-) create mode 100644 python/cudf_polars/tests/__init__.py create mode 100644 python/cudf_polars/tests/expressions/__init__.py create mode 100644 python/cudf_polars/tests/expressions/test_distinct.py diff --git a/python/cudf/cudf/_lib/pylibcudf/interop.pyx b/python/cudf/cudf/_lib/pylibcudf/interop.pyx index 1e4102e4b64..07e9d1ead11 100644 --- a/python/cudf/cudf/_lib/pylibcudf/interop.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/interop.pyx @@ -55,6 +55,7 @@ ARROW_TO_PYLIBCUDF_TYPES = { pa.timestamp('us'): type_id.TIMESTAMP_MICROSECONDS, pa.timestamp('ns'): type_id.TIMESTAMP_NANOSECONDS, pa.date32(): type_id.TIMESTAMP_DAYS, + pa.null(): type_id.EMPTY, } LIBCUDF_TO_ARROW_TYPES = { @@ -245,7 +246,7 @@ def _to_arrow_datatype(cudf_object, **kwargs): return pa.list_(value_type) else: try: - return ARROW_TO_PYLIBCUDF_TYPES[cudf_object.id()] + return LIBCUDF_TO_ARROW_TYPES[cudf_object.id()] except KeyError: raise TypeError( f"Unable to convert {cudf_object.id()} to arrow datatype" diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index 249cc3775f7..7187a36f21c 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -484,32 +484,48 @@ def do_evaluate( return self._distinct( column, keep=plc.stream_compaction.DuplicateKeepOption.KEEP_FIRST, - source_value=plc.interop.from_arrow(pa.scalar(True)), # noqa: FBT003 - target_value=plc.interop.from_arrow(pa.scalar(False)), # noqa: FBT003 + source_value=plc.interop.from_arrow( + pa.scalar(value=True, type=plc.interop.to_arrow(self.dtype)) + ), + target_value=plc.interop.from_arrow( + pa.scalar(value=False, type=plc.interop.to_arrow(self.dtype)) + ), ) elif self.name == pl_expr.BooleanFunction.IsLastDistinct: (column,) = columns return self._distinct( column, keep=plc.stream_compaction.DuplicateKeepOption.KEEP_LAST, - source_value=plc.interop.from_arrow(pa.scalar(True)), # noqa: FBT003 - target_value=plc.interop.from_arrow(pa.scalar(False)), # noqa: FBT003 + source_value=plc.interop.from_arrow( + pa.scalar(value=True, type=plc.interop.to_arrow(self.dtype)) + ), + target_value=plc.interop.from_arrow( + pa.scalar(value=False, type=plc.interop.to_arrow(self.dtype)) + ), ) elif self.name == pl_expr.BooleanFunction.IsUnique: (column,) = columns return self._distinct( column, keep=plc.stream_compaction.DuplicateKeepOption.KEEP_NONE, - source_value=plc.interop.from_arrow(pa.scalar(True)), # noqa: FBT003 - target_value=plc.interop.from_arrow(pa.scalar(False)), # noqa: FBT003 + source_value=plc.interop.from_arrow( + pa.scalar(value=True, type=plc.interop.to_arrow(self.dtype)) + ), + target_value=plc.interop.from_arrow( + pa.scalar(value=False, 
type=plc.interop.to_arrow(self.dtype)) + ), ) elif self.name == pl_expr.BooleanFunction.IsDuplicated: (column,) = columns return self._distinct( column, keep=plc.stream_compaction.DuplicateKeepOption.KEEP_NONE, - source_value=plc.interop.from_arrow(pa.scalar(False)), # noqa: FBT003 - target_value=plc.interop.from_arrow(pa.scalar(True)), # noqa: FBT003 + source_value=plc.interop.from_arrow( + pa.scalar(value=False, type=plc.interop.to_arrow(self.dtype)) + ), + target_value=plc.interop.from_arrow( + pa.scalar(value=True, type=plc.interop.to_arrow(self.dtype)) + ), ) elif self.name == pl_expr.BooleanFunction.AllHorizontal: name = columns[0].name @@ -717,7 +733,9 @@ def do_evaluate( bounds_policy = plc.copying.OutOfBoundsPolicy.NULLIFY obj = plc.replace.replace_nulls( indices.obj, - plc.interop.from_arrow(pa.scalar(n), data_type=indices.obj.data_type()), + plc.interop.from_arrow( + pa.scalar(n, type=plc.interop.to_arrow(indices.obj.data_type())) + ), ) else: bounds_policy = plc.copying.OutOfBoundsPolicy.DONT_CHECK @@ -893,11 +911,13 @@ def _reduce( ) def _count(self, column: Column) -> Column: - # TODO: dtype handling return Column( plc.Column.from_scalar( plc.interop.from_arrow( - pa.scalar(column.obj.size() - column.obj.null_count()), + pa.scalar( + column.obj.size() - column.obj.null_count(), + type=plc.interop.to_arrow(self.dtype), + ), ), 1, ), @@ -909,7 +929,7 @@ def _min(self, column: Column, *, propagate_nans: bool) -> Column: return Column( plc.Column.from_scalar( plc.interop.from_arrow( - pa.scalar(float("nan")), data_type=self.dtype + pa.scalar(float("nan"), type=plc.interop.to_arrow(self.dtype)) ), 1, ), @@ -924,7 +944,7 @@ def _max(self, column: Column, *, propagate_nans: bool) -> Column: return Column( plc.Column.from_scalar( plc.interop.from_arrow( - pa.scalar(float("nan")), data_type=self.dtype + pa.scalar(float("nan"), type=plc.interop.to_arrow(self.dtype)) ), 1, ), diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index d630b40f600..f8441b793b5 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -146,9 +146,13 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: assert_never(self.typ) if row_index is not None: name, offset = row_index - # TODO: dtype - step = plc.interop.from_arrow(pa.scalar(1)) - init = plc.interop.from_arrow(pa.scalar(offset)) + dtype = self.schema[name] + step = plc.interop.from_arrow( + pa.scalar(1, type=plc.interop.to_arrow(dtype)) + ) + init = plc.interop.from_arrow( + pa.scalar(offset, type=plc.interop.to_arrow(dtype)) + ) index = Column( plc.filling.sequence(df.num_rows, init, step), name ).set_sorted( diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py index b3d0edf183f..9a301164beb 100644 --- a/python/cudf_polars/cudf_polars/dsl/translate.py +++ b/python/cudf_polars/cudf_polars/dsl/translate.py @@ -9,9 +9,11 @@ from functools import singledispatch from typing import Any +import pyarrow as pa + from polars.polars import _expr_nodes as pl_expr, _ir_nodes as pl_ir -import cudf._lib.pylibcudf as plc # noqa: TCH002, singledispatch register needs this name defined. 
+import cudf._lib.pylibcudf as plc from cudf_polars.dsl import expr, ir from cudf_polars.utils import dtypes @@ -295,7 +297,8 @@ def _(node: pl_expr.Window, visitor: Any, dtype: plc.DataType) -> expr.Expr: @_translate_expr.register def _(node: pl_expr.Literal, visitor: Any, dtype: plc.DataType) -> expr.Expr: - return expr.Literal(dtype, node.value) + value = pa.scalar(node.value, type=plc.interop.to_arrow(dtype)) + return expr.Literal(dtype, value) @_translate_expr.register @@ -337,7 +340,7 @@ def _(node: pl_expr.Cast, visitor: Any, dtype: plc.DataType) -> expr.Expr: inner = translate_expr(visitor, n=node.expr) # Push casts into literals so we can handle Cast(Literal(Null)) if isinstance(inner, expr.Literal): - return expr.Literal(dtype, inner.value) + return expr.Literal(dtype, inner.value.cast(plc.interop.to_arrow(dtype))) else: return expr.Cast(dtype, inner) diff --git a/python/cudf_polars/cudf_polars/utils/dtypes.py b/python/cudf_polars/cudf_polars/utils/dtypes.py index 51379433c03..bede0de3c9f 100644 --- a/python/cudf_polars/cudf_polars/utils/dtypes.py +++ b/python/cudf_polars/cudf_polars/utils/dtypes.py @@ -13,6 +13,8 @@ import cudf._lib.pylibcudf as plc +__all__ = ["from_polars"] + @cache def from_polars(dtype: pl.DataType) -> plc.DataType: @@ -84,6 +86,7 @@ def from_polars(dtype: pl.DataType) -> plc.DataType: # TODO: Hopefully return plc.DataType(plc.TypeId.EMPTY) elif isinstance(dtype, pl.List): + # TODO: This doesn't consider the value type. return plc.DataType(plc.TypeId.LIST) else: raise NotImplementedError(f"{dtype=} conversion not supported") diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml index 49ecd7080b9..e50ee76a9b9 100644 --- a/python/cudf_polars/pyproject.toml +++ b/python/cudf_polars/pyproject.toml @@ -134,7 +134,7 @@ ignore = [ fixable = ["ALL"] [tool.ruff.lint.per-file-ignores] -"**/tests/**/test_*.py" = ["D", "INP"] +"**/tests/**/*.py" = ["D"] [tool.ruff.lint.flake8-pytest-style] # https://docs.astral.sh/ruff/settings/#lintflake8-pytest-style diff --git a/python/cudf_polars/tests/__init__.py b/python/cudf_polars/tests/__init__.py new file mode 100644 index 00000000000..4611d642f14 --- /dev/null +++ b/python/cudf_polars/tests/__init__.py @@ -0,0 +1,6 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +__all__: list[str] = [] diff --git a/python/cudf_polars/tests/expressions/__init__.py b/python/cudf_polars/tests/expressions/__init__.py new file mode 100644 index 00000000000..4611d642f14 --- /dev/null +++ b/python/cudf_polars/tests/expressions/__init__.py @@ -0,0 +1,6 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +__all__: list[str] = [] diff --git a/python/cudf_polars/tests/expressions/test_agg.py b/python/cudf_polars/tests/expressions/test_agg.py index c792ae64f74..645dbd26140 100644 --- a/python/cudf_polars/tests/expressions/test_agg.py +++ b/python/cudf_polars/tests/expressions/test_agg.py @@ -56,7 +56,7 @@ def test_agg(df, agg): q = df.select(expr) # https://github.com/rapidsai/cudf/issues/15852 - check_dtype = agg not in {"count", "n_unique", "median"} + check_dtype = agg not in {"n_unique", "median"} if not check_dtype and q.schema["a"] != pl.Float64: with pytest.raises(AssertionError): assert_gpu_result_equal(q) diff --git a/python/cudf_polars/tests/expressions/test_distinct.py b/python/cudf_polars/tests/expressions/test_distinct.py new file mode 100644 index 00000000000..22865a7ce22 --- /dev/null +++ b/python/cudf_polars/tests/expressions/test_distinct.py @@ -0,0 +1,36 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +import pytest + +import polars as pl + +from cudf_polars.testing.asserts import assert_gpu_result_equal + + +@pytest.fixture(params=[False, True], ids=["no-nulls", "nulls"]) +def nullable(request): + return request.param + + +@pytest.fixture( + params=["is_first_distinct", "is_last_distinct", "is_unique", "is_duplicated"] +) +def op(request): + return request.param + + +@pytest.fixture +def df(nullable): + values: list[int | None] = [1, 2, 3, 1, 1, 7, 3, 2, 7, 8, 1] + if nullable: + values[1] = None + values[4] = None + return pl.LazyFrame({"a": values}) + + +def test_expr_distinct(df, op): + expr = getattr(pl.col("a"), op)() + query = df.select(expr) + assert_gpu_result_equal(query) diff --git a/python/cudf_polars/tests/test_scan.py b/python/cudf_polars/tests/test_scan.py index b75e1bdef10..b2443e357e2 100644 --- a/python/cudf_polars/tests/test_scan.py +++ b/python/cudf_polars/tests/test_scan.py @@ -10,17 +10,7 @@ @pytest.fixture( - params=[ - (None, None), - pytest.param( - ("row-index", 0), - marks=pytest.mark.xfail(reason="Incorrect dtype for row index"), - ), - pytest.param( - ("index", 10), - marks=pytest.mark.xfail(reason="Incorrect dtype for row index"), - ), - ], + params=[(None, None), ("row-index", 0), ("index", 10)], ids=["no-row-index", "zero-offset-row-index", "offset-row-index"], ) def row_index(request): From faf39299ebf178ee10971e4222c534f00d035b6d Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 4 Jun 2024 08:52:51 -1000 Subject: [PATCH 043/340] Make Frame.astype return Self instead of a ColumnAccessor (#15861) Allows simplification for it's subclasses (`IndexFrame.astype`, `Index.astype`) Also minor cleanups in the `equals` method Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/15861 --- python/cudf/cudf/core/_base_index.py | 2 +- python/cudf/cudf/core/dataframe.py | 2 +- python/cudf/cudf/core/frame.py | 23 ++++++----------------- python/cudf/cudf/core/index.py | 22 ++++++++++++++-------- python/cudf/cudf/core/indexed_frame.py | 14 +++++--------- 5 files changed, 27 insertions(+), 36 deletions(-) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index e6868ae3431..baca7b19e58 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ 
-149,7 +149,7 @@ def ndim(self) -> int: # noqa: D401 """Number of dimensions of the underlying data, by definition 1.""" return 1 - def equals(self, other): + def equals(self, other) -> bool: """ Determine if two Index objects contain the same elements. diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index acfc2d781a7..0fc36fa80e4 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -2590,7 +2590,7 @@ def items(self): yield (k, self[k]) @_cudf_nvtx_annotate - def equals(self, other): + def equals(self, other) -> bool: ret = super().equals(other) # If all other checks matched, validate names. if ret: diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index d60c206ac24..7326696c994 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -273,20 +273,13 @@ def __len__(self) -> int: return self._num_rows @_cudf_nvtx_annotate - def astype(self, dtype, copy: bool = False): - result_data = { - col_name: col.astype(dtype.get(col_name, col.dtype), copy=copy) + def astype(self, dtype: dict[Any, Dtype], copy: bool = False) -> Self: + casted = ( + col.astype(dtype.get(col_name, col.dtype), copy=copy) for col_name, col in self._data.items() - } - - return ColumnAccessor( - data=result_data, - multiindex=self._data.multiindex, - level_names=self._data.level_names, - rangeindex=self._data.rangeindex, - label_dtype=self._data.label_dtype, - verify=False, ) + ca = self._data._from_columns_like_self(casted, verify=False) + return self._from_data_like_self(ca) @_cudf_nvtx_annotate def equals(self, other) -> bool: @@ -349,11 +342,7 @@ def equals(self, other) -> bool: """ if self is other: return True - if ( - other is None - or not isinstance(other, type(self)) - or len(self) != len(other) - ): + if not isinstance(other, type(self)) or len(self) != len(other): return False return all( diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 2a75b374a1e..9b4c5473438 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -445,7 +445,7 @@ def __getitem__(self, index): return self._as_int_index()[index] @_cudf_nvtx_annotate - def equals(self, other): + def equals(self, other) -> bool: if isinstance(other, RangeIndex): return self._range == other._range return self._as_int_index().equals(other) @@ -1058,6 +1058,16 @@ def _from_data(cls, data: MutableMapping, name: Any = no_default) -> Self: out.name = name return out + @classmethod + @_cudf_nvtx_annotate + def _from_data_like_self( + cls, data: MutableMapping, name: Any = no_default + ) -> Self: + out = _index_from_data(data, name) + if name is not no_default: + out.name = name + return out + @classmethod @_cudf_nvtx_annotate def from_arrow(cls, obj): @@ -1180,12 +1190,8 @@ def is_unique(self): return self._column.is_unique @_cudf_nvtx_annotate - def equals(self, other): - if ( - other is None - or not isinstance(other, BaseIndex) - or len(self) != len(other) - ): + def equals(self, other) -> bool: + if not isinstance(other, BaseIndex) or len(self) != len(other): return False check_dtypes = False @@ -1231,7 +1237,7 @@ def copy(self, name=None, deep=False): @_cudf_nvtx_annotate def astype(self, dtype, copy: bool = True): - return _index_from_data(super().astype({self.name: dtype}, copy)) + return super().astype({self.name: dtype}, copy) @_cudf_nvtx_annotate def get_indexer(self, target, method=None, limit=None, tolerance=None): diff --git a/python/cudf/cudf/core/indexed_frame.py 
b/python/cudf/cudf/core/indexed_frame.py index a31430e1571..5a466f20f8c 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -625,10 +625,8 @@ def copy(self, deep: bool = True) -> Self: ) @_cudf_nvtx_annotate - def equals(self, other): # noqa: D102 - if not super().equals(other): - return False - return self.index.equals(other.index) + def equals(self, other) -> bool: # noqa: D102 + return super().equals(other) and self.index.equals(other.index) @property def index(self): @@ -4896,10 +4894,10 @@ def repeat(self, repeats, axis=None): def astype( self, - dtype, + dtype: dict[Any, Dtype], copy: bool = False, errors: Literal["raise", "ignore"] = "raise", - ): + ) -> Self: """Cast the object to the given dtype. Parameters @@ -5010,14 +5008,12 @@ def astype( raise ValueError("invalid error value specified") try: - data = super().astype(dtype, copy) + return super().astype(dtype, copy) except Exception as e: if errors == "raise": raise e return self - return self._from_data(data, index=self.index) - @_cudf_nvtx_annotate def drop( self, From fe7412915a289e7a9469040ada1dcf74cda2c4d6 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 4 Jun 2024 08:56:25 -1000 Subject: [PATCH 044/340] Make Column.to_pandas return Index instead of Series (#15833) Column.to_pandas backs `Index.to_pandas`/`Series.to_pandas`/`DataFrame.to_pandas` and returned a `pandas.Series`; however, the `index` of this `pandas.Series` was not strictly necessary for `Index.to_pandas` and `DataFrame.to_pandas`. Additionally, `pandas.Index` is 1D-like like `Column` and provides a better mental model to `to_pandas` conversion. Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/15833 --- python/cudf/cudf/core/column/categorical.py | 7 ++- python/cudf/cudf/core/column/column.py | 13 ++---- python/cudf/cudf/core/column/datetime.py | 20 ++------- python/cudf/cudf/core/column/interval.py | 15 ++----- python/cudf/cudf/core/column/lists.py | 20 ++------- python/cudf/cudf/core/column/numerical.py | 17 +++---- python/cudf/cudf/core/column/string.py | 17 ++----- python/cudf/cudf/core/column/struct.py | 19 ++------ python/cudf/cudf/core/dataframe.py | 4 +- python/cudf/cudf/core/index.py | 45 ++++--------------- python/cudf/cudf/core/series.py | 8 ++-- .../cudf/tests/test_cuda_array_interface.py | 4 +- 12 files changed, 46 insertions(+), 143 deletions(-) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 0ff8209dcd4..1828c5ce97b 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -789,12 +789,11 @@ def __cuda_array_interface__(self) -> Mapping[str, Any]: def to_pandas( self, *, - index: Optional[pd.Index] = None, nullable: bool = False, arrow_type: bool = False, - ) -> pd.Series: + ) -> pd.Index: if nullable: - raise NotImplementedError(f"{nullable=} is not implemented.") + return super().to_pandas(nullable=nullable, arrow_type=arrow_type) elif arrow_type: raise NotImplementedError(f"{arrow_type=} is not implemented.") @@ -828,7 +827,7 @@ def to_pandas( data = pd.Categorical.from_codes( codes, categories=cats.to_pandas(), ordered=col.ordered ) - return pd.Series(data, index=index) + return pd.Index(data) def to_arrow(self) -> pa.Array: """Convert to PyArrow Array.""" diff --git a/python/cudf/cudf/core/column/column.py 
b/python/cudf/cudf/core/column/column.py index 59bae179497..68079371b85 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -190,10 +190,9 @@ def __repr__(self): def to_pandas( self, *, - index: Optional[pd.Index] = None, nullable: bool = False, arrow_type: bool = False, - ) -> pd.Series: + ) -> pd.Index: """Convert object to pandas type. The default implementation falls back to PyArrow for the conversion. @@ -208,15 +207,9 @@ def to_pandas( raise NotImplementedError(f"{nullable=} is not implemented.") pa_array = self.to_arrow() if arrow_type: - return pd.Series( - pd.arrays.ArrowExtensionArray(pa_array), index=index - ) + return pd.Index(pd.arrays.ArrowExtensionArray(pa_array)) else: - pd_series = pa_array.to_pandas() - - if index is not None: - pd_series.index = index - return pd_series + return pd.Index(pa_array.to_pandas()) @property def values_host(self) -> "np.ndarray": diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 27f31c8f500..057169aa7e1 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -840,27 +840,15 @@ def __init__( def to_pandas( self, *, - index: Optional[pd.Index] = None, nullable: bool = False, arrow_type: bool = False, - ) -> pd.Series: - if arrow_type and nullable: - raise ValueError( - f"{arrow_type=} and {nullable=} cannot both be set." - ) - elif nullable: - raise NotImplementedError(f"{nullable=} is not implemented.") - elif arrow_type: - return pd.Series( - pd.arrays.ArrowExtensionArray(self.to_arrow()), index=index - ) + ) -> pd.Index: + if arrow_type or nullable: + return super().to_pandas(nullable=nullable, arrow_type=arrow_type) else: - series = self._local_time.to_pandas().dt.tz_localize( + return self._local_time.to_pandas().tz_localize( self.dtype.tz, ambiguous="NaT", nonexistent="NaT" ) - if index is not None: - series.index = index - return series def to_arrow(self): return pa.compute.assume_timezone( diff --git a/python/cudf/cudf/core/column/interval.py b/python/cudf/cudf/core/column/interval.py index 7bd693966dc..f24ca3fdad1 100644 --- a/python/cudf/cudf/core/column/interval.py +++ b/python/cudf/cudf/core/column/interval.py @@ -1,6 +1,4 @@ # Copyright (c) 2018-2024, NVIDIA CORPORATION. -from typing import Optional - import pandas as pd import pyarrow as pa @@ -109,28 +107,21 @@ def as_interval_column(self, dtype): def to_pandas( self, *, - index: Optional[pd.Index] = None, nullable: bool = False, arrow_type: bool = False, - ) -> pd.Series: + ) -> pd.Index: # Note: This does not handle null values in the interval column. # However, this exact sequence (calling __from_arrow__ on the output of # self.to_arrow) is currently the best known way to convert interval # types into pandas (trying to convert the underlying numerical columns # directly is problematic), so we're stuck with this for now. - if arrow_type and nullable: - raise ValueError( - f"{arrow_type=} and {nullable=} cannot both be set." 
- ) if nullable: - raise NotImplementedError(f"{nullable=} is not implemented.") + return super().to_pandas(nullable=nullable, arrow_type=arrow_type) elif arrow_type: raise NotImplementedError(f"{arrow_type=} is not implemented.") pd_type = self.dtype.to_pandas() - return pd.Series( - pd_type.__from_arrow__(self.to_arrow()), index=index, dtype=pd_type - ) + return pd.Index(pd_type.__from_arrow__(self.to_arrow()), dtype=pd_type) def element_indexing(self, index: int): result = super().element_indexing(index) diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index 1c2bcbef2ec..8f8ee46c796 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -292,25 +292,13 @@ def _transform_leaves(self, func, *args, **kwargs) -> Self: def to_pandas( self, *, - index: Optional[pd.Index] = None, nullable: bool = False, arrow_type: bool = False, - ) -> pd.Series: - # Can't rely on Column.to_pandas implementation for lists. - # Need to perform `to_pylist` to preserve list types. - if arrow_type and nullable: - raise ValueError( - f"{arrow_type=} and {nullable=} cannot both be set." - ) - if nullable: - raise NotImplementedError(f"{nullable=} is not implemented.") - pa_array = self.to_arrow() - if arrow_type: - return pd.Series( - pd.arrays.ArrowExtensionArray(pa_array), index=index - ) + ) -> pd.Index: + if arrow_type or nullable: + return super().to_pandas(nullable=nullable, arrow_type=arrow_type) else: - return pd.Series(pa_array.tolist(), dtype="object", index=index) + return pd.Index(self.to_arrow().tolist(), dtype="object") class ListMethods(ColumnMethods): diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index bab862f775f..fb413959eb9 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -674,18 +674,13 @@ def _with_type_metadata(self: ColumnBase, dtype: Dtype) -> ColumnBase: def to_pandas( self, *, - index: Optional[pd.Index] = None, nullable: bool = False, arrow_type: bool = False, - ) -> pd.Series: + ) -> pd.Index: if arrow_type and nullable: - raise ValueError( - f"{arrow_type=} and {nullable=} cannot both be set." 
- ) + return super().to_pandas(nullable=nullable, arrow_type=arrow_type) elif arrow_type: - return pd.Series( - pd.arrays.ArrowExtensionArray(self.to_arrow()), index=index - ) + return super().to_pandas(nullable=nullable, arrow_type=arrow_type) elif ( nullable and ( @@ -697,11 +692,11 @@ def to_pandas( ): arrow_array = self.to_arrow() pandas_array = pandas_nullable_dtype.__from_arrow__(arrow_array) # type: ignore[attr-defined] - return pd.Series(pandas_array, copy=False, index=index) + return pd.Index(pandas_array, copy=False) elif self.dtype.kind in set("iuf") and not self.has_nulls(): - return pd.Series(self.values_host, copy=False, index=index) + return pd.Index(self.values_host, copy=False) else: - return super().to_pandas(index=index, nullable=nullable) + return super().to_pandas(nullable=nullable, arrow_type=arrow_type) def _reduction_result_dtype(self, reduction_op: str) -> Dtype: col_dtype = self.dtype diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 40e58e14612..fd98d0dc163 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -5783,23 +5783,14 @@ def values(self) -> cupy.ndarray: def to_pandas( self, *, - index: Optional[pd.Index] = None, nullable: bool = False, arrow_type: bool = False, - ) -> pd.Series: - if arrow_type and nullable: - raise ValueError( - f"{arrow_type=} and {nullable=} cannot both be set." - ) - if arrow_type: - return pd.Series( - pd.arrays.ArrowExtensionArray(self.to_arrow()), index=index - ) - elif nullable: + ) -> pd.Index: + if nullable and not arrow_type: pandas_array = pd.StringDtype().__from_arrow__(self.to_arrow()) - return pd.Series(pandas_array, copy=False, index=index) + return pd.Index(pandas_array, copy=False) else: - return super().to_pandas(index=index, nullable=nullable) + return super().to_pandas(nullable=nullable, arrow_type=arrow_type) def can_cast_safely(self, to_dtype: Dtype) -> bool: to_dtype = cudf.api.types.dtype(to_dtype) diff --git a/python/cudf/cudf/core/column/struct.py b/python/cudf/cudf/core/column/struct.py index 1b2ffcc2700..6dd35570b95 100644 --- a/python/cudf/cudf/core/column/struct.py +++ b/python/cudf/cudf/core/column/struct.py @@ -2,7 +2,6 @@ from __future__ import annotations from functools import cached_property -from typing import Optional import pandas as pd import pyarrow as pa @@ -60,25 +59,15 @@ def to_arrow(self): def to_pandas( self, *, - index: Optional[pd.Index] = None, nullable: bool = False, arrow_type: bool = False, - ) -> pd.Series: + ) -> pd.Index: # We cannot go via Arrow's `to_pandas` because of the following issue: # https://issues.apache.org/jira/browse/ARROW-12680 - if arrow_type and nullable: - raise ValueError( - f"{arrow_type=} and {nullable=} cannot both be set." 
- ) - elif nullable: - raise NotImplementedError(f"{nullable=} is not implemented.") - pa_array = self.to_arrow() - if arrow_type: - return pd.Series( - pd.arrays.ArrowExtensionArray(pa_array), index=index - ) + if arrow_type or nullable: + return super().to_pandas(nullable=nullable, arrow_type=arrow_type) else: - return pd.Series(pa_array.tolist(), dtype="object", index=index) + return pd.Index(self.to_arrow().tolist(), dtype="object") @cached_property def memory_usage(self): diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 0fc36fa80e4..4c55b5427de 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -5321,9 +5321,7 @@ def to_pandas( """ out_index = self.index.to_pandas() out_data = { - i: col.to_pandas( - index=out_index, nullable=nullable, arrow_type=arrow_type - ) + i: col.to_pandas(nullable=nullable, arrow_type=arrow_type) for i, col in enumerate(self._data.columns) } diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 9b4c5473438..4b09765fa46 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -1568,10 +1568,11 @@ def any(self): def to_pandas( self, *, nullable: bool = False, arrow_type: bool = False ) -> pd.Index: - return pd.Index( - self._values.to_pandas(nullable=nullable, arrow_type=arrow_type), - name=self.name, + result = self._column.to_pandas( + nullable=nullable, arrow_type=arrow_type ) + result.name = self.name + return result def append(self, other): if is_list_like(other): @@ -2191,23 +2192,10 @@ def isocalendar(self): def to_pandas( self, *, nullable: bool = False, arrow_type: bool = False ) -> pd.DatetimeIndex: - if arrow_type and nullable: - raise ValueError( - f"{arrow_type=} and {nullable=} cannot both be set." - ) - elif nullable: - raise NotImplementedError(f"{nullable=} is not implemented.") - - result = self._values.to_pandas(arrow_type=arrow_type) - if arrow_type: - return pd.Index(result, name=self.name) - else: - freq = ( - self._freq._maybe_as_fast_pandas_offset() - if self._freq is not None - else None - ) - return pd.DatetimeIndex(result, name=self.name, freq=freq) + result = super().to_pandas(nullable=nullable, arrow_type=arrow_type) + if not arrow_type and self._freq is not None: + result.freq = self._freq._maybe_as_fast_pandas_offset() + return result @_cudf_nvtx_annotate def _get_dt_field(self, field): @@ -2527,23 +2515,6 @@ def __getitem__(self, index): return pd.Timedelta(value) return value - @_cudf_nvtx_annotate - def to_pandas( - self, *, nullable: bool = False, arrow_type: bool = False - ) -> pd.TimedeltaIndex: - if arrow_type and nullable: - raise ValueError( - f"{arrow_type=} and {nullable=} cannot both be set." 
- ) - elif nullable: - raise NotImplementedError(f"{nullable=} is not implemented.") - - result = self._values.to_pandas(arrow_type=arrow_type) - if arrow_type: - return pd.Index(result, name=self.name) - else: - return pd.TimedeltaIndex(result, name=self.name) - @property # type: ignore @_cudf_nvtx_annotate def days(self): diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index a5b204ef346..169f7c11cf9 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -2022,11 +2022,11 @@ def to_pandas( index = self.index.to_pandas() else: index = None # type: ignore[assignment] - s = self._column.to_pandas( - index=index, nullable=nullable, arrow_type=arrow_type + return pd.Series( + self._column.to_pandas(nullable=nullable, arrow_type=arrow_type), + index=index, + name=self.name, ) - s.name = self.name - return s @property # type: ignore @_cudf_nvtx_annotate diff --git a/python/cudf/cudf/tests/test_cuda_array_interface.py b/python/cudf/cudf/tests/test_cuda_array_interface.py index f98c3ad0475..06d63561fc1 100644 --- a/python/cudf/cudf/tests/test_cuda_array_interface.py +++ b/python/cudf/cudf/tests/test_cuda_array_interface.py @@ -175,12 +175,12 @@ def test_column_from_ephemeral_cupy_try_lose_reference(): a = cudf.Series(cupy.asarray([1, 2, 3]))._column a = cudf.core.column.as_column(a) b = cupy.asarray([1, 1, 1]) # noqa: F841 - assert_eq(pd.Series([1, 2, 3]), a.to_pandas()) + assert_eq(pd.Index([1, 2, 3]), a.to_pandas()) a = cudf.Series(cupy.asarray([1, 2, 3]))._column a.name = "b" b = cupy.asarray([1, 1, 1]) # noqa: F841 - assert_eq(pd.Series([1, 2, 3]), a.to_pandas()) + assert_eq(pd.Index([1, 2, 3]), a.to_pandas()) @pytest.mark.xfail( From 22ef0634f07f7b40d718e80bed176e88ac734ebe Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 4 Jun 2024 14:58:11 -1000 Subject: [PATCH 045/340] Remove internal usage of core.index.as_index in favor of cudf.Index (#15851) `cudf.Index.__init__` essentially calls `as_index` immediately internally. 
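For example (hypothetical snippet, assuming cudf is importable), the two
spellings are interchangeable today:

```python
import cudf
from cudf.core.index import as_index

# The private helper and the public constructor build the same index.
assert cudf.Index([1, 2, 3], name="a").equals(as_index([1, 2, 3], name="a"))
```
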
To avoid both from potentially diverging, the public `cudf.Index` should be preferred to ensure the public behaviors are used internally Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/15851 --- python/cudf/cudf/core/algorithms.py | 4 +- python/cudf/cudf/core/column/methods.py | 4 +- python/cudf/cudf/core/column/string.py | 4 +- python/cudf/cudf/core/cut.py | 4 +- python/cudf/cudf/core/dataframe.py | 36 +++++++-------- python/cudf/cudf/core/dtypes.py | 4 +- python/cudf/cudf/core/groupby/groupby.py | 6 +-- python/cudf/cudf/core/index.py | 30 +++++-------- python/cudf/cudf/core/indexed_frame.py | 4 +- python/cudf/cudf/core/multiindex.py | 7 +-- python/cudf/cudf/core/series.py | 8 ++-- python/cudf/cudf/core/tools/datetimes.py | 5 +-- python/cudf/cudf/tests/test_array_function.py | 4 +- python/cudf/cudf/tests/test_binops.py | 31 +++++++------ python/cudf/cudf/tests/test_contains.py | 6 +-- python/cudf/cudf/tests/test_dlpack.py | 2 +- python/cudf/cudf/tests/test_index.py | 44 ++++++++----------- python/cudf/cudf/tests/test_multiindex.py | 7 +-- python/cudf/cudf/tests/test_string.py | 38 ++++++++-------- .../cudf/cudf/tests/text/test_text_methods.py | 8 ++-- 20 files changed, 116 insertions(+), 140 deletions(-) diff --git a/python/cudf/cudf/core/algorithms.py b/python/cudf/cudf/core/algorithms.py index 272abdece9e..51a32e29886 100644 --- a/python/cudf/cudf/core/algorithms.py +++ b/python/cudf/cudf/core/algorithms.py @@ -6,7 +6,7 @@ from cudf.core.column import as_column from cudf.core.copy_types import BooleanMask -from cudf.core.index import RangeIndex, as_index +from cudf.core.index import Index, RangeIndex from cudf.core.indexed_frame import IndexedFrame from cudf.core.scalar import Scalar from cudf.options import get_option @@ -107,7 +107,7 @@ def factorize(values, sort=False, use_na_sentinel=True, size_hint=None): dtype="int64" if get_option("mode.pandas_compatible") else None, ).values - return labels, cats.values if return_cupy_array else as_index(cats) + return labels, cats.values if return_cupy_array else Index(cats) def _linear_interpolation(column, index=None): diff --git a/python/cudf/cudf/core/column/methods.py b/python/cudf/cudf/core/column/methods.py index e827c7a3dd3..7f7355c571a 100644 --- a/python/cudf/cudf/core/column/methods.py +++ b/python/cudf/cudf/core/column/methods.py @@ -93,8 +93,6 @@ def _return_or_inplace( else: return cudf.Series(new_col, name=self._parent.name) elif isinstance(self._parent, cudf.BaseIndex): - return cudf.core.index.as_index( - new_col, name=self._parent.name - ) + return cudf.Index(new_col, name=self._parent.name) else: return self._parent._mimic_inplace(new_col, inplace=False) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index fd98d0dc163..d12aa80e9a3 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -4391,7 +4391,7 @@ def code_points(self) -> SeriesOrIndex: if isinstance(self._parent, cudf.Series): return cudf.Series(new_col, name=self._parent.name) elif isinstance(self._parent, cudf.BaseIndex): - return cudf.core.index.as_index(new_col, name=self._parent.name) + return cudf.Index(new_col, name=self._parent.name) else: return new_col @@ -4706,7 +4706,7 @@ def character_tokenize(self) -> SeriesOrIndex: index = self._parent.index.repeat(lengths) return cudf.Series(result_col, name=self._parent.name, index=index) elif isinstance(self._parent, 
cudf.BaseIndex): - return cudf.core.index.as_index(result_col, name=self._parent.name) + return cudf.Index(result_col, name=self._parent.name) else: return result_col diff --git a/python/cudf/cudf/core/cut.py b/python/cudf/cudf/core/cut.py index ccf730c91fb..54c5e829e8a 100644 --- a/python/cudf/cudf/core/cut.py +++ b/python/cudf/cudf/core/cut.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2023, NVIDIA CORPORATION. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. from collections import abc @@ -292,7 +292,7 @@ def cut( ) # we return a categorical index, as we don't have a Categorical method - categorical_index = cudf.core.index.as_index(col) + categorical_index = cudf.Index(col) if isinstance(orig_x, (pd.Series, cudf.Series)): # if we have a series input we return a series output diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 4c55b5427de..c8f1e872300 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -712,7 +712,7 @@ def __init__( data = data.reindex(index) index = data.index else: - index = as_index(index) + index = cudf.Index(index) else: index = data.index @@ -761,7 +761,7 @@ def __init__( if index is None: self._index = RangeIndex(0) else: - self._index = as_index(index) + self._index = cudf.Index(index) if columns is not None: rangeindex = isinstance( columns, (range, pd.RangeIndex, cudf.RangeIndex) @@ -875,7 +875,7 @@ def _init_from_series_list(self, data, columns, index): # When `index` is `None`, the final index of # resulting dataframe will be union of # all Series's names. - final_index = as_index(_get_union_of_series_names(data)) + final_index = cudf.Index(_get_union_of_series_names(data)) else: # When an `index` is passed, the final index of # resulting dataframe will be whatever @@ -919,7 +919,7 @@ def _init_from_series_list(self, data, columns, index): f"not match length of index ({index_length})" ) - final_index = as_index(index) + final_index = cudf.Index(index) series_lengths = list(map(len, data)) data = numeric_normalize_types(*data) @@ -943,7 +943,7 @@ def _init_from_series_list(self, data, columns, index): # Setting `final_columns` to self._index so # that the resulting `transpose` will be have # columns set to `final_columns` - self._index = as_index(final_columns) + self._index = cudf.Index(final_columns) transpose = self.T else: @@ -987,9 +987,9 @@ def _init_from_list_like(self, data, index=None, columns=None): if index is None: index = RangeIndex(start=0, stop=len(data)) else: - index = as_index(index) + index = cudf.Index(index) - self._index = as_index(index) + self._index = cudf.Index(index) # list-of-dicts case if len(data) > 0 and isinstance(data[0], dict): data = DataFrame.from_pandas(pd.DataFrame(data)) @@ -1095,7 +1095,7 @@ def _init_from_dict_like( self._index = RangeIndex(0, num_rows) else: - self._index = as_index(index) + self._index = cudf.Index(index) if len(data): self._data.multiindex = True @@ -1410,7 +1410,7 @@ def __setitem__(self, arg, value): new_columns, verify=False ) if isinstance(value, (pd.Series, Series)): - self._index = as_index(value.index) + self._index = cudf.Index(value.index) elif len(value) > 0: self._index = RangeIndex(length) return @@ -1728,7 +1728,7 @@ def _concat( for cols in columns: table_index = None if 1 == first_data_column_position: - table_index = cudf.core.index.as_index(cols[0]) + table_index = cudf.Index(cols[0]) elif first_data_column_position > 1: table_index = DataFrame._from_data( data=dict( @@ -1780,9 +1780,7 @@ def _concat( if not 
isinstance(out.index, MultiIndex) and isinstance( out.index.dtype, cudf.CategoricalDtype ): - out = out.set_index( - cudf.core.index.as_index(out.index._values) - ) + out = out.set_index(cudf.Index(out.index._values)) for name, col in out._data.items(): out._data[name] = col._with_type_metadata( tables[0]._data[name].dtype @@ -2828,7 +2826,7 @@ def reindex( if columns is None: df = self else: - columns = as_index(columns) + columns = cudf.Index(columns) intersection = self._data.to_pandas_index().intersection( columns.to_pandas() ) @@ -3245,7 +3243,7 @@ def _insert(self, loc, name, value, nan_as_null=None, ignore_index=True): if len(self) == 0: if isinstance(value, (pd.Series, Series)): if not ignore_index: - self.index = as_index(value.index) + self.index = cudf.Index(value.index) elif (length := len(value)) > 0: if num_cols != 0: ca = self._data._from_columns_like_self( @@ -5654,7 +5652,7 @@ def from_records(cls, data, index=None, columns=None, nan_as_null=False): } if not is_scalar(index): - new_index = as_index(index) + new_index = cudf.Index(index) else: new_index = None @@ -5738,7 +5736,7 @@ def _from_arrays(cls, data, index=None, columns=None, nan_as_null=False): } if index is not None: - index = as_index(index) + index = cudf.Index(index) if isinstance(columns, (pd.Index, cudf.Index)): level_names = tuple(columns.names) @@ -6171,7 +6169,7 @@ def count(self, axis=0, numeric_only=False): for col in self._data.names ] }, - as_index(self._data.names), + cudf.Index(self._data.names), ) _SUPPORT_AXIS_LOOKUP = { @@ -6298,7 +6296,7 @@ def _reduce( source._data.names, names=source._data.level_names ) else: - idx = as_index(source._data.names) + idx = cudf.Index(source._data.names) return Series._from_data({None: as_column(result)}, idx) elif axis == 1: return source._apply_cupy_method_axis_1(op, **kwargs) diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index 9bb1995b836..4729233ee6e 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -186,10 +186,10 @@ def categories(self) -> "cudf.core.index.Index": Index(['b', 'a'], dtype='object') """ if self._categories is None: - return cudf.core.index.as_index( + return cudf.Index( cudf.core.column.column_empty(0, dtype="object", masked=False) ) - return cudf.core.index.as_index(self._categories, copy=False) + return cudf.Index(self._categories, copy=False) @property def type(self): diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 3e7a1ee6026..ac8b381cbec 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -2800,15 +2800,13 @@ def keys(self): nkeys = len(self._key_columns) if nkeys == 0: - return cudf.core.index.as_index([], name=None) + return cudf.Index([], name=None) elif nkeys > 1: return cudf.MultiIndex._from_data( dict(zip(range(nkeys), self._key_columns)) )._set_names(self.names) else: - return cudf.core.index.as_index( - self._key_columns[0], name=self.names[0] - ) + return cudf.Index(self._key_columns[0], name=self.names[0]) @property def values(self) -> cudf.core.frame.Frame: diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 4b09765fa46..7297ac4e929 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -1169,7 +1169,7 @@ def _concat(cls, objs): result = _concat_range_index(non_empties) else: data = concat_columns([o._values for o in non_empties]) - result = as_index(data) + result = Index(data) names = {obj.name 
for obj in objs} if len(names) == 1: @@ -1437,7 +1437,7 @@ def __repr__(self): def __getitem__(self, index): res = self._get_elements_from_column(index) if isinstance(res, ColumnBase): - res = as_index(res, name=self.name) + res = Index(res, name=self.name) return res @property # type: ignore @@ -1958,7 +1958,7 @@ def microsecond(self): >>> datetime_index.microsecond Index([0, 1, 2], dtype='int32') """ # noqa: E501 - return as_index( + return Index( ( # Need to manually promote column to int32 because # pandas-matching binop behaviour requires that this @@ -2209,7 +2209,7 @@ def _get_dt_field(self, field): mask=out_column.base_mask, offset=out_column.offset, ) - return as_index(out_column, name=self.name) + return Index(out_column, name=self.name) def _is_boolean(self): return False @@ -2522,9 +2522,7 @@ def days(self): Number of days for each element. """ # Need to specifically return `int64` to avoid overflow. - return as_index( - arbitrary=self._values.days, name=self.name, dtype="int64" - ) + return Index(self._values.days, name=self.name, dtype="int64") @property # type: ignore @_cudf_nvtx_annotate @@ -2532,9 +2530,7 @@ def seconds(self): """ Number of seconds (>= 0 and less than 1 day) for each element. """ - return as_index( - arbitrary=self._values.seconds, name=self.name, dtype="int32" - ) + return Index(self._values.seconds, name=self.name, dtype="int32") @property # type: ignore @_cudf_nvtx_annotate @@ -2542,9 +2538,7 @@ def microseconds(self): """ Number of microseconds (>= 0 and less than 1 second) for each element. """ - return as_index( - arbitrary=self._values.microseconds, name=self.name, dtype="int32" - ) + return Index(self._values.microseconds, name=self.name, dtype="int32") @property # type: ignore @_cudf_nvtx_annotate @@ -2553,9 +2547,7 @@ def nanoseconds(self): Number of nanoseconds (>= 0 and less than 1 microsecond) for each element. """ - return as_index( - arbitrary=self._values.nanoseconds, name=self.name, dtype="int32" - ) + return Index(self._values.nanoseconds, name=self.name, dtype="int32") @property # type: ignore @_cudf_nvtx_annotate @@ -2693,7 +2685,7 @@ def codes(self): """ The category codes of this categorical. 
""" - return as_index(self._values.codes) + return Index(self._values.codes) @property # type: ignore @_cudf_nvtx_annotate @@ -3137,7 +3129,7 @@ def _concat_range_index(indexes: List[RangeIndex]) -> BaseIndex: elif step is None: # First non-empty index had only one element if obj.start == start: - result = as_index(concat_columns([x._values for x in indexes])) + result = Index(concat_columns([x._values for x in indexes])) return result step = obj.start - start @@ -3145,7 +3137,7 @@ def _concat_range_index(indexes: List[RangeIndex]) -> BaseIndex: next_ is not None and obj.start != next_ ) if non_consecutive: - result = as_index(concat_columns([x._values for x in indexes])) + result = Index(concat_columns([x._values for x in indexes])) return result if step is not None: next_ = obj[-1] + step diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 5a466f20f8c..688b268d478 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -3638,7 +3638,7 @@ def _align_to_index( sort: bool = True, allow_non_unique: bool = False, ) -> Self: - index = cudf.core.index.as_index(index) + index = cudf.Index(index) if self.index.equals(index): return self @@ -3713,7 +3713,7 @@ def _reindex( raise ValueError( "cannot reindex on an axis with duplicate labels" ) - index = cudf.core.index.as_index( + index = cudf.Index( index, name=getattr(index, "name", self.index.name) ) diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 049fac45ba8..11b4b9154a2 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -30,7 +30,6 @@ BaseIndex, _get_indexer_basic, _lexsorted_equal_range, - as_index, ) from cudf.core.join._join_helpers import _match_join_keys from cudf.utils.dtypes import is_column_like @@ -824,7 +823,7 @@ def _index_and_downcast(self, result, index, index_key): # it into an Index and name the final index values according # to that column's name. 
*_, last_column = index._data.columns - out_index = as_index(last_column) + out_index = cudf.Index(last_column) out_index.name = index.names[-1] index = out_index elif out_index._num_columns > 1: @@ -1082,7 +1081,9 @@ def get_level_values(self, level): raise KeyError(f"Level not found: '{level}'") else: level_idx = colnames.index(level) - level_values = as_index(self._data[level], name=self.names[level_idx]) + level_values = cudf.Index( + self._data[level], name=self.names[level_idx] + ) return level_values def _is_numeric(self): diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 169f7c11cf9..a52b583d3b4 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -596,7 +596,7 @@ def __init__( name_from_data = data.name column = as_column(data, nan_as_null=nan_as_null, dtype=dtype) if isinstance(data, pd.Series): - index_from_data = as_index(data.index) + index_from_data = cudf.Index(data.index) elif isinstance(data, Series): index_from_data = data.index elif isinstance(data, ColumnAccessor): @@ -612,7 +612,7 @@ def __init__( column = as_column( list(data.values()), nan_as_null=nan_as_null, dtype=dtype ) - index_from_data = as_index(list(data.keys())) + index_from_data = cudf.Index(list(data.keys())) else: # Using `getattr_static` to check if # `data` is on device memory and perform @@ -649,7 +649,7 @@ def __init__( name = name_from_data if index is not None: - index = as_index(index) + index = cudf.Index(index) if index_from_data is not None: first_index = index_from_data @@ -5241,7 +5241,7 @@ def isclose(a, b, rtol=1e-05, atol=1e-08, equal_nan=False): if isinstance(a, cudf.Series) and isinstance(b, cudf.Series): b = b.reindex(a.index) - index = as_index(a.index) + index = cudf.Index(a.index) a_col = as_column(a) a_array = cupy.asarray(a_col.data_array_view(mode="read")) diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index 12a1ecc68e0..f002a838fa9 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -18,7 +18,6 @@ ) from cudf.api.types import is_integer, is_scalar from cudf.core import column -from cudf.core.index import as_index # https://github.com/pandas-dev/pandas/blob/2.2.x/pandas/core/tools/datetimes.py#L1112 _unit_map = { @@ -287,13 +286,13 @@ def to_datetime( utc=utc, ) if isinstance(arg, (cudf.BaseIndex, pd.Index)): - return as_index(col, name=arg.name) + return cudf.Index(col, name=arg.name) elif isinstance(arg, (cudf.Series, pd.Series)): return cudf.Series(col, index=arg.index, name=arg.name) elif is_scalar(arg): return col.element_indexing(0) else: - return as_index(col) + return cudf.Index(col) except Exception as e: if errors == "raise": raise e diff --git a/python/cudf/cudf/tests/test_array_function.py b/python/cudf/cudf/tests/test_array_function.py index 58939f0ddd9..e6b89e2c5fa 100644 --- a/python/cudf/cudf/tests/test_array_function.py +++ b/python/cudf/cudf/tests/test_array_function.py @@ -108,7 +108,7 @@ def test_array_func_missing_cudf_dataframe(pd_df, func): ], ) def test_array_func_cudf_index(np_ar, func): - cudf_index = cudf.core.index.as_index(cudf.Series(np_ar)) + cudf_index = cudf.Index(cudf.Series(np_ar)) expect = func(np_ar) got = func(cudf_index) if np.isscalar(expect): @@ -128,7 +128,7 @@ def test_array_func_cudf_index(np_ar, func): ], ) def test_array_func_missing_cudf_index(np_ar, func): - cudf_index = cudf.core.index.as_index(cudf.Series(np_ar)) + cudf_index = cudf.Index(cudf.Series(np_ar)) with 
pytest.raises(TypeError): func(cudf_index) diff --git a/python/cudf/cudf/tests/test_binops.py b/python/cudf/cudf/tests/test_binops.py index 5d0c403daa2..fa371914c3e 100644 --- a/python/cudf/cudf/tests/test_binops.py +++ b/python/cudf/cudf/tests/test_binops.py @@ -12,10 +12,9 @@ import pytest import cudf -from cudf import Series +from cudf import Index, Series from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION from cudf.core.buffer.spill_manager import get_global_manager -from cudf.core.index import as_index from cudf.testing import _utils as utils from cudf.utils.dtypes import ( BOOL_TYPES, @@ -186,8 +185,8 @@ def test_series_binop(binop, obj_class): sr2 = Series(arr2) if obj_class == "Index": - sr1 = as_index(sr1) - sr2 = as_index(sr2) + sr1 = Index(sr1) + sr2 = Index(sr2) result = binop(sr1, sr2) expect = binop(pd.Series(arr1), pd.Series(arr2)) @@ -225,7 +224,7 @@ def test_series_binop_scalar(nelem, binop, obj_class, use_cudf_scalar): sr = Series(arr) if obj_class == "Index": - sr = as_index(sr) + sr = Index(sr) if use_cudf_scalar: result = binop(sr, rhs) @@ -251,8 +250,8 @@ def test_series_bitwise_binop(binop, obj_class, lhs_dtype, rhs_dtype): sr2 = Series(arr2) if obj_class == "Index": - sr1 = as_index(sr1) - sr2 = as_index(sr2) + sr1 = Index(sr1) + sr2 = Index(sr2) result = binop(sr1, sr2) @@ -274,8 +273,8 @@ def test_series_compare(cmpop, obj_class, dtype): sr2 = Series(arr2) if obj_class == "Index": - sr1 = as_index(sr1) - sr2 = as_index(sr2) + sr1 = Index(sr1) + sr2 = Index(sr2) result1 = cmpop(sr1, sr1) result2 = cmpop(sr2, sr2) @@ -402,7 +401,7 @@ def test_series_compare_scalar( rhs = cudf.Scalar(rhs) if obj_class == "Index": - sr1 = as_index(sr1) + sr1 = Index(sr1) result1 = cmpop(sr1, rhs) result2 = cmpop(rhs, sr1) @@ -488,8 +487,8 @@ def test_series_binop_mixed_dtype(binop, lhs_dtype, rhs_dtype, obj_class): sr2 = Series(rhs) if obj_class == "Index": - sr1 = as_index(sr1) - sr2 = as_index(sr2) + sr1 = Index(sr1) + sr2 = Index(sr2) result = binop(Series(sr1), Series(sr2)) @@ -513,8 +512,8 @@ def test_series_cmpop_mixed_dtype(cmpop, lhs_dtype, rhs_dtype, obj_class): sr2 = Series(rhs) if obj_class == "Index": - sr1 = as_index(sr1) - sr2 = as_index(sr2) + sr1 = Index(sr1) + sr2 = Index(sr2) result = cmpop(Series(sr1), Series(sr2)) @@ -538,7 +537,7 @@ def test_series_reflected_ops_scalar(func, dtype, obj_class): # class typing if obj_class == "Index": - gs = as_index(gs) + gs = Index(gs) gs_result = func(gs) @@ -588,7 +587,7 @@ def test_series_reflected_ops_cudf_scalar(funcs, dtype, obj_class): # class typing if obj_class == "Index": - gs = as_index(gs) + gs = Index(gs) gs_result = gpu_func(gs) diff --git a/python/cudf/cudf/tests/test_contains.py b/python/cudf/cudf/tests/test_contains.py index 15dfa111860..a65ab1780b6 100644 --- a/python/cudf/cudf/tests/test_contains.py +++ b/python/cudf/cudf/tests/test_contains.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2022, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. 
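# [Editor's note] A minimal, hedged illustration of the convention this
# patch adopts throughout: build indexes with the public cudf.Index
# constructor rather than the internal cudf.core.index.as_index helper,
# so internal code paths exercise the public behavior. The values are
# arbitrary; only the constructor usage is the point.
import cudf

idx = cudf.Index([1, 2, 3], name="idx")  # public constructor, now preferred
assert idx.name == "idx"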
import datetime @@ -8,7 +8,7 @@ import cudf from cudf import Series -from cudf.core.index import RangeIndex, as_index +from cudf.core.index import Index, RangeIndex from cudf.testing._utils import ( DATETIME_TYPES, NUMERIC_TYPES, @@ -74,7 +74,7 @@ def test_series_contains(values, item, expected): @pytest.mark.parametrize("values, item, expected", testdata_all) def test_index_contains(values, item, expected): - index = as_index(values) + index = Index(values) assert_eq(expected, item in index) diff --git a/python/cudf/cudf/tests/test_dlpack.py b/python/cudf/cudf/tests/test_dlpack.py index aafe920d3a1..7ea3979b0f1 100644 --- a/python/cudf/cudf/tests/test_dlpack.py +++ b/python/cudf/cudf/tests/test_dlpack.py @@ -101,7 +101,7 @@ def test_to_dlpack_index(data_1d): with expectation: if np.isnan(data_1d).any(): pytest.skip("Nulls not allowed in Index") - gi = cudf.core.index.as_index(data_1d) + gi = cudf.Index(data_1d) dlt = gi.to_dlpack() # PyCapsules are a C-API thing so couldn't come up with a better way diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index b92ae1b3364..3d6c71ebc1b 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -17,13 +17,7 @@ import cudf from cudf.api.extensions import no_default from cudf.api.types import is_bool_dtype -from cudf.core.index import ( - CategoricalIndex, - DatetimeIndex, - Index, - RangeIndex, - as_index, -) +from cudf.core.index import CategoricalIndex, DatetimeIndex, Index, RangeIndex from cudf.testing._utils import ( ALL_TYPES, FLOAT_TYPES, @@ -200,11 +194,11 @@ def test_pandas_as_index(): pdf_category_index = pd.CategoricalIndex(["a", "b", "c", "b", "a"]) # Define cudf Indexes - gdf_int_index = as_index(pdf_int_index) - gdf_uint_index = as_index(pdf_uint_index) - gdf_float_index = as_index(pdf_float_index) - gdf_datetime_index = as_index(pdf_datetime_index) - gdf_category_index = as_index(pdf_category_index) + gdf_int_index = Index(pdf_int_index) + gdf_uint_index = Index(pdf_uint_index) + gdf_float_index = Index(pdf_float_index) + gdf_datetime_index = Index(pdf_datetime_index) + gdf_category_index = Index(pdf_category_index) # Check instance types assert isinstance(gdf_int_index, Index) @@ -232,7 +226,7 @@ def test_pandas_as_index(): @pytest.mark.parametrize("name", SERIES_OR_INDEX_NAMES) def test_index_rename(initial_name, name): pds = pd.Index([1, 2, 3], name=initial_name) - gds = as_index(pds) + gds = Index(pds) assert_eq(pds, gds) @@ -245,18 +239,18 @@ def test_index_rename(initial_name, name): and if name is being handles in recursive creation. 
""" pds = pd.Index(expect) - gds = as_index(got) + gds = Index(got) assert_eq(pds, gds) pds = pd.Index(pds, name="abc") - gds = as_index(gds, name="abc") + gds = Index(gds, name="abc") assert_eq(pds, gds) def test_index_rename_inplace(): pds = pd.Index([1, 2, 3], name="asdf") - gds = as_index(pds) + gds = Index(pds) # inplace=False should yield a deep copy gds_renamed_deep = gds.rename("new_name", inplace=False) @@ -280,7 +274,7 @@ def test_index_rename_preserves_arg(): assert idx1.name == "orig_name" # a new object but referencing the same data - idx3 = as_index(idx1, name="last_name") + idx3 = Index(idx1, name="last_name") assert idx3.name == "last_name" assert idx1.name == "orig_name" @@ -456,7 +450,7 @@ def test_from_pandas_gen(): def test_index_names(): - idx = cudf.core.index.as_index([1, 2, 3], name="idx") + idx = Index([1, 2, 3], name="idx") assert idx.names == ("idx",) @@ -874,8 +868,8 @@ def test_index_equals(data, other): pd_data = pd.Index(data) pd_other = pd.Index(other) - gd_data = cudf.core.index.as_index(data) - gd_other = cudf.core.index.as_index(other) + gd_data = Index(data) + gd_other = Index(other) expected = pd_data.equals(pd_other) actual = gd_data.equals(gd_other) @@ -920,8 +914,8 @@ def test_index_categories_equal(data, other): pd_data = pd.Index(data).astype("category") pd_other = pd.Index(other) - gd_data = cudf.core.index.as_index(data).astype("category") - gd_other = cudf.core.index.as_index(other) + gd_data = Index(data).astype("category") + gd_other = Index(other) expected = pd_data.equals(pd_other) actual = gd_data.equals(gd_other) @@ -970,7 +964,7 @@ def test_index_equal_misc(data, other): pd_data = pd.Index(data) pd_other = other - gd_data = cudf.core.index.as_index(data) + gd_data = Index(data) gd_other = other expected = pd_data.equals(pd_other) @@ -1089,8 +1083,8 @@ def test_index_empty_append_name_conflict(): ], ) def test_index_append_error(data, other): - gd_data = cudf.core.index.as_index(data) - gd_other = cudf.core.index.as_index(other) + gd_data = Index(data) + gd_other = Index(other) got_dtype = ( gd_other.dtype diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py index dd731fab8f3..f143112a45f 100644 --- a/python/cudf/cudf/tests/test_multiindex.py +++ b/python/cudf/cudf/tests/test_multiindex.py @@ -21,7 +21,6 @@ import cudf from cudf.api.extensions import no_default from cudf.core.column import as_column -from cudf.core.index import as_index from cudf.testing._utils import ( assert_eq, assert_exceptions_equal, @@ -158,8 +157,6 @@ def test_multiindex_swaplevel(): def test_string_index(): - from cudf.core.index import Index - pdf = pd.DataFrame(np.random.rand(5, 5)) gdf = cudf.from_pandas(pdf) stringIndex = ["a", "b", "c", "d", "e"] @@ -170,11 +167,11 @@ def test_string_index(): pdf.index = stringIndex gdf.index = stringIndex assert_eq(pdf, gdf) - stringIndex = Index(["a", "b", "c", "d", "e"], name="name") + stringIndex = cudf.Index(["a", "b", "c", "d", "e"], name="name") pdf.index = stringIndex.to_pandas() gdf.index = stringIndex assert_eq(pdf, gdf) - stringIndex = as_index(as_column(["a", "b", "c", "d", "e"]), name="name") + stringIndex = cudf.Index(as_column(["a", "b", "c", "d", "e"]), name="name") pdf.index = stringIndex.to_pandas() gdf.index = stringIndex assert_eq(pdf, gdf) diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py index de771a56e77..801c530da43 100644 --- a/python/cudf/cudf/tests/test_string.py +++ b/python/cudf/cudf/tests/test_string.py @@ -16,7 
+16,7 @@ import cudf from cudf import concat from cudf.core.column.string import StringColumn -from cudf.core.index import Index, as_index +from cudf.core.index import Index from cudf.testing._utils import ( DATETIME_TYPES, NUMERIC_TYPES, @@ -1500,7 +1500,7 @@ def test_strings_partition(data): assert_eq(ps.str.partition(","), gs.str.partition(",")) assert_eq(ps.str.partition("-"), gs.str.partition("-")) - gi = as_index(data, name="new name") + gi = cudf.Index(data, name="new name") pi = pd.Index(data, name="new name") assert_eq(pi.str.partition(), gi.str.partition()) assert_eq(pi.str.partition(","), gi.str.partition(",")) @@ -1639,7 +1639,7 @@ def test_strings_strip_tests(data, to_strip): ps.str.lstrip(to_strip=to_strip), gs.str.lstrip(to_strip=to_strip) ) - gi = as_index(data) + gi = cudf.Index(data) pi = pd.Index(data) assert_eq(pi.str.strip(to_strip=to_strip), gi.str.strip(to_strip=to_strip)) @@ -1696,7 +1696,7 @@ def test_strings_filling_tests(data, width, fillchar): gs.str.rjust(width=width, fillchar=fillchar), ) - gi = as_index(data) + gi = cudf.Index(data) pi = pd.Index(data) assert_eq( @@ -1731,7 +1731,7 @@ def test_strings_zfill_tests(data, width): assert_eq(ps.str.zfill(width=width), gs.str.zfill(width=width)) - gi = as_index(data) + gi = cudf.Index(data) pi = pd.Index(data) assert_eq(pi.str.zfill(width=width), gi.str.zfill(width=width)) @@ -1763,7 +1763,7 @@ def test_strings_pad_tests(data, width, side, fillchar): gs.str.pad(width=width, side=side, fillchar=fillchar), ) - gi = as_index(data) + gi = cudf.Index(data) pi = pd.Index(data) assert_eq( @@ -1807,7 +1807,7 @@ def test_string_wrap(data, width): ), ) - gi = as_index(data) + gi = cudf.Index(data) pi = pd.Index(data) assert_eq( @@ -1941,7 +1941,7 @@ def test_string_replace_with_backrefs(find, replace): expected = ps.str.replace(find, replace, regex=True) assert_eq(got, expected) - got = as_index(gs).str.replace_with_backrefs(find, replace) + got = cudf.Index(gs).str.replace_with_backrefs(find, replace) expected = pd.Index(ps).str.replace(find, replace, regex=True) assert_eq(got, expected) @@ -2227,7 +2227,7 @@ def test_string_str_rindex(data, sub, er): assert_eq(ps.str.rindex(sub), gs.str.rindex(sub), check_dtype=False) assert_eq( pd.Index(ps).str.rindex(sub), - as_index(gs).str.rindex(sub), + cudf.Index(gs).str.rindex(sub), exact=False, ) @@ -2336,7 +2336,7 @@ def test_string_str_match(data, pat): assert_eq(ps.str.match(pat), gs.str.match(pat)) assert_eq( - pd.Index(pd.Index(ps).str.match(pat)), as_index(gs).str.match(pat) + pd.Index(pd.Index(ps).str.match(pat)), cudf.Index(gs).str.match(pat) ) @@ -2363,7 +2363,7 @@ def test_string_str_translate(data): ) assert_eq( pd.Index(ps).str.translate(str.maketrans({"a": "z"})), - as_index(gs).str.translate(str.maketrans({"a": "z"})), + cudf.Index(gs).str.translate(str.maketrans({"a": "z"})), ) assert_eq( ps.str.translate(str.maketrans({"a": "z", "i": "$", "z": "1"})), @@ -2373,7 +2373,7 @@ def test_string_str_translate(data): pd.Index(ps).str.translate( str.maketrans({"a": "z", "i": "$", "z": "1"}) ), - as_index(gs).str.translate( + cudf.Index(gs).str.translate( str.maketrans({"a": "z", "i": "$", "z": "1"}) ), ) @@ -2389,7 +2389,7 @@ def test_string_str_translate(data): pd.Index(ps).str.translate( str.maketrans({"+": "-", "-": "$", "?": "!", "B": "."}) ), - as_index(gs).str.translate( + cudf.Index(gs).str.translate( str.maketrans({"+": "-", "-": "$", "?": "!", "B": "."}) ), ) @@ -2779,8 +2779,8 @@ def test_string_str_byte_count(data, expected): actual = sr.str.byte_count() 
assert_eq(expected, actual) - si = as_index(data) - expected = as_index(expected, dtype="int32") + si = cudf.Index(data) + expected = cudf.Index(expected, dtype="int32") actual = si.str.byte_count() assert_eq(expected, actual) @@ -2828,8 +2828,8 @@ def test_str_isinteger(data, expected): actual = sr.str.isinteger() assert_eq(expected, actual) - sr = as_index(data) - expected = as_index(expected) + sr = cudf.Index(data) + expected = cudf.Index(expected) actual = sr.str.isinteger() assert_eq(expected, actual) @@ -2884,8 +2884,8 @@ def test_str_isfloat(data, expected): actual = sr.str.isfloat() assert_eq(expected, actual) - sr = as_index(data) - expected = as_index(expected) + sr = cudf.Index(data) + expected = cudf.Index(expected) actual = sr.str.isfloat() assert_eq(expected, actual) diff --git a/python/cudf/cudf/tests/text/test_text_methods.py b/python/cudf/cudf/tests/text/test_text_methods.py index 6ecead862bb..6bd3b99bae1 100644 --- a/python/cudf/cudf/tests/text/test_text_methods.py +++ b/python/cudf/cudf/tests/text/test_text_methods.py @@ -539,7 +539,7 @@ def test_character_tokenize_series(): def test_character_tokenize_index(): - sr = cudf.core.index.as_index( + sr = cudf.Index( [ "hello world", "sdf", @@ -550,7 +550,7 @@ def test_character_tokenize_index(): ), ] ) - expected = cudf.core.index.as_index( + expected = cudf.Index( [ "h", "e", @@ -648,8 +648,8 @@ def test_character_tokenize_index(): actual = sr.str.character_tokenize() assert_eq(expected, actual) - sr = cudf.core.index.as_index(["a"]) - expected = cudf.core.index.as_index(["a"]) + sr = cudf.Index(["a"]) + expected = cudf.Index(["a"]) actual = sr.str.character_tokenize() assert_eq(expected, actual) From db1b36592ba5d76158d1c6e1a3c6440c25a382e7 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Wed, 5 Jun 2024 09:48:20 -0700 Subject: [PATCH 046/340] Migrate string replace.pxd to pylibcudf (#15839) xref #15162 Change replace.pxd to use pylibcudf APIs. Authors: - Thomas Li (https://github.com/lithomas1) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/15839 --- .../user_guide/api_docs/pylibcudf/index.rst | 8 +- .../api_docs/pylibcudf/strings/index.rst | 7 + .../api_docs/pylibcudf/strings/replace.rst | 6 + .../_lib/pylibcudf/strings/CMakeLists.txt | 4 +- .../cudf/_lib/pylibcudf/strings/__init__.pxd | 2 +- .../cudf/_lib/pylibcudf/strings/__init__.py | 2 +- .../cudf/_lib/pylibcudf/strings/replace.pxd | 25 +++ .../cudf/_lib/pylibcudf/strings/replace.pyx | 162 ++++++++++++++++++ python/cudf/cudf/_lib/strings/replace.pyx | 99 +++-------- .../pylibcudf_tests/test_string_replace.py | 126 ++++++++++++++ 10 files changed, 362 insertions(+), 79 deletions(-) create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/strings/replace.rst create mode 100644 python/cudf/cudf/_lib/pylibcudf/strings/replace.pxd create mode 100644 python/cudf/cudf/_lib/pylibcudf/strings/replace.pyx create mode 100644 python/cudf/cudf/pylibcudf_tests/test_string_replace.py diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst index 58fea77adaa..b6ad1157511 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst @@ -6,7 +6,7 @@ This page provides API documentation for pylibcudf. .. 
toctree:: :maxdepth: 1 - :caption: API Documentation + :caption: Top-level modules aggregation binaryop @@ -32,3 +32,9 @@ This page provides API documentation for pylibcudf. table types unary + +.. toctree:: + :maxdepth: 2 + :caption: Subpackages + + strings/index.rst diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst new file mode 100644 index 00000000000..8970fc80c0b --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst @@ -0,0 +1,7 @@ +strings +======= + +.. toctree:: + :maxdepth: 1 + + replace diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/replace.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/replace.rst new file mode 100644 index 00000000000..9575ec226a7 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/replace.rst @@ -0,0 +1,6 @@ +======= +replace +======= + +.. automodule:: cudf._lib.pylibcudf.strings.replace + :members: diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt index 0e9c1c916f0..c9a983e24f4 100644 --- a/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt +++ b/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt @@ -12,11 +12,11 @@ # the License. # ============================================================================= -set(cython_sources capitalize.pyx case.pyx char_types.pyx find.pyx) +set(cython_sources capitalize.pyx case.pyx char_types.pyx find.pyx replace.pyx) set(linked_libraries cudf::cudf) rapids_cython_create_modules( CXX SOURCE_FILES "${cython_sources}" - LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_ ASSOCIATED_TARGETS cudf + LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_strings_ ASSOCIATED_TARGETS cudf ) diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd index ec3dbc150b5..7563df8a107 100644 --- a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd @@ -1,3 +1,3 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from . cimport capitalize, case, char_types, find +from . cimport capitalize, case, char_types, find, replace diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py index 3793bda0aa4..cb4f0e38f97 100644 --- a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py +++ b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py @@ -1,3 +1,3 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from . import capitalize, case, char_types, find +from . import capitalize, case, char_types, find, replace diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/replace.pxd b/python/cudf/cudf/_lib/pylibcudf/strings/replace.pxd new file mode 100644 index 00000000000..52e2dc3c738 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/strings/replace.pxd @@ -0,0 +1,25 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
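# [Editor's note] A hedged usage sketch of the replace API declared below,
# modeled on the test file added later in this patch (test_string_replace.py);
# the data values are illustrative and not part of the patch itself.
import pyarrow as pa
import cudf._lib.pylibcudf as plc

col = plc.interop.from_arrow(pa.array(["aa", "ab", None], type=pa.string()))
target = plc.interop.from_arrow(pa.scalar("a", type=pa.string()))
repl = plc.interop.from_arrow(pa.scalar("z", type=pa.string()))
got = plc.strings.replace.replace(col, target, repl, -1)  # -1 replaces all occurrences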
+ +from cudf._lib.pylibcudf.column cimport Column +from cudf._lib.pylibcudf.libcudf.types cimport size_type +from cudf._lib.pylibcudf.scalar cimport Scalar + + +cpdef Column replace( + Column input, + Scalar target, + Scalar repl, + size_type maxrepl = * +) +cpdef Column replace_multiple( + Column input, + Column target, + Column repl, + size_type maxrepl = * +) +cpdef Column replace_slice( + Column input, + Scalar repl = *, + size_type start = *, + size_type stop = * +) diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/replace.pyx b/python/cudf/cudf/_lib/pylibcudf/strings/replace.pyx new file mode 100644 index 00000000000..c757150a600 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/strings/replace.pyx @@ -0,0 +1,162 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move + +from cudf._lib.pylibcudf.column cimport Column +from cudf._lib.pylibcudf.libcudf.column.column cimport column +from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar +from cudf._lib.pylibcudf.libcudf.scalar.scalar_factories cimport ( + make_string_scalar as cpp_make_string_scalar, +) +from cudf._lib.pylibcudf.libcudf.strings.replace cimport ( + replace as cpp_replace, + replace_multiple as cpp_replace_multiple, + replace_slice as cpp_replace_slice, +) +from cudf._lib.pylibcudf.libcudf.types cimport size_type +from cudf._lib.pylibcudf.scalar cimport Scalar + + +cpdef Column replace( + Column input, + Scalar target, + Scalar repl, + size_type maxrepl = -1 +): + """Replaces target string within each string with the specified replacement string. + + Null string entries will return null output string entries. + + For details, see :cpp:func:`replace`. + + Parameters + ---------- + input : Column + The input strings + target : Scalar + String to search for in each string. + repl : Scalar + String to replace target with. + maxrepl : size_type, default -1 + Maximum times to replace if target appears multiple times in the input string. + Default of -1 specifies to replace all occurrences of target in each string. + + Returns + ------- + pylibcudf.Column + New string column with target replaced. + """ + cdef: + unique_ptr[column] c_result + const string_scalar* target_str + const string_scalar* repl_str + + target_str = (target.c_obj.get()) + repl_str = (repl.c_obj.get()) + + with nogil: + c_result = move(cpp_replace( + input.view(), + target_str[0], + repl_str[0], + maxrepl, + )) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column replace_multiple( + Column input, + Column target, + Column repl, + size_type maxrepl = -1 +): + """Replaces target string within each string with the specified replacement string. + + Null string entries will return null output string entries. + + For details, see :cpp:func:`replace_multiple`. + + Parameters + ---------- + input : Column + The input strings + target : Column + Column containing strings to search for in the input column. + repl : Column + Column containing strings to replace target with. + Each target, when found, will be replaced by the value at the + corresponding index in the repl Column. + + Must be of the same length as target. + + Returns + ------- + pylibcudf.Column + New string column with target replaced. 
+ """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move(cpp_replace_multiple( + input.view(), + target.view(), + repl.view(), + )) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column replace_slice( + Column input, + # TODO: default scalar values + # https://github.com/rapidsai/cudf/issues/15505 + Scalar repl = None, + size_type start = 0, + size_type stop = -1 +): + """Replaces each string in the column with the provided repl string + within the [start,stop) character position range. + + Null string entries will return null output string entries. + This function can be used to insert a string into specific position + by specifying the same position value for start and stop. + The repl string can be appended to each string by specifying -1 + for both start and stop. + + For details, see :cpp:func:`replace_slice`. + + Parameters + ---------- + input : Column + The input strings + repl : Scalar, default "" + String scalar to replace target with. + start : size_type, default 0 + Start position where repl will be added. + stop : size_type, default -1 + End position (exclusive) to use for replacement. + Returns + ------- + pylibcudf.Column + New string column + """ + cdef unique_ptr[column] c_result + + if repl is None: + repl = Scalar.from_libcudf( + cpp_make_string_scalar("".encode()) + ) + + cdef const string_scalar* scalar_str = (repl.c_obj.get()) + + with nogil: + c_result = move(cpp_replace_slice( + input.view(), + scalar_str[0], + start, + stop + )) + + return Column.from_libcudf(move(c_result)) diff --git a/python/cudf/cudf/_lib/strings/replace.pyx b/python/cudf/cudf/_lib/strings/replace.pyx index 2d9330a8a24..374831f1833 100644 --- a/python/cudf/cudf/_lib/strings/replace.pyx +++ b/python/cudf/cudf/_lib/strings/replace.pyx @@ -1,23 +1,15 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. 
from libc.stdint cimport int32_t -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move from cudf.core.buffer import acquire_spill_lock from cudf._lib.column cimport Column -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view -from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar -from cudf._lib.pylibcudf.libcudf.strings.replace cimport ( - replace as cpp_replace, - replace_multiple as cpp_replace_multiple, - replace_slice as cpp_replace_slice, -) from cudf._lib.pylibcudf.libcudf.types cimport size_type from cudf._lib.scalar cimport DeviceScalar +import cudf._lib.pylibcudf as plc + @acquire_spill_lock() def slice_replace(Column source_strings, @@ -32,22 +24,12 @@ def slice_replace(Column source_strings, cdef DeviceScalar repl = py_repl.device_value - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - cdef const string_scalar* scalar_str = ( - repl.get_raw_ptr() - ) - - with nogil: - c_result = move(cpp_replace_slice( - source_view, - scalar_str[0], - start, - stop - )) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf(plc.strings.replace.replace_slice( + source_strings.to_pylibcudf(mode="read"), + repl.c_value, + start, + stop + )) @acquire_spill_lock() @@ -61,22 +43,12 @@ def insert(Column source_strings, cdef DeviceScalar repl = py_repl.device_value - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - cdef const string_scalar* scalar_str = ( - repl.get_raw_ptr() - ) - - with nogil: - c_result = move(cpp_replace_slice( - source_view, - scalar_str[0], - start, - start - )) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf(plc.strings.replace.replace_slice( + source_strings.to_pylibcudf(mode="read"), + repl.c_value, + start, + start, + )) @acquire_spill_lock() @@ -92,25 +64,12 @@ def replace(Column source_strings, cdef DeviceScalar target = py_target.device_value cdef DeviceScalar repl = py_repl.device_value - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - cdef const string_scalar* scalar_target = ( - target.get_raw_ptr() - ) - cdef const string_scalar* scalar_repl = ( - repl.get_raw_ptr() - ) - - with nogil: - c_result = move(cpp_replace( - source_view, - scalar_target[0], - scalar_repl[0], - maxrepl - )) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf(plc.strings.replace.replace( + source_strings.to_pylibcudf(mode="read"), + target.c_value, + repl.c_value, + maxrepl + )) @acquire_spill_lock() @@ -121,16 +80,8 @@ def replace_multi(Column source_strings, Returns a Column after replacing occurrences of patterns `target_strings` with `repl_strings` in `source_strings`. 
""" - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - cdef column_view target_view = target_strings.view() - cdef column_view repl_view = repl_strings.view() - - with nogil: - c_result = move(cpp_replace_multiple( - source_view, - target_view, - repl_view - )) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf(plc.strings.replace.replace_multiple( + source_strings.to_pylibcudf(mode="read"), + target_strings.to_pylibcudf(mode="read"), + repl_strings.to_pylibcudf(mode="read"), + )) diff --git a/python/cudf/cudf/pylibcudf_tests/test_string_replace.py b/python/cudf/cudf/pylibcudf_tests/test_string_replace.py new file mode 100644 index 00000000000..f20edf6a506 --- /dev/null +++ b/python/cudf/cudf/pylibcudf_tests/test_string_replace.py @@ -0,0 +1,126 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pyarrow as pa +import pytest +from utils import assert_column_eq + +import cudf._lib.pylibcudf as plc + + +@pytest.fixture(scope="module") +def data_col(): + pa_data_col = pa.array( + ["a", "c", "A", "aa", None, "aaaaaaaaa", "AAAA", "ÁÁÁÁ"], + type=pa.string(), + ) + return pa_data_col, plc.interop.from_arrow(pa_data_col) + + +@pytest.fixture(scope="module", params=["a", "c", "A", "Á", "aa", "ÁÁÁ"]) +def scalar_repl_target(request): + pa_target = pa.scalar(request.param, type=pa.string()) + return request.param, plc.interop.from_arrow(pa_target) + + +@pytest.fixture(scope="module", params=["b", "B", "", "B́"]) +def scalar_repl(request): + pa_repl = pa.scalar(request.param, type=pa.string()) + return request.param, plc.interop.from_arrow(pa_repl) + + +@pytest.fixture( + scope="module", + params=[ + ["a", "c", "A", "ÁÁÁÁ"], + ], +) +def col_repl_target(request): + pa_target = pa.array(request.param, type=pa.string()) + return (pa_target, plc.interop.from_arrow(pa_target)) + + +@pytest.fixture( + scope="module", + params=[ + [ + "", + "z", + "XX", + "blahblah", + ] + ], +) +def col_repl(request): + pa_repl = pa.array(request.param, type=pa.string()) + return (pa_repl, plc.interop.from_arrow(pa_repl)) + + +@pytest.mark.parametrize("maxrepl", [-1, 1, 2, 10]) +def test_replace(data_col, scalar_repl_target, scalar_repl, maxrepl): + pa_data_col, plc_data_col = data_col + pa_target, plc_target = scalar_repl_target + pa_repl, plc_repl = scalar_repl + got = plc.strings.replace.replace( + plc_data_col, plc_target, plc_repl, maxrepl + ) + + expected = pa.compute.replace_substring( + pa_data_col, + pattern=pa_target, + replacement=pa_repl, + max_replacements=maxrepl, + ) + + assert_column_eq(expected, got) + + +@pytest.mark.parametrize("startstop", [(0, -1), (0, 0), (1, 3)]) +def test_replace_slice(data_col, scalar_repl, startstop): + pa_data_col, plc_data_col = data_col + pa_repl, plc_repl = scalar_repl + start, stop = startstop + got = plc.strings.replace.replace_slice( + plc_data_col, plc_repl, start, stop + ) + + if stop == -1: + # pyarrow doesn't support -1 as stop, so just set to really big number + + # TODO: once libcudf's count_characters() is migrated, we can call + # count_characters on the input, take the max and set stop to that + stop = 1000 + + expected = pa.compute.utf8_replace_slice(pa_data_col, start, stop, pa_repl) + + assert_column_eq(expected, got) + + +def test_replace_col(data_col, col_repl_target, col_repl): + pa_data_col, plc_data_col = data_col + pa_target, plc_target = col_repl_target + pa_repl, plc_repl = col_repl + got = plc.strings.replace.replace_multiple( + plc_data_col, plc_target, plc_repl + ) + + # 
There's nothing in pyarrow that does string replace with columns + # for targets/repls, so let's implement our own in python + + def replace_list(elem, targets, repls): + for target, repl in zip(targets, repls): + res = elem.replace(target, repl) + if res != elem: + return res + + targets = pa_target.to_pylist() + repls = pa_repl.to_pylist() + + expected = pa.array( + [ + replace_list(elem, targets, repls) if elem is not None else None + for elem in pa_data_col.to_pylist() + ], + type=pa.string(), + ) + + assert_column_eq(expected, got) From 57aeeb78d85e169ac18b82f51d2b1cbd01b0608d Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 5 Jun 2024 06:49:57 -1000 Subject: [PATCH 047/340] Make Frame._dtype an iterator instead of a dict (#15920) A lot of the usages of `Frame._dtype` didn't require the previous `dict` return type since that was just re-iterated over anyways. Also removed a redundant `tuple` call in `Frame._column_names` and `Frame._columns` Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/15920 --- python/cudf/cudf/core/dataframe.py | 4 ++-- python/cudf/cudf/core/frame.py | 16 +++++++--------- python/cudf/cudf/core/groupby/groupby.py | 16 +++------------- python/cudf/cudf/core/indexed_frame.py | 10 +++++----- python/cudf/cudf/io/csv.py | 5 ++--- python/cudf/cudf/io/json.py | 5 ++--- 6 files changed, 21 insertions(+), 35 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index c8f1e872300..9307267b227 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1231,7 +1231,7 @@ def dtypes(self): string object dtype: object """ - return pd.Series(self._dtypes, dtype="object") + return pd.Series(dict(self._dtypes), dtype="object") @property def ndim(self) -> int: @@ -2834,7 +2834,7 @@ def reindex( return df._reindex( column_names=columns, - dtypes=self._dtypes, + dtypes=dict(self._dtypes), deep=copy, index=index, inplace=False, diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 7326696c994..af8886a44a6 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -79,18 +79,16 @@ def _num_rows(self) -> int: return self._data.nrows @property - def _column_names(self) -> Tuple[Any, ...]: # TODO: Tuple[str]? - return tuple(self._data.names) + def _column_names(self) -> Tuple[Any, ...]: + return self._data.names @property - def _columns(self) -> Tuple[Any, ...]: # TODO: Tuple[Column]? 
- return tuple(self._data.columns) + def _columns(self) -> Tuple[ColumnBase, ...]: + return self._data.columns @property - def _dtypes(self): - return dict( - zip(self._data.names, (col.dtype for col in self._data.columns)) - ) + def _dtypes(self) -> abc.Iterator: + return zip(self._data.names, (col.dtype for col in self._data.columns)) @property def ndim(self) -> int: @@ -1969,7 +1967,7 @@ def __dask_tokenize__(self): return [ type(self), - str(self._dtypes), + str(dict(self._dtypes)), normalize_token(self.to_pandas()), ] diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index ac8b381cbec..aa96051ea51 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -22,12 +22,7 @@ from cudf._lib.types import size_type_dtype from cudf._typing import AggType, DataFrameOrSeries, MultiColumnAggType from cudf.api.extensions import no_default -from cudf.api.types import ( - is_bool_dtype, - is_float_dtype, - is_list_like, - is_numeric_dtype, -) +from cudf.api.types import is_bool_dtype, is_list_like, is_numeric_dtype from cudf.core._compat import PANDAS_LT_300 from cudf.core.abc import Serializable from cudf.core.column.column import ColumnBase, StructDtype, as_column @@ -335,12 +330,8 @@ def dtypes(self): FutureWarning, ) index = self.grouping.keys.unique().sort_values().to_pandas() - obj_dtypes = self.obj._dtypes return pd.DataFrame( - { - name: [obj_dtypes[name]] * len(index) - for name in self.obj._data.names - }, + {name: [dtype] * len(index) for name, dtype in self.obj._dtypes}, index=index, ) @@ -499,8 +490,7 @@ def rank( # treats NaNs the way we treat nulls. if cudf.get_option("mode.pandas_compatible"): if any( - is_float_dtype(typ) - for typ in self.grouping.values._dtypes.values() + col.dtype.kind == "f" for col in self.grouping.values._columns ): raise NotImplementedError( "NaNs are not supported in groupby.rank." diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 688b268d478..ecfcec15337 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -891,7 +891,7 @@ def replace( ) = _get_replacement_values_for_columns( to_replace=to_replace, value=value, - columns_dtype_map=self._dtypes, + columns_dtype_map=dict(self._dtypes), ) for name, col in self._data.items(): @@ -6313,11 +6313,11 @@ def __dask_tokenize__(self): return [ type(self), - str(self._dtypes), + str(dict(self._dtypes)), *[ - normalize_token(cat.categories) - for cat in self._dtypes.values() - if cat == "category" + normalize_token(col.dtype.categories) + for col in self._columns + if col.dtype == "category" ], normalize_token(self.index), normalize_token(self.hash_values().values_host), diff --git a/python/cudf/cudf/io/csv.py b/python/cudf/cudf/io/csv.py index 3eeeac405b3..f07764e2ce4 100644 --- a/python/cudf/cudf/io/csv.py +++ b/python/cudf/cudf/io/csv.py @@ -132,10 +132,9 @@ def read_csv( # There exists some dtypes in the result columns that is inferred. # Find them and map them to the default dtypes. 
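# [Editor's note] For the read_csv/read_json hunks below: Frame._dtypes now
# yields (name, dtype) pairs lazily instead of returning a dict, so callers
# either iterate it directly, e.g.
#     {name: dtype for name, dtype in df._dtypes if name not in specified}
# or wrap it as dict(self._dtypes) where a real mapping is still required.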
specified_dtypes = {} if dtype is None else dtype - df_dtypes = df._dtypes unspecified_dtypes = { - name: df_dtypes[name] - for name in df._column_names + name: dtype + for name, dtype in df._dtypes if name not in specified_dtypes } default_dtypes = {} diff --git a/python/cudf/cudf/io/json.py b/python/cudf/cudf/io/json.py index dd4a0d9eb07..fc3387d5117 100644 --- a/python/cudf/cudf/io/json.py +++ b/python/cudf/cudf/io/json.py @@ -147,10 +147,9 @@ def read_json( # There exists some dtypes in the result columns that is inferred. # Find them and map them to the default dtypes. specified_dtypes = {} if dtype is True else dtype - df_dtypes = df._dtypes unspecified_dtypes = { - name: df_dtypes[name] - for name in df._column_names + name: dtype + for name, dtype in df._dtypes if name not in specified_dtypes } default_dtypes = {} From 20aa4442d27ca858796c7890ad0542dbaee542e1 Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Wed, 5 Jun 2024 15:25:51 -0400 Subject: [PATCH 048/340] DOC: Add documentation for cudf.pandas in the Developer Guide (#15889) This PR provides documentation for cudf.pandas in the Developer Guide. It will describe the fast-slow proxy wrapping scheme as well as document the `CUDF_PANDAS_DEBUGGING` environment variable created in PR #15837 for issue #14975. Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/15889 --- .../source/developer_guide/cudf_pandas.md | 121 ++++++++++++++++++ docs/cudf/source/developer_guide/index.md | 1 + 2 files changed, 122 insertions(+) create mode 100644 docs/cudf/source/developer_guide/cudf_pandas.md diff --git a/docs/cudf/source/developer_guide/cudf_pandas.md b/docs/cudf/source/developer_guide/cudf_pandas.md new file mode 100644 index 00000000000..aeb43f66b2d --- /dev/null +++ b/docs/cudf/source/developer_guide/cudf_pandas.md @@ -0,0 +1,121 @@ +# cudf.pandas +The use of the cuDF pandas accelerator mode (`cudf.pandas`) is explained [in the user guide](../cudf_pandas/index.rst). +The purpose of this document is to explain how the fast-slow proxy mechanism works and document internal environment variables that can be used to debug `cudf.pandas` itself. + +## fast-slow proxy mechanism +`cudf.pandas` works by wrapping each Pandas type and its corresponding cuDF type in a new proxy type also known as a fast-slow proxy type. +The purpose of proxy types is to attempt computations on the fast (cuDF) object first, and then fall back to running on the slow (Pandas) object if the fast version fails. + +### Types: +#### Wrapped Types and Proxy Types +The "wrapped" types/classes are the Pandas and cuDF specific types that have been wrapped into proxy types. +Wrapped objects and proxy objects are instances of wrapped types and proxy types, respectively. +In the snippet below `s1` and `s2` are wrapped objects and `s3` is a fast-slow proxy object. +Also note that the module `xpd` is a wrapped module and contains cuDF and Pandas modules as attributes. + ```python + import cudf.pandas + cudf.pandas.install() + import pandas as xpd + + cudf = xpd._fsproxy_fast + pd = xpd._fsproxy_slow + + s1 = cudf.Series([1,2]) + s2 = pd.Series([1,2]) + s3 = xpd.Series([1,2]) + ``` + +```{note} +Note that users should never have to interact with the wrapped objects directly in this way. +This code is purely for demonstrative purposes. 
+``` + +#### The Different Kinds of Proxy Types +In `cudf.pandas`, there are two main kinds of proxy types: final types and intermediate types. + +##### Final and Intermediate Proxy Types +Final types are types for which known operations exist for converting an object of a "fast" type to a "slow" type and vice versa. +For example, `cudf.DataFrame` can be converted to Pandas using the method `to_pandas`, and `pd.DataFrame` can be converted to cuDF using the function `cudf.from_pandas`. +Intermediate types are the types of the results of operations invoked on final types. +For example, `xpd.DataFrameGroupBy` is an intermediate type that will be created during a groupby operation on the final type `xpd.DataFrame`. + +##### Attributes and Callable Proxy Types +Final proxy types are typically classes or modules, both of which have attributes. +Classes also have methods. +These attributes and methods must be wrapped as well to support the fast-slow proxy scheme. + +#### Creating New Proxy Types +`_FinalProxy` and `_IntermediateProxy` types are created using the functions `make_final_proxy_type` and `make_intermediate_proxy` type, respectively. +Creating a new final type looks like this. + +```python +DataFrame = make_final_proxy_type( + "DataFrame", + cudf.DataFrame, + pd.DataFrame, + fast_to_slow=lambda fast: fast.to_pandas(), + slow_to_fast=cudf.from_pandas, +) +``` + +### The Fallback Mechanism +Proxied calls are implemented with fallback via [`_fast_slow_function_call`](https://github.com/rapidsai/cudf/blob/57aeeb78d85e169ac18b82f51d2b1cbd01b0608d/python/cudf/cudf/pandas/fast_slow_proxy.py#L869). This implements the mechanism by which we attempt operations the fast way (using cuDF) and then fall back to the slow way (using Pandas) on failure. +The function looks like this: +```python +def _fast_slow_function_call(func: Callable, *args, **kwargs): + try: + ... + fast_args, fast_kwargs = _fast_arg(args), _fast_arg(kwargs) + result = func(*fast_args, **fast_kwargs) + ... + except Exception: + ... + slow_args, slow_kwargs = _slow_arg(args), _slow_arg(kwargs) + result = func(*slow_args, **slow_kwargs) + ... + return _maybe_wrap_result(result, func, *args, **kwargs), fast +``` +As we can see the function attempts to call `func` the fast way using cuDF and if any `Exception` occurs, it calls the function using Pandas. +In essence, this `try-except` is what allows `cudf.pandas` to support the bulk of the Pandas API. + +At the end, the function wraps the result from either path in a fast-slow proxy object, if necessary. + +#### Converting Proxy Objects +Note that before the `func` is called, the proxy object and its attributes need to be converted to either their cuDF or Pandas implementations. +This conversion is handled in the function `_transform_arg` which both `_fast_arg` and `_slow_arg` call. + +`_transform_arg` is a recursive function that will call itself depending on the type or argument passed to it (eg. `_transform_arg` is called for each element in a list of arguments). + +### Using Metaclasses +`cudf.pandas` uses a [metaclass](https://docs.python.org/3/glossary.html#term-metaclass) called (`_FastSlowProxyMeta`) to find class attributes and classmethods of fast-slow proxy types. +For example, in the snippet below, the `xpd.Series` type is an instance of `_FastSlowProxyMeta`. +Therefore we can access the property `_fsproxy_fast` defined in the metaclass. 
+```python +import cudf.pandas +cudf.pandas.install() +import pandas as xpd + +print(xpd.Series._fsproxy_fast) # output is cudf.core.series.Series +``` + +## debugging `cudf.pandas` +Several environment variables are available for debugging purposes. + +Setting the environment variable `CUDF_PANDAS_DEBUGGING` produces a warning when the results from cuDF and Pandas differ from one another. +For example, the snippet below produces the warning below. +```python +import cudf.pandas +cudf.pandas.install() +import pandas as pd +import numpy as np + +setattr(pd.Series.mean, "_fsproxy_slow", lambda self, *args, **kwargs: np.float64(1)) +s = pd.Series([1,2,3]) +s.mean() +``` +``` +UserWarning: The results from cudf and pandas were different. The exception was +Arrays are not almost equal to 7 decimals + ACTUAL: 1.0 + DESIRED: 2.0. +``` diff --git a/docs/cudf/source/developer_guide/index.md b/docs/cudf/source/developer_guide/index.md index 5cafa8f784c..5e099631fc5 100644 --- a/docs/cudf/source/developer_guide/index.md +++ b/docs/cudf/source/developer_guide/index.md @@ -27,4 +27,5 @@ testing benchmarking options pylibcudf +cudf_pandas ``` From d91380ef393e9156c34a078998041a6affca7923 Mon Sep 17 00:00:00 2001 From: Robert Maynard Date: Wed, 5 Jun 2024 21:16:29 -0400 Subject: [PATCH 049/340] Allow tests to be built when stream util is disabled (#15933) Allows cudf to be built with `BUILD_SHARED_LIBS=OFF`, `CUDA_STATIC_RUNTIME=ON` and tests enabled Authors: - Robert Maynard (https://github.com/robertmaynard) Approvers: - Bradley Dice (https://github.com/bdice) - Nghia Truong (https://github.com/ttnghia) - Gera Shegalov (https://github.com/gerashegalov) - Mike Wilson (https://github.com/hyperbolic2346) URL: https://github.com/rapidsai/cudf/pull/15933 --- cpp/tests/CMakeLists.txt | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 2f2c12f265c..a0d9083c4a4 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -68,12 +68,14 @@ function(ConfigureTest CMAKE_TEST_NAME) INSTALL_COMPONENT_SET testing ) - set_tests_properties( - ${CMAKE_TEST_NAME} - PROPERTIES - ENVIRONMENT - "GTEST_CUDF_STREAM_MODE=new_${_CUDF_TEST_STREAM_MODE}_default;LD_PRELOAD=$" - ) + if(CUDF_BUILD_STREAMS_TEST_UTIL) + set_tests_properties( + ${CMAKE_TEST_NAME} + PROPERTIES + ENVIRONMENT + "GTEST_CUDF_STREAM_MODE=new_${_CUDF_TEST_STREAM_MODE}_default;LD_PRELOAD=$" + ) + endif() endfunction() # ################################################################################################## @@ -401,14 +403,10 @@ ConfigureTest(SPAN_TEST utilities_tests/span_tests.cu) ConfigureTest(SPAN_TEST_DEVICE_VECTOR utilities_tests/span_tests.cu) # Overwrite the environments set by ConfigureTest -set_tests_properties( - SPAN_TEST - PROPERTIES - ENVIRONMENT - "GTEST_FILTER=-${_allowlist_filter};GTEST_CUDF_STREAM_MODE=new_cudf_default;LD_PRELOAD=$" -) -set_tests_properties( - SPAN_TEST_DEVICE_VECTOR PROPERTIES ENVIRONMENT "GTEST_FILTER=${_allowlist_filter}" +set_property( + TEST SPAN_TEST SPAN_TEST_DEVICE_VECTOR + APPEND + PROPERTY ENVIRONMENT "GTEST_FILTER=-${_allowlist_filter}" ) # ################################################################################################## @@ -671,9 +669,11 @@ target_include_directories(JIT_PARSER_TEST PRIVATE "$ Date: Wed, 5 Jun 2024 20:48:10 -0500 Subject: [PATCH 050/340] Migrate strings `contains` operations to `pylibcudf` (#15880) This PR creates pylibcudf strings `contains` APIs and 
migrates the cuDF cython to leverage them. Part of https://github.com/rapidsai/cudf/issues/15162. Authors: - https://github.com/brandon-b-miller Approvers: - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/15880 --- .../api_docs/pylibcudf/strings/contains.rst | 6 ++ .../api_docs/pylibcudf/strings/index.rst | 1 + .../pylibcudf/libcudf/strings/CMakeLists.txt | 2 +- .../pylibcudf/libcudf/strings/regex_flags.pxd | 13 +++-- .../pylibcudf/libcudf/strings/regex_flags.pyx | 0 .../_lib/pylibcudf/strings/CMakeLists.txt | 4 +- .../cudf/_lib/pylibcudf/strings/__init__.pxd | 11 +++- .../cudf/_lib/pylibcudf/strings/__init__.py | 11 +++- .../cudf/_lib/pylibcudf/strings/contains.pxd | 7 +++ .../cudf/_lib/pylibcudf/strings/contains.pyx | 41 ++++++++++++++ .../_lib/pylibcudf/strings/regex_flags.pxd | 2 + .../_lib/pylibcudf/strings/regex_flags.pyx | 4 ++ .../_lib/pylibcudf/strings/regex_program.pxd | 10 ++++ .../_lib/pylibcudf/strings/regex_program.pyx | 37 +++++++++++++ python/cudf/cudf/_lib/strings/contains.pyx | 23 +++----- .../pylibcudf_tests/test_regex_program.py | 13 +++++ .../pylibcudf_tests/test_string_contains.py | 55 +++++++++++++++++++ 17 files changed, 215 insertions(+), 25 deletions(-) create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/strings/contains.rst create mode 100644 python/cudf/cudf/_lib/pylibcudf/libcudf/strings/regex_flags.pyx create mode 100644 python/cudf/cudf/_lib/pylibcudf/strings/contains.pxd create mode 100644 python/cudf/cudf/_lib/pylibcudf/strings/contains.pyx create mode 100644 python/cudf/cudf/_lib/pylibcudf/strings/regex_flags.pxd create mode 100644 python/cudf/cudf/_lib/pylibcudf/strings/regex_flags.pyx create mode 100644 python/cudf/cudf/_lib/pylibcudf/strings/regex_program.pxd create mode 100644 python/cudf/cudf/_lib/pylibcudf/strings/regex_program.pyx create mode 100644 python/cudf/cudf/pylibcudf_tests/test_regex_program.py create mode 100644 python/cudf/cudf/pylibcudf_tests/test_string_contains.py diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/contains.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/contains.rst new file mode 100644 index 00000000000..e5745331bc7 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/contains.rst @@ -0,0 +1,6 @@ +======== +contains +======== + +.. automodule:: cudf._lib.pylibcudf.strings.contains + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst index 8970fc80c0b..bfaef732555 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst @@ -4,4 +4,5 @@ strings .. toctree:: :maxdepth: 1 + contains replace diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/CMakeLists.txt index 930c22781d0..bd6e2e0af02 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/CMakeLists.txt +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/CMakeLists.txt @@ -12,7 +12,7 @@ # the License. 
# ============================================================================= -set(cython_sources char_types.pyx) +set(cython_sources char_types.pyx regex_flags.pyx) set(linked_libraries cudf::cudf) diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/regex_flags.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/regex_flags.pxd index 2a5701fa6a3..41617f157b7 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/regex_flags.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/regex_flags.pxd @@ -1,9 +1,12 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. + +from libc.stdint cimport int32_t + cdef extern from "cudf/strings/regex/flags.hpp" \ namespace "cudf::strings" nogil: - ctypedef enum regex_flags: - DEFAULT 'cudf::strings::regex_flags::DEFAULT' - MULTILINE 'cudf::strings::regex_flags::MULTILINE' - DOTALL 'cudf::strings::regex_flags::DOTALL' + cpdef enum class regex_flags(int32_t): + DEFAULT + MULTILINE + DOTALL diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/regex_flags.pyx b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/regex_flags.pyx new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt index c9a983e24f4..cb7f71b1912 100644 --- a/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt +++ b/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt @@ -12,7 +12,9 @@ # the License. # ============================================================================= -set(cython_sources capitalize.pyx case.pyx char_types.pyx find.pyx replace.pyx) +set(cython_sources capitalize.pyx case.pyx char_types.pyx contains.pyx find.pyx regex_flags.pyx + regex_program.pyx replace.pyx +) set(linked_libraries cudf::cudf) rapids_cython_create_modules( diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd index 7563df8a107..959aa94737d 100644 --- a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd @@ -1,3 +1,12 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from . cimport capitalize, case, char_types, find, replace +from . cimport ( + capitalize, + case, + char_types, + contains, + find, + regex_flags, + regex_program, + replace, +) diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py index cb4f0e38f97..b7384913286 100644 --- a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py +++ b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py @@ -1,3 +1,12 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from . import capitalize, case, char_types, find, replace +from . import ( + capitalize, + case, + char_types, + contains, + find, + regex_flags, + regex_program, + replace, +) diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/contains.pxd b/python/cudf/cudf/_lib/pylibcudf/strings/contains.pxd new file mode 100644 index 00000000000..275aa95d97e --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/strings/contains.pxd @@ -0,0 +1,7 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +from cudf._lib.pylibcudf.column cimport Column +from cudf._lib.pylibcudf.strings.regex_program cimport RegexProgram + + +cpdef Column contains_re(Column input, RegexProgram prog) diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/contains.pyx b/python/cudf/cudf/_lib/pylibcudf/strings/contains.pyx new file mode 100644 index 00000000000..8c598b7c953 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/strings/contains.pyx @@ -0,0 +1,41 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move + +from cudf._lib.pylibcudf.column cimport Column +from cudf._lib.pylibcudf.libcudf.column.column cimport column +from cudf._lib.pylibcudf.libcudf.strings cimport contains as cpp_contains +from cudf._lib.pylibcudf.strings.regex_program cimport RegexProgram + + +cpdef Column contains_re( + Column input, + RegexProgram prog +): + """Returns a boolean column identifying rows which match the given + regex_program object. + + For details, see :cpp:func:`cudf::strings::contains_re`. + + Parameters + ---------- + input : Column + The input strings + prog : RegexProgram + Regex program instance + + Returns + ------- + pylibcudf.Column + New column of boolean results for each string + """ + + cdef unique_ptr[column] result + + with nogil: + result = cpp_contains.contains_re( + input.view(), + prog.c_obj.get()[0] + ) + + return Column.from_libcudf(move(result)) diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/regex_flags.pxd b/python/cudf/cudf/_lib/pylibcudf/strings/regex_flags.pxd new file mode 100644 index 00000000000..79937bf574a --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/strings/regex_flags.pxd @@ -0,0 +1,2 @@ +# Copyright (c) 2020-2024, NVIDIA CORPORATION. +from cudf._lib.pylibcudf.libcudf.strings.regex_flags cimport regex_flags diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/regex_flags.pyx b/python/cudf/cudf/_lib/pylibcudf/strings/regex_flags.pyx new file mode 100644 index 00000000000..903c2ddd503 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/strings/regex_flags.pyx @@ -0,0 +1,4 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from cudf._lib.pylibcudf.libcudf.strings.regex_flags import \ + regex_flags as RegexFlags # no-cython-lint diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/regex_program.pxd b/python/cudf/cudf/_lib/pylibcudf/strings/regex_program.pxd new file mode 100644 index 00000000000..61ed268fb2d --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/strings/regex_program.pxd @@ -0,0 +1,10 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.string cimport string + +from cudf._lib.pylibcudf.libcudf.strings.regex_program cimport regex_program + + +cdef class RegexProgram: + cdef unique_ptr[regex_program] c_obj diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/regex_program.pyx b/python/cudf/cudf/_lib/pylibcudf/strings/regex_program.pyx new file mode 100644 index 00000000000..d605b0aba02 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/strings/regex_program.pyx @@ -0,0 +1,37 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ + +from libcpp.memory cimport unique_ptr +from libcpp.string cimport string +from libcpp.utility cimport move + +from cudf._lib.pylibcudf.libcudf.strings.regex_flags cimport regex_flags +from cudf._lib.pylibcudf.libcudf.strings.regex_program cimport regex_program + +from cudf._lib.pylibcudf.strings.regex_flags import RegexFlags +from cudf._lib.pylibcudf.strings.regex_flags cimport regex_flags + + +cdef class RegexProgram: + + def __init__(self, *args, **kwargs): + raise ValueError("Do not instantiate RegexProgram directly, use create") + + @staticmethod + def create(str pattern, int flags): + cdef unique_ptr[regex_program] c_prog + cdef regex_flags c_flags + cdef string c_pattern = pattern.encode() + + cdef RegexProgram ret = RegexProgram.__new__(RegexProgram) + if isinstance(flags, object): + if isinstance(flags, (int, RegexFlags)): + c_flags = flags + with nogil: + c_prog = regex_program.create(c_pattern, c_flags) + + ret.c_obj = move(c_prog) + else: + raise ValueError("flags must be of type RegexFlags") + + return ret diff --git a/python/cudf/cudf/_lib/strings/contains.pyx b/python/cudf/cudf/_lib/strings/contains.pyx index 087acd8062d..502a1d14696 100644 --- a/python/cudf/cudf/_lib/strings/contains.pyx +++ b/python/cudf/cudf/_lib/strings/contains.pyx @@ -14,7 +14,6 @@ from cudf._lib.pylibcudf.libcudf.column.column cimport column from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar from cudf._lib.pylibcudf.libcudf.strings.contains cimport ( - contains_re as cpp_contains_re, count_re as cpp_count_re, like as cpp_like, matches_re as cpp_matches_re, @@ -23,6 +22,9 @@ from cudf._lib.pylibcudf.libcudf.strings.regex_flags cimport regex_flags from cudf._lib.pylibcudf.libcudf.strings.regex_program cimport regex_program from cudf._lib.scalar cimport DeviceScalar +from cudf._lib.pylibcudf.strings import contains +from cudf._lib.pylibcudf.strings.regex_program import RegexProgram + @acquire_spill_lock() def contains_re(Column source_strings, object reg_ex, uint32_t flags): @@ -30,21 +32,10 @@ def contains_re(Column source_strings, object reg_ex, uint32_t flags): Returns a Column of boolean values with True for `source_strings` that contain regular expression `reg_ex`. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - cdef string reg_ex_string = str(reg_ex).encode() - cdef regex_flags c_flags = flags - cdef unique_ptr[regex_program] c_prog - - with nogil: - c_prog = move(regex_program.create(reg_ex_string, c_flags)) - c_result = move(cpp_contains_re( - source_view, - dereference(c_prog) - )) - - return Column.from_unique_ptr(move(c_result)) + prog = RegexProgram.create(str(reg_ex), flags) + return Column.from_pylibcudf( + contains.contains_re(source_strings.to_pylibcudf(mode="read"), prog) + ) @acquire_spill_lock() diff --git a/python/cudf/cudf/pylibcudf_tests/test_regex_program.py b/python/cudf/cudf/pylibcudf_tests/test_regex_program.py new file mode 100644 index 00000000000..3a9bcec3616 --- /dev/null +++ b/python/cudf/cudf/pylibcudf_tests/test_regex_program.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +import pytest + +import cudf._lib.pylibcudf as plc + + +@pytest.mark.parametrize("pat", ["(", "*", "\\"]) +def test_regex_program_invalid(pat): + with pytest.raises(RuntimeError): + plc.strings.regex_program.RegexProgram.create( + pat, plc.strings.regex_flags.RegexFlags.DEFAULT + ) diff --git a/python/cudf/cudf/pylibcudf_tests/test_string_contains.py b/python/cudf/cudf/pylibcudf_tests/test_string_contains.py new file mode 100644 index 00000000000..8cdb6f7c521 --- /dev/null +++ b/python/cudf/cudf/pylibcudf_tests/test_string_contains.py @@ -0,0 +1,55 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pyarrow as pa +import pytest +from utils import assert_column_eq + +import cudf._lib.pylibcudf as plc + + +@pytest.fixture(scope="module") +def pa_target_col(): + return pa.array( + ["AbC", "de", "FGHI", "j", "kLm", "nOPq", None, "RsT", None, "uVw"] + ) + + +@pytest.fixture(scope="module") +def plc_target_col(pa_target_col): + return plc.interop.from_arrow(pa_target_col) + + +@pytest.fixture( + params=[ + "A", + "de", + ".*", + "^a", + "^A", + "[^a-z]", + "[a-z]{3,}", + "^[A-Z]{2,}", + "j|u", + ], + scope="module", +) +def pa_target_scalar(request): + return pa.scalar(request.param, type=pa.string()) + + +@pytest.fixture(scope="module") +def plc_target_pat(pa_target_scalar): + prog = plc.strings.regex_program.RegexProgram.create( + pa_target_scalar.as_py(), plc.strings.regex_flags.RegexFlags.DEFAULT + ) + return prog + + +def test_contains_re( + pa_target_col, plc_target_col, pa_target_scalar, plc_target_pat +): + got = plc.strings.contains.contains_re(plc_target_col, plc_target_pat) + expected = pa.compute.match_substring_regex( + pa_target_col, pa_target_scalar.as_py() + ) + assert_column_eq(got, expected) From 3b734ec2fd591f037fe1d8f8ce424c7049cb5a3e Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Thu, 6 Jun 2024 04:41:01 -0700 Subject: [PATCH 051/340] Start migrating I/O to pylibcudf (#15899) xref #15162 Starts migrating cudf I/O cython to use pylibcudf APIs, starting with avro. 
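
For reference, the new call pattern looks roughly like the sketch below. It mirrors the tests added in this patch; the in-memory avro payload built with `fastavro`, and the record/field names, are purely illustrative:

```python
import io

import fastavro

import cudf._lib.pylibcudf as plc

# Build a tiny avro payload in memory (illustrative only).
schema = fastavro.parse_schema(
    {"type": "record", "name": "t", "fields": [{"name": "x", "type": "long"}]}
)
buf = io.BytesIO()
fastavro.writer(buf, schema, [{"x": 1}, {"x": 2}])
buf.seek(0)

# SourceInfo wraps the source(s); read_avro returns a TableWithMetadata
# carrying both the columns and their names.
tbl_w_meta = plc.io.avro.read_avro(
    plc.io.types.SourceInfo([buf]), columns=["x"], skip_rows=0, num_rows=-1
)
assert tbl_w_meta.column_names == ["x"]
```
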
Authors: - Thomas Li (https://github.com/lithomas1) - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/15899 --- .../user_guide/api_docs/pylibcudf/index.rst | 1 + .../user_guide/api_docs/pylibcudf/io/avro.rst | 6 + .../api_docs/pylibcudf/io/index.rst | 18 +++ python/cudf/cudf/_lib/avro.pyx | 50 ++----- python/cudf/cudf/_lib/csv.pyx | 8 +- python/cudf/cudf/_lib/parquet.pyx | 2 +- .../cudf/cudf/_lib/pylibcudf/CMakeLists.txt | 1 + .../cudf/_lib/pylibcudf/io/CMakeLists.txt | 25 ++++ .../cudf/cudf/_lib/pylibcudf/io/__init__.pxd | 4 + .../cudf/cudf/_lib/pylibcudf/io/__init__.py | 4 + python/cudf/cudf/_lib/pylibcudf/io/avro.pxd | 12 ++ python/cudf/cudf/_lib/pylibcudf/io/avro.pyx | 58 +++++++++ python/cudf/cudf/_lib/pylibcudf/io/types.pxd | 29 +++++ python/cudf/cudf/_lib/pylibcudf/io/types.pyx | 110 ++++++++++++++++ .../cudf/_lib/pylibcudf/libcudf/io/orc.pxd | 6 +- .../cudf/_lib/pylibcudf/libcudf/io/types.pxd | 58 ++++----- python/cudf/cudf/_lib/utils.pxd | 1 + python/cudf/cudf/_lib/utils.pyx | 11 ++ .../cudf/cudf/pylibcudf_tests/common/utils.py | 17 +++ python/cudf/cudf/pylibcudf_tests/test_avro.py | 123 ++++++++++++++++++ .../cudf/pylibcudf_tests/test_source_info.py | 69 ++++++++++ 21 files changed, 541 insertions(+), 72 deletions(-) create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/io/avro.rst create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst create mode 100644 python/cudf/cudf/_lib/pylibcudf/io/CMakeLists.txt create mode 100644 python/cudf/cudf/_lib/pylibcudf/io/__init__.pxd create mode 100644 python/cudf/cudf/_lib/pylibcudf/io/__init__.py create mode 100644 python/cudf/cudf/_lib/pylibcudf/io/avro.pxd create mode 100644 python/cudf/cudf/_lib/pylibcudf/io/avro.pyx create mode 100644 python/cudf/cudf/_lib/pylibcudf/io/types.pxd create mode 100644 python/cudf/cudf/_lib/pylibcudf/io/types.pyx create mode 100644 python/cudf/cudf/pylibcudf_tests/test_avro.py create mode 100644 python/cudf/cudf/pylibcudf_tests/test_source_info.py diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst index b6ad1157511..870ed8856d1 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst @@ -17,6 +17,7 @@ This page provides API documentation for pylibcudf. filling gpumemoryview groupby + io/index.rst join lists merge diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/avro.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/avro.rst new file mode 100644 index 00000000000..495bd505fdc --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/avro.rst @@ -0,0 +1,6 @@ +==== +Avro +==== + +.. automodule:: cudf._lib.pylibcudf.io.avro + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst new file mode 100644 index 00000000000..0d53ac92db9 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst @@ -0,0 +1,18 @@ +=== +I/O +=== + +I/O Utility Classes +=================== + +.. automodule:: cudf._lib.pylibcudf.io.types + :members: + + +I/O Functions +============= + +.. 
toctree:: + :maxdepth: 1 + + avro diff --git a/python/cudf/cudf/_lib/avro.pyx b/python/cudf/cudf/_lib/avro.pyx index ae17a5f1ab6..3c132b22880 100644 --- a/python/cudf/cudf/_lib/avro.pyx +++ b/python/cudf/cudf/_lib/avro.pyx @@ -1,20 +1,12 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. -from libcpp.string cimport string -from libcpp.utility cimport move -from libcpp.vector cimport vector +from cudf._lib.utils cimport data_from_pylibcudf_io -from cudf._lib.io.utils cimport make_source_info -from cudf._lib.pylibcudf.libcudf.io.avro cimport ( - avro_reader_options, - read_avro as libcudf_read_avro, -) -from cudf._lib.pylibcudf.libcudf.io.types cimport table_with_metadata -from cudf._lib.pylibcudf.libcudf.types cimport size_type -from cudf._lib.utils cimport data_from_unique_ptr +import cudf._lib.pylibcudf as plc +from cudf._lib.pylibcudf.io.types import SourceInfo -cpdef read_avro(datasource, columns=None, skip_rows=-1, num_rows=-1): +cpdef read_avro(datasource, columns=None, skip_rows=0, num_rows=-1): """ Cython function to call libcudf read_avro, see `read_avro`. @@ -28,28 +20,14 @@ cpdef read_avro(datasource, columns=None, skip_rows=-1, num_rows=-1): if not isinstance(num_rows, int) or num_rows < -1: raise TypeError("num_rows must be an int >= -1") - if not isinstance(skip_rows, int) or skip_rows < -1: - raise TypeError("skip_rows must be an int >= -1") - - cdef vector[string] c_columns - if columns is not None and len(columns) > 0: - c_columns.reserve(len(columns)) - for col in columns: - c_columns.push_back(str(col).encode()) - - cdef avro_reader_options options = move( - avro_reader_options.builder(make_source_info([datasource])) - .columns(c_columns) - .skip_rows( skip_rows) - .num_rows( num_rows) - .build() + if not isinstance(skip_rows, int) or skip_rows < 0: + raise TypeError("skip_rows must be an int >= 0") + + return data_from_pylibcudf_io( + plc.io.avro.read_avro( + SourceInfo([datasource]), + columns, + skip_rows, + num_rows + ) ) - - cdef table_with_metadata c_result - - with nogil: - c_result = move(libcudf_read_avro(options)) - - names = [info.name.decode() for info in c_result.metadata.schema_info] - - return data_from_unique_ptr(move(c_result.tbl), column_names=names) diff --git a/python/cudf/cudf/_lib/csv.pyx b/python/cudf/cudf/_lib/csv.pyx index aa771295607..0b0bbdb2589 100644 --- a/python/cudf/cudf/_lib/csv.pyx +++ b/python/cudf/cudf/_lib/csv.pyx @@ -151,14 +151,14 @@ cdef csv_reader_options make_csv_reader_options( ) if quoting == 1: - c_quoting = quote_style.QUOTE_ALL + c_quoting = quote_style.ALL elif quoting == 2: - c_quoting = quote_style.QUOTE_NONNUMERIC + c_quoting = quote_style.NONNUMERIC elif quoting == 3: - c_quoting = quote_style.QUOTE_NONE + c_quoting = quote_style.NONE else: # Default value - c_quoting = quote_style.QUOTE_MINIMAL + c_quoting = quote_style.MINIMAL cdef csv_reader_options csv_reader_options_c = move( csv_reader_options.builder(c_source_info) diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index f0eef9be124..ac592cedaac 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -491,7 +491,7 @@ def write_parquet( "Valid values are '1.0' and '2.0'" ) - dict_policy = ( + cdef cudf_io_types.dictionary_policy dict_policy = ( cudf_io_types.dictionary_policy.ADAPTIVE if use_dictionary else cudf_io_types.dictionary_policy.NEVER diff --git a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt index 7d0676f6def..6beb7b0f506 100644 --- 
a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt +++ b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt @@ -50,3 +50,4 @@ link_to_pyarrow_headers(pylibcudf_interop) add_subdirectory(libcudf) add_subdirectory(strings) +add_subdirectory(io) diff --git a/python/cudf/cudf/_lib/pylibcudf/io/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/io/CMakeLists.txt new file mode 100644 index 00000000000..2cfec101bab --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/io/CMakeLists.txt @@ -0,0 +1,25 @@ +# ============================================================================= +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= + +set(cython_sources avro.pyx types.pyx) + +set(linked_libraries cudf::cudf) +rapids_cython_create_modules( + CXX + SOURCE_FILES "${cython_sources}" + LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_io_ ASSOCIATED_TARGETS cudf +) + +set(targets_using_arrow_headers pylibcudf_io_avro pylibcudf_io_types) +link_to_pyarrow_headers("${targets_using_arrow_headers}") diff --git a/python/cudf/cudf/_lib/pylibcudf/io/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/io/__init__.pxd new file mode 100644 index 00000000000..250292746c1 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/io/__init__.pxd @@ -0,0 +1,4 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from . cimport avro, types +from .types cimport SourceInfo, TableWithMetadata diff --git a/python/cudf/cudf/_lib/pylibcudf/io/__init__.py b/python/cudf/cudf/_lib/pylibcudf/io/__init__.py new file mode 100644 index 00000000000..5242c741911 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/io/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from . import avro, types +from .types import SourceInfo, TableWithMetadata diff --git a/python/cudf/cudf/_lib/pylibcudf/io/avro.pxd b/python/cudf/cudf/_lib/pylibcudf/io/avro.pxd new file mode 100644 index 00000000000..3695f36a6e7 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/io/avro.pxd @@ -0,0 +1,12 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from cudf._lib.pylibcudf.io.types cimport SourceInfo, TableWithMetadata +from cudf._lib.pylibcudf.libcudf.io.avro cimport avro_reader_options +from cudf._lib.pylibcudf.libcudf.types cimport size_type + + +cpdef TableWithMetadata read_avro( + SourceInfo source_info, + list columns = *, + size_type skip_rows = *, + size_type num_rows = * +) diff --git a/python/cudf/cudf/_lib/pylibcudf/io/avro.pyx b/python/cudf/cudf/_lib/pylibcudf/io/avro.pyx new file mode 100644 index 00000000000..946e0896fc8 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/io/avro.pyx @@ -0,0 +1,58 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +from libcpp.string cimport string +from libcpp.utility cimport move +from libcpp.vector cimport vector + +from cudf._lib.pylibcudf.io.types cimport SourceInfo, TableWithMetadata +from cudf._lib.pylibcudf.libcudf.io.avro cimport ( + avro_reader_options, + read_avro as cpp_read_avro, +) +from cudf._lib.pylibcudf.libcudf.types cimport size_type + + +cpdef TableWithMetadata read_avro( + SourceInfo source_info, + list columns = None, + size_type skip_rows = 0, + size_type num_rows = -1 +): + """ + Reads an Avro dataset into a set of columns. + + Parameters + ---------- + source_info: SourceInfo + The SourceInfo object to read the avro dataset from. + columns: list, default None + Optional columns to read, if not provided, reads all columns in the file. + skip_rows: size_type, default 0 + The number of rows to skip. + num_rows: size_type, default -1 + The number of rows to read, after skipping rows. + If -1 is passed, all rows will be read. + + Returns + ------- + TableWithMetadata + The Table and its corresponding metadata that was read in. + """ + cdef vector[string] c_columns + if columns is not None and len(columns) > 0: + c_columns.reserve(len(columns)) + for col in columns: + c_columns.push_back(str(col).encode()) + + cdef avro_reader_options avro_opts = move( + avro_reader_options.builder(source_info.c_obj) + .columns(c_columns) + .skip_rows(skip_rows) + .num_rows(num_rows) + .build() + ) + + with nogil: + c_result = move(cpp_read_avro(avro_opts)) + + return TableWithMetadata.from_libcudf(c_result) diff --git a/python/cudf/cudf/_lib/pylibcudf/io/types.pxd b/python/cudf/cudf/_lib/pylibcudf/io/types.pxd new file mode 100644 index 00000000000..aa846a47343 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/io/types.pxd @@ -0,0 +1,29 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from cudf._lib.pylibcudf.libcudf.io.types cimport ( + column_encoding, + column_in_metadata, + column_name_info, + compression_type, + dictionary_policy, + io_type, + partition_info, + quote_style, + sink_info, + source_info, + statistics_freq, + table_input_metadata, + table_metadata, + table_with_metadata, +) +from cudf._lib.pylibcudf.table cimport Table + + +cdef class TableWithMetadata: + cdef public Table tbl + cdef table_metadata metadata + + @staticmethod + cdef TableWithMetadata from_libcudf(table_with_metadata& tbl) + +cdef class SourceInfo: + cdef source_info c_obj diff --git a/python/cudf/cudf/_lib/pylibcudf/io/types.pyx b/python/cudf/cudf/_lib/pylibcudf/io/types.pyx new file mode 100644 index 00000000000..cd777232b33 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/io/types.pyx @@ -0,0 +1,110 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.string cimport string +from libcpp.utility cimport move +from libcpp.vector cimport vector + +from cudf._lib.pylibcudf.libcudf.io.types cimport ( + host_buffer, + source_info, + table_with_metadata, +) + +import errno +import io +import os + + +cdef class TableWithMetadata: + """A container holding a table and its associated metadata + (e.g. column names) + + For details, see :cpp:class:`cudf::io::table_with_metadata`. 
+ """ + + @property + def columns(self): + """ + Return a list containing the columns of the table + """ + return self.tbl.columns() + + @property + def column_names(self): + """ + Return a list containing the column names of the table + """ + cdef list names = [] + for col_info in self.metadata.schema_info: + # TODO: Handle nesting (columns with child columns) + assert col_info.children.size() == 0, "Child column names are not handled!" + names.append(col_info.name.decode()) + return names + + @staticmethod + cdef TableWithMetadata from_libcudf(table_with_metadata& tbl_with_meta): + """Create a Python TableWithMetadata from a libcudf table_with_metadata""" + cdef TableWithMetadata out = TableWithMetadata.__new__(TableWithMetadata) + out.tbl = Table.from_libcudf(move(tbl_with_meta.tbl)) + out.metadata = tbl_with_meta.metadata + return out + +cdef class SourceInfo: + """A class containing details on a source to read from. + + For details, see :cpp:class:`cudf::io::source_info`. + + Parameters + ---------- + sources : List[Union[str, os.PathLike, bytes, io.BytesIO]] + A homogeneous list of sources (this can be a string filename, + an os.PathLike, bytes, or an io.BytesIO) to read from. + + Mixing different types of sources will raise a `ValueError`. + """ + + def __init__(self, list sources): + if not sources: + raise ValueError("Need to pass at least one source") + + cdef vector[string] c_files + + if isinstance(sources[0], (os.PathLike, str)): + c_files.reserve(len(sources)) + + for src in sources: + if not isinstance(src, (os.PathLike, str)): + raise ValueError("All sources must be of the same type!") + if not os.path.isfile(src): + raise FileNotFoundError(errno.ENOENT, + os.strerror(errno.ENOENT), + src) + + c_files.push_back( str(src).encode()) + + self.c_obj = move(source_info(c_files)) + return + + # TODO: host_buffer is deprecated API, use host_span instead + cdef vector[host_buffer] c_host_buffers + cdef const unsigned char[::1] c_buffer + cdef bint empty_buffer = False + if isinstance(sources[0], bytes): + empty_buffer = True + for buffer in sources: + if not isinstance(buffer, bytes): + raise ValueError("All sources must be of the same type!") + if (len(buffer) > 0): + c_buffer = buffer + c_host_buffers.push_back(host_buffer(&c_buffer[0], + c_buffer.shape[0])) + empty_buffer = False + elif isinstance(sources[0], io.BytesIO): + for bio in sources: + if not isinstance(bio, io.BytesIO): + raise ValueError("All sources must be of the same type!") + c_buffer = bio.getbuffer() # check if empty? 
+ c_host_buffers.push_back(host_buffer(&c_buffer[0], + c_buffer.shape[0])) + + self.c_obj = source_info(c_host_buffers) diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/orc.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/orc.pxd index e553515dfdf..25f91849dea 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/orc.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/orc.pxd @@ -94,7 +94,9 @@ cdef extern from "cudf/io/orc.hpp" \ orc_writer_options_builder& compression( cudf_io_types.compression_type comp ) except + - orc_writer_options_builder& enable_statistics(bool val) except + + orc_writer_options_builder& enable_statistics( + cudf_io_types.statistics_freq val + ) except + orc_writer_options_builder& stripe_size_bytes(size_t val) except + orc_writer_options_builder& stripe_size_rows(size_type val) except + orc_writer_options_builder& row_index_stride(size_type val) except + @@ -147,7 +149,7 @@ cdef extern from "cudf/io/orc.hpp" \ cudf_io_types.compression_type comp ) except + chunked_orc_writer_options_builder& enable_statistics( - bool val + cudf_io_types.statistics_freq val ) except + orc_writer_options_builder& stripe_size_bytes(size_t val) except + orc_writer_options_builder& stripe_size_rows(size_type val) except + diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/types.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/types.pxd index 38fae1df1e5..8d87deb1472 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/types.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/types.pxd @@ -20,45 +20,45 @@ from cudf._lib.pylibcudf.libcudf.types cimport size_type cdef extern from "cudf/io/types.hpp" \ namespace "cudf::io" nogil: - ctypedef enum quote_style: - QUOTE_MINIMAL "cudf::io::quote_style::MINIMAL" - QUOTE_ALL "cudf::io::quote_style::ALL" - QUOTE_NONNUMERIC "cudf::io::quote_style::NONNUMERIC" - QUOTE_NONE "cudf::io::quote_style::NONE" - - ctypedef enum compression_type: - NONE "cudf::io::compression_type::NONE" - AUTO "cudf::io::compression_type::AUTO" - SNAPPY "cudf::io::compression_type::SNAPPY" - GZIP "cudf::io::compression_type::GZIP" - BZIP2 "cudf::io::compression_type::BZIP2" - BROTLI "cudf::io::compression_type::BROTLI" - ZIP "cudf::io::compression_type::ZIP" - XZ "cudf::io::compression_type::XZ" - ZLIB "cudf::io::compression_type::ZLIB" - LZ4 "cudf::io::compression_type::LZ4" - LZO "cudf::io::compression_type::LZO" - ZSTD "cudf::io::compression_type::ZSTD" - - ctypedef enum io_type: - FILEPATH "cudf::io::io_type::FILEPATH" - HOST_BUFFER "cudf::io::io_type::HOST_BUFFER" - VOID "cudf::io::io_type::VOID" - USER_IMPLEMENTED "cudf::io::io_type::USER_IMPLEMENTED" - - ctypedef enum statistics_freq: + cpdef enum class quote_style(int32_t): + MINIMAL + ALL + NONNUMERIC + NONE + + cpdef enum class compression_type(int32_t): + NONE + AUTO + SNAPPY + GZIP + BZIP2 + BROTLI + ZIP + XZ + ZLIB + LZ4 + LZO + ZSTD + + cpdef enum class io_type(int32_t): + FILEPATH + HOST_BUFFER + VOID + USER_IMPLEMENTED + + cpdef enum class statistics_freq(int32_t): STATISTICS_NONE = 0, STATISTICS_ROWGROUP = 1, STATISTICS_PAGE = 2, STATISTICS_COLUMN = 3, - ctypedef enum dictionary_policy: + cpdef enum class dictionary_policy(int32_t): NEVER = 0, ADAPTIVE = 1, ALWAYS = 2, cdef extern from "cudf/io/types.hpp" namespace "cudf::io" nogil: - cpdef enum class column_encoding: + cpdef enum class column_encoding(int32_t): USE_DEFAULT = -1 DICTIONARY = 0 PLAIN = 1 diff --git a/python/cudf/cudf/_lib/utils.pxd b/python/cudf/cudf/_lib/utils.pxd index c5a1e7552b9..99850d549a1 100644 --- 
a/python/cudf/cudf/_lib/utils.pxd +++ b/python/cudf/cudf/_lib/utils.pxd @@ -11,6 +11,7 @@ from cudf._lib.pylibcudf.libcudf.table.table cimport table, table_view cdef data_from_unique_ptr( unique_ptr[table] c_tbl, column_names, index_names=*) cdef data_from_pylibcudf_table(tbl, column_names, index_names=*) +cdef data_from_pylibcudf_io(tbl_with_meta) cdef data_from_table_view( table_view tv, object owner, object column_names, object index_names=*) cdef table_view table_view_from_columns(columns) except * diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx index 4c4cd48d6ed..de6b9f690b6 100644 --- a/python/cudf/cudf/_lib/utils.pyx +++ b/python/cudf/cudf/_lib/utils.pyx @@ -315,6 +315,17 @@ cdef data_from_pylibcudf_table(tbl, column_names, index_names=None): index_names ) +cdef data_from_pylibcudf_io(tbl_with_meta): + """ + Unpacks the TableWithMetadata from libcudf I/O + into a dict of columns and an Index (cuDF format) + """ + return _data_from_columns( + columns=[Column.from_pylibcudf(plc) for plc in tbl_with_meta.columns], + column_names=tbl_with_meta.column_names, + index_names=None + ) + cdef columns_from_table_view( table_view tv, object owners, diff --git a/python/cudf/cudf/pylibcudf_tests/common/utils.py b/python/cudf/cudf/pylibcudf_tests/common/utils.py index e00053529a8..54d38f1a8cf 100644 --- a/python/cudf/cudf/pylibcudf_tests/common/utils.py +++ b/python/cudf/cudf/pylibcudf_tests/common/utils.py @@ -63,6 +63,23 @@ def assert_table_eq(pa_table: pa.Table, plc_table: plc.Table) -> None: assert_column_eq(pa_col, plc_col) +def assert_table_and_meta_eq( + plc_table_w_meta: plc.io.types.TableWithMetadata, pa_table: pa.Table +) -> None: + """Verify that the pylibcudf TableWithMetadata and PyArrow table are equal""" + + plc_table = plc_table_w_meta.tbl + + plc_shape = (plc_table.num_rows(), plc_table.num_columns()) + assert plc_shape == pa_table.shape + + for plc_col, pa_col in zip(plc_table.columns(), pa_table.columns): + assert_column_eq(plc_col, pa_col) + + # Check column name equality + assert plc_table_w_meta.column_names == pa_table.column_names + + def cudf_raises(expected_exception: BaseException, *args, **kwargs): # A simple wrapper around pytest.raises that defaults to looking for cudf exceptions match = kwargs.get("match", None) diff --git a/python/cudf/cudf/pylibcudf_tests/test_avro.py b/python/cudf/cudf/pylibcudf_tests/test_avro.py new file mode 100644 index 00000000000..d6cd86768cd --- /dev/null +++ b/python/cudf/cudf/pylibcudf_tests/test_avro.py @@ -0,0 +1,123 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +import io +import itertools + +import fastavro +import pyarrow as pa +import pytest +from utils import assert_table_and_meta_eq + +import cudf._lib.pylibcudf as plc + +avro_dtype_pairs = [ + ("boolean", pa.bool_()), + ("int", pa.int32()), + ("long", pa.int64()), + ("float", pa.float32()), + ("double", pa.float64()), + ("bytes", pa.string()), + ("string", pa.string()), +] + + +@pytest.fixture( + scope="module", params=itertools.combinations(avro_dtype_pairs, 2) +) +def avro_dtypes(request): + return request.param + + +@pytest.fixture +def avro_dtype_data(avro_dtypes): + (avro_type1, _), (avro_type2, _) = avro_dtypes + + def _get_data(avro_type): + if avro_type == "boolean": + return [True, False, True] + elif avro_type in {"int", "long"}: + return [1, 2, -1] + elif avro_type in {"float", "double"}: + return [1.0, 3.1415, -3.1415] + elif avro_type == "bytes": + return [b"a", b"b", b"c"] + elif avro_type == "string": + return ["Hello", "World!", ""] + + return _get_data(avro_type1), _get_data(avro_type2) + + +@pytest.fixture( + params=[ + (0, 0), + (0, -1), + (1, -1), + (3, -1), + ] +) +def row_opts(request): + """ + (skip_rows, num_rows) combos for the avro reader + """ + return request.param + + +@pytest.mark.parametrize("columns", [["prop1"], [], ["prop1", "prop2"]]) +@pytest.mark.parametrize("nullable", [True, False]) +def test_read_avro(avro_dtypes, avro_dtype_data, row_opts, columns, nullable): + (avro_type1, expected_type1), (avro_type2, expected_type2) = avro_dtypes + + avro_type1 = avro_type1 if not nullable else ["null", avro_type1] + avro_type2 = avro_type2 if not nullable else ["null", avro_type2] + + skip_rows, num_rows = row_opts + + schema = fastavro.parse_schema( + { + "type": "record", + "name": "test", + "fields": [ + {"name": "prop1", "type": avro_type1}, + {"name": "prop2", "type": avro_type2}, + ], + } + ) + + if nullable: + avro_dtype_data = ( + avro_dtype_data[0] + [None], + avro_dtype_data[1] + [None], + ) + + records = [ + {"prop1": val1, "prop2": val2} for val1, val2 in zip(*avro_dtype_data) + ] + + buffer = io.BytesIO() + fastavro.writer(buffer, schema, records) + buffer.seek(0) + + res = plc.io.avro.read_avro( + plc.io.types.SourceInfo([buffer]), + columns=columns, + skip_rows=skip_rows, + num_rows=num_rows, + ) + + expected = pa.Table.from_arrays( + [ + pa.array(avro_dtype_data[0], type=expected_type1), + pa.array(avro_dtype_data[1], type=expected_type2), + ], + names=["prop1", "prop2"], + ) + + # Adjust for skip_rows/num_rows in result + length = num_rows if num_rows != -1 else None + expected = expected.slice(skip_rows, length=length) + + # adjust for # of columns + if columns != []: + expected = expected.select(columns) + + assert_table_and_meta_eq(res, expected) diff --git a/python/cudf/cudf/pylibcudf_tests/test_source_info.py b/python/cudf/cudf/pylibcudf_tests/test_source_info.py new file mode 100644 index 00000000000..71a3ecbcc30 --- /dev/null +++ b/python/cudf/cudf/pylibcudf_tests/test_source_info.py @@ -0,0 +1,69 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+
+import io
+
+import pytest
+
+import cudf._lib.pylibcudf as plc
+
+
+@pytest.mark.parametrize(
+    "source", ["a.txt", b"hello world", io.BytesIO(b"hello world")]
+)
+def test_source_info_ctor(source, tmp_path):
+    if isinstance(source, str):
+        file = tmp_path / source
+        file.write_bytes("hello world".encode("utf-8"))
+        source = str(file)
+
+    plc.io.SourceInfo([source])
+
+    # TODO: test contents of source_info buffer is correct
+    # once buffers are exposed on python side
+
+
+@pytest.mark.parametrize(
+    "sources",
+    [
+        ["a.txt", "a.txt"],
+        [b"hello world", b"hello there"],
+        [io.BytesIO(b"hello world"), io.BytesIO(b"hello there")],
+    ],
+)
+def test_source_info_ctor_multiple(sources, tmp_path):
+    for i in range(len(sources)):
+        source = sources[i]
+        if isinstance(source, str):
+            file = tmp_path / source
+            file.write_bytes("hello world".encode("utf-8"))
+            sources[i] = str(file)
+
+    plc.io.SourceInfo(sources)
+
+    # TODO: test contents of source_info buffer is correct
+    # once buffers are exposed on python side
+
+
+@pytest.mark.parametrize(
+    "sources",
+    [
+        ["awef.txt", b"hello world", io.BytesIO(b"hello world")],
+        [b"hello world", b"hello there", "awef.txt"],
+        [
+            io.BytesIO(b"hello world"),
+            io.BytesIO(b"hello there"),
+            b"hello world",
+        ],
+    ],
+)
+def test_source_info_ctor_mixing_invalid(sources, tmp_path):
+    # Create any file sources so that the ValueError comes from
+    # mixing source types, not from a missing file
+    for i in range(len(sources)):
+        source = sources[i]
+        if isinstance(source, str):
+            file = tmp_path / source
+            file.write_bytes("hello world".encode("utf-8"))
+            sources[i] = str(file)
+    with pytest.raises(ValueError):
+        plc.io.SourceInfo(sources)

From d1e511edc88deb7604bed71b2689d72da0aed19a Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell
Date: Thu, 6 Jun 2024 15:19:06 +0100
Subject: [PATCH 052/340] Introduce `NamedColumn` concept in cudf-polars
 (#15914)

Simplify name tracking in expression evaluation by only requiring names
for columns when putting them into a `DataFrame`.

At the same time, this allows us to have one place where we
broadcast-expand `Scalar`s to the size of the `DataFrame`, so we can
expunge tracking them in the `DataFrame` itself.

Additionally, adapt to minor changes on the polars side in terms of
translating the DSL: we no longer need to handle CSE expressions
specially, and sorting by multiple keys takes a list of `descending`
flags, rather than a single bool as previously.
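
As a minimal sketch of the resulting API (the pyarrow-backed input
column here is only for illustration):

```python
import pyarrow as pa

import cudf._lib.pylibcudf as plc
from cudf_polars.containers import DataFrame, NamedColumn

# Build an illustrative device column via pylibcudf interop.
plc_col = plc.interop.from_arrow(pa.array([1, 2, 3], type=pa.int64()))

col = NamedColumn(plc_col, "a")   # the name now lives on the column itself
df = DataFrame([col])             # so the frame takes no separate name list
renamed = col.copy(new_name="b")  # shallow copy, keeping sortedness metadata
```
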
Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - Thomas Li (https://github.com/lithomas1) - Matthew Roeschke (https://github.com/mroeschke) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/15914 --- .../cudf_polars/containers/__init__.py | 4 +- .../cudf_polars/containers/column.py | 78 ++++-- .../cudf_polars/containers/dataframe.py | 59 ++--- python/cudf_polars/cudf_polars/dsl/expr.py | 239 +++++++++++------- python/cudf_polars/cudf_polars/dsl/ir.py | 176 ++++++++----- .../cudf_polars/cudf_polars/dsl/translate.py | 106 +++++--- .../cudf_polars/testing/asserts.py | 6 +- .../cudf_polars/cudf_polars/utils/dtypes.py | 3 +- .../cudf_polars/cudf_polars/utils/sorting.py | 12 +- python/cudf_polars/docs/overview.md | 101 +++++++- .../cudf_polars/tests/expressions/test_agg.py | 6 +- python/cudf_polars/tests/test_select.py | 21 ++ python/cudf_polars/tests/test_union.py | 5 - 13 files changed, 541 insertions(+), 275 deletions(-) diff --git a/python/cudf_polars/cudf_polars/containers/__init__.py b/python/cudf_polars/cudf_polars/containers/__init__.py index ef9d9ca61b6..ee69e748eb5 100644 --- a/python/cudf_polars/cudf_polars/containers/__init__.py +++ b/python/cudf_polars/cudf_polars/containers/__init__.py @@ -5,8 +5,8 @@ from __future__ import annotations -__all__: list[str] = ["DataFrame", "Column", "Scalar"] +__all__: list[str] = ["DataFrame", "Column", "NamedColumn", "Scalar"] -from cudf_polars.containers.column import Column +from cudf_polars.containers.column import Column, NamedColumn from cudf_polars.containers.dataframe import DataFrame from cudf_polars.containers.scalar import Scalar diff --git a/python/cudf_polars/cudf_polars/containers/column.py b/python/cudf_polars/cudf_polars/containers/column.py index 49034b5f5c8..575d15d3ece 100644 --- a/python/cudf_polars/cudf_polars/containers/column.py +++ b/python/cudf_polars/cudf_polars/containers/column.py @@ -13,24 +13,29 @@ if TYPE_CHECKING: from typing_extensions import Self -__all__: list[str] = ["Column"] +__all__: list[str] = ["Column", "NamedColumn"] class Column: - """A column, a name, and sortedness.""" + """A column with sortedness metadata.""" obj: plc.Column - name: str is_sorted: plc.types.Sorted order: plc.types.Order null_order: plc.types.NullOrder - def __init__(self, column: plc.Column, name: str): + def __init__( + self, + column: plc.Column, + *, + is_sorted: plc.types.Sorted = plc.types.Sorted.NO, + order: plc.types.Order = plc.types.Order.ASCENDING, + null_order: plc.types.NullOrder = plc.types.NullOrder.BEFORE, + ): self.obj = column - self.name = name - self.is_sorted = plc.types.Sorted.NO - self.order = plc.types.Order.ASCENDING - self.null_order = plc.types.NullOrder.BEFORE + self.is_sorted = is_sorted + self.order = order + self.null_order = null_order def sorted_like(self, like: Column, /) -> Self: """ @@ -81,22 +86,20 @@ def set_sorted( self.null_order = null_order return self - def copy(self, *, new_name: str | None = None) -> Self: + def copy(self) -> Self: """ - Return a shallow copy of the column. - - Parameters - ---------- - new_name - Optional new name for the copied column. + A shallow copy of the column. Returns ------- New column sharing data with self. 
""" return type(self)( - self.obj, self.name if new_name is None else new_name - ).sorted_like(self) + self.obj, + is_sorted=self.is_sorted, + order=self.order, + null_order=self.null_order, + ) def mask_nans(self) -> Self: """Return a copy of self with nans masked out.""" @@ -117,3 +120,44 @@ def nan_count(self) -> int: plc.DataType(plc.TypeId.INT32), ) ).as_py() + + +class NamedColumn(Column): + """A column with a name.""" + + name: str + + def __init__( + self, + column: plc.Column, + name: str, + *, + is_sorted: plc.types.Sorted = plc.types.Sorted.NO, + order: plc.types.Order = plc.types.Order.ASCENDING, + null_order: plc.types.NullOrder = plc.types.NullOrder.BEFORE, + ) -> None: + super().__init__( + column, is_sorted=is_sorted, order=order, null_order=null_order + ) + self.name = name + + def copy(self, *, new_name: str | None = None) -> Self: + """ + A shallow copy of the column. + + Parameters + ---------- + new_name + Optional new name for the copied column. + + Returns + ------- + New column sharing data with self. + """ + return type(self)( + self.obj, + self.name if new_name is None else new_name, + is_sorted=self.is_sorted, + order=self.order, + null_order=self.null_order, + ) diff --git a/python/cudf_polars/cudf_polars/containers/dataframe.py b/python/cudf_polars/cudf_polars/containers/dataframe.py index de21a280020..eeaf181be0c 100644 --- a/python/cudf_polars/cudf_polars/containers/dataframe.py +++ b/python/cudf_polars/cudf_polars/containers/dataframe.py @@ -12,7 +12,7 @@ import cudf._lib.pylibcudf as plc -from cudf_polars.containers.column import Column +from cudf_polars.containers.column import NamedColumn if TYPE_CHECKING: from collections.abc import Mapping, Sequence, Set @@ -21,7 +21,7 @@ import cudf - from cudf_polars.containers.scalar import Scalar + from cudf_polars.containers import Column __all__: list[str] = ["DataFrame"] @@ -30,26 +30,20 @@ class DataFrame: """A representation of a dataframe.""" - columns: list[Column] - scalars: list[Scalar] + columns: list[NamedColumn] table: plc.Table | None - def __init__(self, columns: Sequence[Column], scalars: Sequence[Scalar]) -> None: + def __init__(self, columns: Sequence[NamedColumn]) -> None: self.columns = list(columns) self._column_map = {c.name: c for c in self.columns} - self.scalars = list(scalars) - if len(scalars) == 0: - self.table = plc.Table([c.obj for c in columns]) - else: - self.table = None + self.table = plc.Table([c.obj for c in columns]) def copy(self) -> Self: """Return a shallow copy of self.""" - return type(self)(self.columns, self.scalars) + return type(self)(self.columns) def to_polars(self) -> pl.DataFrame: """Convert to a polars DataFrame.""" - assert len(self.scalars) == 0 return pl.from_arrow( plc.interop.to_arrow( self.table, @@ -83,8 +77,10 @@ def num_rows(self) -> int: def from_cudf(cls, df: cudf.DataFrame) -> Self: """Create from a cudf dataframe.""" return cls( - [Column(c.to_pylibcudf(mode="read"), name) for name, c in df._data.items()], - [], + [ + NamedColumn(c.to_pylibcudf(mode="read"), name) + for name, c in df._data.items() + ] ) @classmethod @@ -105,13 +101,16 @@ def from_table(cls, table: plc.Table, names: Sequence[str]) -> Self: Raises ------ - ValueError if the number of provided names does not match the - number of columns in the table. + ValueError + If the number of provided names does not match the + number of columns in the table. 
""" - # TODO: strict=True when we drop py39 if table.num_columns() != len(names): raise ValueError("Mismatching name and table length.") - return cls([Column(c, name) for c, name in zip(table.columns(), names)], []) + return cls( + # TODO: strict=True when we drop py39 + [NamedColumn(c, name) for c, name in zip(table.columns(), names)] + ) def sorted_like( self, like: DataFrame, /, *, subset: Set[str] | None = None @@ -132,18 +131,20 @@ def sorted_like( Raises ------ - ValueError if there is a name mismatch between self and like. + ValueError + If there is a name mismatch between self and like. """ if like.column_names != self.column_names: raise ValueError("Can only copy from identically named frame") subset = self.column_names_set if subset is None else subset self.columns = [ c.sorted_like(other) if c.name in subset else c + # TODO: strict=True when we drop py39 for c, other in zip(self.columns, like.columns) ] return self - def with_columns(self, columns: Sequence[Column]) -> Self: + def with_columns(self, columns: Sequence[NamedColumn]) -> Self: """ Return a new dataframe with extra columns. @@ -160,35 +161,31 @@ def with_columns(self, columns: Sequence[Column]) -> Self: ----- If column names overlap, newer names replace older ones. """ - return type(self)([*self.columns, *columns], self.scalars) + return type(self)([*self.columns, *columns]) def discard_columns(self, names: Set[str]) -> Self: """Drop columns by name.""" - return type(self)( - [c for c in self.columns if c.name not in names], self.scalars - ) + return type(self)([c for c in self.columns if c.name not in names]) def select(self, names: Sequence[str]) -> Self: """Select columns by name returning DataFrame.""" want = set(names) if not want.issubset(self.column_names_set): raise ValueError("Can't select missing names") - return type(self)([self._column_map[name] for name in names], self.scalars) + return type(self)([self._column_map[name] for name in names]) - def replace_columns(self, *columns: Column) -> Self: + def replace_columns(self, *columns: NamedColumn) -> Self: """Return a new dataframe with columns replaced by name.""" new = {c.name: c for c in columns} if not set(new).issubset(self.column_names_set): raise ValueError("Cannot replace with non-existing names") - return type(self)([new.get(c.name, c) for c in self.columns], self.scalars) + return type(self)([new.get(c.name, c) for c in self.columns]) def rename_columns(self, mapping: Mapping[str, str]) -> Self: """Rename some columns.""" - return type(self)( - [c.copy(new_name=mapping.get(c.name)) for c in self.columns], self.scalars - ) + return type(self)([c.copy(new_name=mapping.get(c.name)) for c in self.columns]) - def select_columns(self, names: Set[str]) -> list[Column]: + def select_columns(self, names: Set[str]) -> list[NamedColumn]: """Select columns by name.""" return [c for c in self.columns if c.name in names] diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index 7187a36f21c..c7c11cf6c68 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -26,11 +26,11 @@ import cudf._lib.pylibcudf as plc -from cudf_polars.containers import Column, Scalar +from cudf_polars.containers import Column, NamedColumn, Scalar from cudf_polars.utils import sorting if TYPE_CHECKING: - from collections.abc import Sequence + from collections.abc import Mapping, Sequence import polars.type_aliases as pl_types @@ -110,7 +110,7 @@ def get_hash(self) -> int: """ return 
hash((type(self), self._ctor_arguments(self.children))) - def __hash__(self): + def __hash__(self) -> int: """Hash of an expression with caching.""" try: return self._hash_value @@ -139,18 +139,18 @@ def is_equal(self, other: Any) -> bool: other.children ) - def __eq__(self, other): + def __eq__(self, other) -> bool: """Equality of expressions.""" if type(self) != type(other) or hash(self) != hash(other): return False else: return self.is_equal(other) - def __ne__(self, other): + def __ne__(self, other) -> bool: """Inequality of expressions.""" return not self.__eq__(other) - def __repr__(self): + def __repr__(self) -> str: """String representation of an expression with caching.""" try: return self._repr_value @@ -164,7 +164,7 @@ def do_evaluate( df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME, - mapping: dict[Expr, Column] | None = None, + mapping: Mapping[Expr, Column] | None = None, ) -> Column: # TODO: return type is a lie for Literal """ Evaluate this expression given a dataframe for context. @@ -185,15 +185,6 @@ def do_evaluate( Do not call this function directly, but rather :meth:`evaluate` which handles the mapping lookups. - The typed return value of :class:`Column` is not true when - evaluating :class:`Literal` nodes (which instead produce - :class:`Scalar` objects). However, these duck-type to having a - pylibcudf container object inside them, and usually they end - up appearing in binary expressions which pylibcudf handles - appropriately since there are overloads for (column, scalar) - pairs. We don't have to handle (scalar, scalar) in binops - since the polars optimizer has a constant-folding pass. - Returns ------- Column representing the evaluation of the expression (or maybe @@ -201,9 +192,10 @@ def do_evaluate( Raises ------ - NotImplementedError if we couldn't evaluate the expression. - Ideally all these are returned during translation to the IR, - but for now we are not perfect. + NotImplementedError + If we couldn't evaluate the expression. Ideally all these + are returned during translation to the IR, but for now we + are not perfect. """ raise NotImplementedError(f"Evaluation of {type(self).__name__}") @@ -212,7 +204,7 @@ def evaluate( df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME, - mapping: dict[Expr, Column] | None = None, + mapping: Mapping[Expr, Column] | None = None, ) -> Column: # TODO: return type is a lie for Literal """ Evaluate this expression given a dataframe for context. @@ -234,16 +226,26 @@ def evaluate( this method provides logic to handle lookups in the substitution mapping. + The typed return value of :class:`Column` is not true when + evaluating :class:`Literal` nodes (which instead produce + :class:`Scalar` objects). However, these duck-type to having a + pylibcudf container object inside them, and usually they end + up appearing in binary expressions which pylibcudf handles + appropriately since there are overloads for (column, scalar) + pairs. We don't have to handle (scalar, scalar) in binops + since the polars optimizer has a constant-folding pass. + Returns ------- Column representing the evaluation of the expression (or maybe - a scalar, annoying!). + a scalar). Raises ------ - NotImplementedError if we couldn't evaluate the expression. - Ideally all these are returned during translation to the IR, - but for now we are not perfect. + NotImplementedError + If we couldn't evaluate the expression. Ideally all these + are returned during translation to the IR, but for now we + are not perfect. 
""" if mapping is None: return self.do_evaluate(df, context=context, mapping=mapping) @@ -269,41 +271,74 @@ def collect_agg(self, *, depth: int) -> AggInfo: Raises ------ - NotImplementedError if we can't currently perform the - aggregation request (for example nested aggregations like - ``a.max().min()``). + NotImplementedError + If we can't currently perform the aggregation request, for + example nested aggregations like ``a.max().min()``. """ raise NotImplementedError( f"Collecting aggregation info for {type(self).__name__}" ) -class NamedExpr(Expr): - __slots__ = ("name", "children") - _non_child = ("dtype", "name") +class NamedExpr: + # NamedExpr does not inherit from Expr since it does not appear + # when evaluating expressions themselves, only when constructing + # named return values in dataframe (IR) nodes. + __slots__ = ("name", "value") - def __init__(self, dtype: plc.DataType, name: str, value: Expr) -> None: - super().__init__(dtype) + def __init__(self, name: str, value: Expr) -> None: self.name = name - self.children = (value,) + self.value = value + + def __hash__(self) -> int: + """Hash of the expression.""" + return hash((type(self), self.name, self.value)) + + def __repr__(self) -> str: + """Repr of the expression.""" + return f"NamedExpr({self.name}, {self.value}" + + def __eq__(self, other) -> bool: + """Equality of two expressions.""" + return ( + type(self) is type(other) + and self.name == other.name + and self.value == other.value + ) - def do_evaluate( + def __ne__(self, other) -> bool: + """Inequality of expressions.""" + return not self.__eq__(other) + + def evaluate( self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME, - mapping: dict[Expr, Column] | None = None, - ) -> Column: + mapping: Mapping[Expr, Column] | None = None, + ) -> NamedColumn: """Evaluate this expression given a dataframe for context.""" - (child,) = self.children - return Column( - child.evaluate(df, context=context, mapping=mapping).obj, self.name - ) + obj = self.value.evaluate(df, context=context, mapping=mapping) + if isinstance(obj, Scalar): + return NamedColumn( + plc.Column.from_scalar(obj.obj, 1), + self.name, + is_sorted=plc.types.Sorted.YES, + order=plc.types.Order.ASCENDING, + null_order=plc.types.NullOrder.BEFORE, + ) + else: + return NamedColumn( + obj.obj, + self.name, + is_sorted=obj.is_sorted, + order=obj.order, + null_order=obj.null_order, + ) def collect_agg(self, *, depth: int) -> AggInfo: """Collect information about aggregations in groupbys.""" - (value,) = self.children - return value.collect_agg(depth=depth) + return self.value.collect_agg(depth=depth) class Literal(Expr): @@ -311,21 +346,21 @@ class Literal(Expr): _non_child = ("dtype", "value") value: pa.Scalar - def __init__(self, dtype: plc.DataType, value: Any) -> None: + def __init__(self, dtype: plc.DataType, value: pa.Scalar) -> None: super().__init__(dtype) - self.value = pa.scalar(value) + assert value.type == plc.interop.to_arrow(dtype) + self.value = value def do_evaluate( self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME, - mapping: dict[Expr, Column] | None = None, + mapping: Mapping[Expr, Column] | None = None, ) -> Column: """Evaluate this expression given a dataframe for context.""" - # TODO: obey dtype - obj = plc.interop.from_arrow(self.value) - return Scalar(obj) # type: ignore + # datatype of pyarrow scalar is correct by construction. 
+ return Scalar(plc.interop.from_arrow(self.value)) # type: ignore class Col(Expr): @@ -342,7 +377,7 @@ def do_evaluate( df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME, - mapping: dict[Expr, Column] | None = None, + mapping: Mapping[Expr, Column] | None = None, ) -> Column: """Evaluate this expression given a dataframe for context.""" return df._column_map[self.name] @@ -358,7 +393,7 @@ def do_evaluate( df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME, - mapping: dict[Expr, Column] | None = None, + mapping: Mapping[Expr, Column] | None = None, ) -> Column: """Evaluate this expression given a dataframe for context.""" # TODO: type is wrong, and dtype @@ -415,8 +450,7 @@ def _distinct( [source_value], indices, plc.Table([plc.Column.from_scalar(target_value, table.num_rows())]), - ).columns()[0], - column.name, + ).columns()[0] ) _BETWEEN_OPS: ClassVar[ @@ -448,7 +482,7 @@ def do_evaluate( df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME, - mapping: dict[Expr, Column] | None = None, + mapping: Mapping[Expr, Column] | None = None, ) -> Column: """Evaluate this expression given a dataframe for context.""" columns = [ @@ -467,18 +501,18 @@ def do_evaluate( ) if self.name == pl_expr.BooleanFunction.IsNull: (column,) = columns - return Column(plc.unary.is_null(column.obj), column.name) + return Column(plc.unary.is_null(column.obj)) elif self.name == pl_expr.BooleanFunction.IsNotNull: (column,) = columns - return Column(plc.unary.is_valid(column.obj), column.name) + return Column(plc.unary.is_valid(column.obj)) elif self.name == pl_expr.BooleanFunction.IsNan: # TODO: copy over null mask since is_nan(null) => null in polars (column,) = columns - return Column(plc.unary.is_nan(column.obj), column.name) + return Column(plc.unary.is_nan(column.obj)) elif self.name == pl_expr.BooleanFunction.IsNotNan: # TODO: copy over null mask since is_not_nan(null) => null in polars (column,) = columns - return Column(plc.unary.is_not_nan(column.obj), column.name) + return Column(plc.unary.is_not_nan(column.obj)) elif self.name == pl_expr.BooleanFunction.IsFirstDistinct: (column,) = columns return self._distinct( @@ -528,7 +562,6 @@ def do_evaluate( ), ) elif self.name == pl_expr.BooleanFunction.AllHorizontal: - name = columns[0].name if any(c.obj.null_count() > 0 for c in columns): raise NotImplementedError("Kleene logic for all_horizontal") return Column( @@ -539,11 +572,9 @@ def do_evaluate( output_type=self.dtype, ), (c.obj for c in columns), - ), - name, + ) ) elif self.name == pl_expr.BooleanFunction.AnyHorizontal: - name = columns[0].name if any(c.obj.null_count() > 0 for c in columns): raise NotImplementedError("Kleene logic for any_horizontal") return Column( @@ -554,8 +585,7 @@ def do_evaluate( output_type=self.dtype, ), (c.obj for c in columns), - ), - name, + ) ) elif self.name == pl_expr.BooleanFunction.IsBetween: column, lo, hi = columns @@ -571,8 +601,7 @@ def do_evaluate( ), plc.binaryop.BinaryOperator.LOGICAL_AND, self.dtype, - ), - column.name, + ) ) else: raise NotImplementedError(f"BooleanFunction {self.name}") @@ -606,7 +635,7 @@ def do_evaluate( df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME, - mapping: dict[Expr, Column] | None = None, + mapping: Mapping[Expr, Column] | None = None, ) -> Column: """Evaluate this expression given a dataframe for context.""" columns = [ @@ -615,20 +644,16 @@ def do_evaluate( ] if self.name == pl_expr.StringFunction.Lowercase: (column,) = columns - return 
Column(plc.strings.case.to_lower(column.obj), column.name) + return Column(plc.strings.case.to_lower(column.obj)) elif self.name == pl_expr.StringFunction.Uppercase: (column,) = columns - return Column(plc.strings.case.to_upper(column.obj), column.name) + return Column(plc.strings.case.to_upper(column.obj)) elif self.name == pl_expr.StringFunction.EndsWith: column, suffix = columns - return Column( - plc.strings.find.ends_with(column.obj, suffix.obj), column.name - ) + return Column(plc.strings.find.ends_with(column.obj, suffix.obj)) elif self.name == pl_expr.StringFunction.StartsWith: column, suffix = columns - return Column( - plc.strings.find.starts_with(column.obj, suffix.obj), column.name - ) + return Column(plc.strings.find.starts_with(column.obj, suffix.obj)) else: raise NotImplementedError(f"StringFunction {self.name}") @@ -649,19 +674,22 @@ def do_evaluate( df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME, - mapping: dict[Expr, Column] | None = None, + mapping: Mapping[Expr, Column] | None = None, ) -> Column: """Evaluate this expression given a dataframe for context.""" (child,) = self.children column = child.evaluate(df, context=context, mapping=mapping) (stable, nulls_last, descending) = self.options order, null_order = sorting.sort_order( - [descending], nulls_last=nulls_last, num_keys=1 + [descending], nulls_last=[nulls_last], num_keys=1 ) do_sort = plc.sorting.stable_sort if stable else plc.sorting.sort table = do_sort(plc.Table([column.obj]), order, null_order) - return Column(table.columns()[0], column.name).set_sorted( - is_sorted=plc.types.Sorted.YES, order=order[0], null_order=null_order[0] + return Column( + table.columns()[0], + is_sorted=plc.types.Sorted.YES, + order=order[0], + null_order=null_order[0], ) @@ -672,7 +700,7 @@ class SortBy(Expr): def __init__( self, dtype: plc.DataType, - options: tuple[bool, bool, tuple[bool]], + options: tuple[bool, tuple[bool], tuple[bool]], column: Expr, *by: Expr, ): @@ -685,7 +713,7 @@ def do_evaluate( df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME, - mapping: dict[Expr, Column] | None = None, + mapping: Mapping[Expr, Column] | None = None, ) -> Column: """Evaluate this expression given a dataframe for context.""" column, *by = ( @@ -700,7 +728,7 @@ def do_evaluate( table = do_sort( plc.Table([column.obj]), plc.Table([c.obj for c in by]), order, null_order ) - return Column(table.columns()[0], column.name) + return Column(table.columns()[0]) class Gather(Expr): @@ -716,7 +744,7 @@ def do_evaluate( df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME, - mapping: dict[Expr, Column] | None = None, + mapping: Mapping[Expr, Column] | None = None, ) -> Column: """Evaluate this expression given a dataframe for context.""" values, indices = ( @@ -741,7 +769,7 @@ def do_evaluate( bounds_policy = plc.copying.OutOfBoundsPolicy.DONT_CHECK obj = indices.obj table = plc.copying.gather(plc.Table([values.obj]), obj, bounds_policy) - return Column(table.columns()[0], values.name) + return Column(table.columns()[0]) class Filter(Expr): @@ -757,7 +785,7 @@ def do_evaluate( df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME, - mapping: dict[Expr, Column] | None = None, + mapping: Mapping[Expr, Column] | None = None, ) -> Column: """Evaluate this expression given a dataframe for context.""" values, mask = ( @@ -767,7 +795,7 @@ def do_evaluate( table = plc.stream_compaction.apply_boolean_mask( plc.Table([values.obj]), mask.obj ) - return Column(table.columns()[0], 
values.name).sorted_like(values) + return Column(table.columns()[0]).sorted_like(values) class RollingWindow(Expr): @@ -803,14 +831,12 @@ def do_evaluate( df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME, - mapping: dict[Expr, Column] | None = None, + mapping: Mapping[Expr, Column] | None = None, ) -> Column: """Evaluate this expression given a dataframe for context.""" (child,) = self.children column = child.evaluate(df, context=context, mapping=mapping) - return Column(plc.unary.cast(column.obj, self.dtype), column.name).sorted_like( - column - ) + return Column(plc.unary.cast(column.obj, self.dtype)).sorted_like(column) def collect_agg(self, *, depth: int) -> AggInfo: """Collect information about aggregations in groupbys.""" @@ -907,7 +933,9 @@ def _reduce( plc.reduce.reduce(column.obj, request, self.dtype), 1, ), - column.name, + is_sorted=plc.types.Sorted.YES, + order=plc.types.Order.ASCENDING, + null_order=plc.types.NullOrder.BEFORE, ) def _count(self, column: Column) -> Column: @@ -921,7 +949,9 @@ def _count(self, column: Column) -> Column: ), 1, ), - column.name, + is_sorted=plc.types.Sorted.YES, + order=plc.types.Order.ASCENDING, + null_order=plc.types.NullOrder.BEFORE, ) def _min(self, column: Column, *, propagate_nans: bool) -> Column: @@ -933,7 +963,9 @@ def _min(self, column: Column, *, propagate_nans: bool) -> Column: ), 1, ), - column.name, + is_sorted=plc.types.Sorted.YES, + order=plc.types.Order.ASCENDING, + null_order=plc.types.NullOrder.BEFORE, ) if column.nan_count > 0: column = column.mask_nans() @@ -948,25 +980,37 @@ def _max(self, column: Column, *, propagate_nans: bool) -> Column: ), 1, ), - column.name, + is_sorted=plc.types.Sorted.YES, + order=plc.types.Order.ASCENDING, + null_order=plc.types.NullOrder.BEFORE, ) if column.nan_count > 0: column = column.mask_nans() return self._reduce(column, request=plc.aggregation.max()) def _first(self, column: Column) -> Column: - return Column(plc.copying.slice(column.obj, [0, 1])[0], column.name) + return Column( + plc.copying.slice(column.obj, [0, 1])[0], + is_sorted=plc.types.Sorted.YES, + order=plc.types.Order.ASCENDING, + null_order=plc.types.NullOrder.BEFORE, + ) def _last(self, column: Column) -> Column: n = column.obj.size() - return Column(plc.copying.slice(column.obj, [n - 1, n])[0], column.name) + return Column( + plc.copying.slice(column.obj, [n - 1, n])[0], + is_sorted=plc.types.Sorted.YES, + order=plc.types.Order.ASCENDING, + null_order=plc.types.NullOrder.BEFORE, + ) def do_evaluate( self, df, *, context: ExecutionContext = ExecutionContext.FRAME, - mapping: dict[Expr, Column] | None = None, + mapping: Mapping[Expr, Column] | None = None, ) -> Column: """Evaluate this expression given a dataframe for context.""" if context is not ExecutionContext.FRAME: @@ -1018,7 +1062,7 @@ def do_evaluate( df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME, - mapping: dict[Expr, Column] | None = None, + mapping: Mapping[Expr, Column] | None = None, ) -> Column: """Evaluate this expression given a dataframe for context.""" left, right = ( @@ -1027,7 +1071,6 @@ def do_evaluate( ) return Column( plc.binaryop.binary_operation(left.obj, right.obj, self.op, self.dtype), - "what", ) def collect_agg(self, *, depth: int) -> AggInfo: diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index f8441b793b5..0a72cbd9f83 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -30,7 +30,7 @@ import 
cudf._lib.pylibcudf as plc import cudf_polars.dsl.expr as expr -from cudf_polars.containers import Column, DataFrame +from cudf_polars.containers import DataFrame, NamedColumn from cudf_polars.utils import sorting if TYPE_CHECKING: @@ -59,6 +59,38 @@ ] +def broadcast( + *columns: NamedColumn, target_length: int | None = None +) -> list[NamedColumn]: + lengths = {column.obj.size() for column in columns} + if len(lengths - {1}) > 1: + raise RuntimeError("Mismatching column lengths") + if lengths == {1}: + if target_length is None: + return list(columns) + nrows = target_length + elif len(lengths) == 1: + if target_length is not None: + assert target_length in lengths + return list(columns) + else: + (nrows,) = lengths - {1} + if target_length is not None: + assert target_length == nrows + return [ + column + if column.obj.size() != 1 + else NamedColumn( + plc.Column.from_scalar(plc.copying.get_element(column.obj, 0), nrows), + column.name, + is_sorted=plc.types.Sorted.YES, + order=plc.types.Order.ASCENDING, + null_order=plc.types.NullOrder.BEFORE, + ) + for column in columns + ] + + @dataclass(slots=True) class IR: """Abstract plan node, representing an unevaluated dataframe.""" @@ -83,9 +115,10 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: Raises ------ - NotImplementedError if we couldn't evaluate things. Ideally - this should not occur, since the translation phase should pick - up things that we cannot handle. + NotImplementedError + If we couldn't evaluate things. Ideally this should not occur, + since the translation phase should pick up things that we + cannot handle. """ raise NotImplementedError @@ -96,7 +129,7 @@ class PythonScan(IR): options: Any """Arbitrary options.""" - predicate: expr.Expr | None + predicate: expr.NamedExpr | None """Filter to apply to the constructed dataframe before returning it.""" @@ -117,7 +150,7 @@ class Scan(IR): - ``row_index: tuple[name, offset] | None``: Add an integer index column with given name. """ - predicate: expr.Expr | None + predicate: expr.NamedExpr | None """Mask to apply to the read dataframe.""" def __post_init__(self): @@ -153,14 +186,14 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: init = plc.interop.from_arrow( pa.scalar(offset, type=plc.interop.to_arrow(dtype)) ) - index = Column( - plc.filling.sequence(df.num_rows, init, step), name - ).set_sorted( + index = NamedColumn( + plc.filling.sequence(df.num_rows, init, step), + name, is_sorted=plc.types.Sorted.YES, order=plc.types.Order.ASCENDING, null_order=plc.types.NullOrder.AFTER, ) - df = DataFrame([index, *df.columns], []) + df = DataFrame([index, *df.columns]) # TODO: should be true, but not the case until we get # cudf-classic out of the loop for IO since it converts date32 # to datetime. 
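[Editor's note: the `broadcast` helper introduced in this hunk unifies column lengths the way polars does: all participating columns must either share one common length or have length 1, and length-1 columns are replicated out to the common (or requested target) length; replicated columns are additionally marked as sorted via `is_sorted`/`order`/`null_order`, since a column of one repeated value trivially is. A minimal sketch of just the length rule, using plain integers in place of pylibcudf columns (the integer stand-ins are an assumption for illustration, not part of this patch):]

```python
from __future__ import annotations


def unified_length(*lengths: int, target_length: int | None = None) -> int:
    """Model of the length rule enforced by ``broadcast``."""
    distinct = set(lengths)
    if len(distinct - {1}) > 1:
        raise RuntimeError("Mismatching column lengths")
    if distinct == {1}:
        # All inputs are scalars; only extend if a target was requested.
        return 1 if target_length is None else target_length
    (nrows,) = distinct - {1}
    assert target_length is None or target_length == nrows
    return nrows


assert unified_length(1, 3, 3) == 3  # length-1 columns replicate to 3
assert unified_length(1, 1) == 1  # all-scalar, no target: left alone
assert unified_length(1, 1, target_length=5) == 5
try:
    unified_length(2, 3)
except RuntimeError:
    pass  # incompatible non-unit lengths are rejected
```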
@@ -171,7 +204,7 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: if self.predicate is None: return df else: - mask = self.predicate.evaluate(df) + (mask,) = broadcast(self.predicate.evaluate(df), target_length=df.num_rows) return df.filter(mask) @@ -208,7 +241,7 @@ class DataFrameScan(IR): """Polars LazyFrame object.""" projection: list[str] """List of columns to project out.""" - predicate: expr.Expr | None + predicate: expr.NamedExpr | None """Mask to apply.""" def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: @@ -231,7 +264,7 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: c.obj.type() == dtype for c, dtype in zip(df.columns, self.schema.values()) ) if self.predicate is not None: - mask = self.predicate.evaluate(df) + (mask,) = broadcast(self.predicate.evaluate(df), target_length=df.num_rows) return df.filter(mask) else: return df @@ -243,20 +276,15 @@ class Select(IR): df: IR """Input dataframe.""" - cse: list[expr.Expr] - """ - List of common subexpressions that will appear in the selected expressions. - - These must be evaluated before the returned expressions. - """ - expr: list[expr.Expr] + expr: list[expr.NamedExpr] """List of expressions to evaluate to form the new dataframe.""" def evaluate(self, *, cache: dict[int, DataFrame]): """Evaluate and return a dataframe.""" df = self.df.evaluate(cache=cache) - df = df.with_columns([e.evaluate(df) for e in self.cse]) - return DataFrame([e.evaluate(df) for e in self.expr], []) + # Handle any broadcasting + columns = broadcast(*(e.evaluate(df) for e in self.expr)) + return DataFrame(columns) @dataclass(slots=True) @@ -269,13 +297,15 @@ class Reduce(IR): df: IR """Input dataframe.""" - expr: list[expr.Expr] + expr: list[expr.NamedExpr] """List of expressions to evaluate to form the new dataframe.""" def evaluate(self, *, cache: dict[int, DataFrame]): """Evaluate and return a dataframe.""" df = self.df.evaluate(cache=cache) - return DataFrame([e.evaluate(df) for e in self.expr], []) + columns = broadcast(*(e.evaluate(df) for e in self.expr)) + assert all(column.obj.size() == 1 for column in columns) + return DataFrame(columns) def placeholder_column(n: int): @@ -314,9 +344,9 @@ class GroupBy(IR): df: IR """Input dataframe.""" - agg_requests: list[expr.Expr] + agg_requests: list[expr.NamedExpr] """List of expressions to evaluate groupwise.""" - keys: list[expr.Expr] + keys: list[expr.NamedExpr] """List of expressions forming the keys.""" maintain_order: bool """Should the order of the input dataframe be maintained?""" @@ -339,9 +369,10 @@ def check_agg(agg: expr.Expr) -> int: Raises ------ - NotImplementedError for unsupported expression nodes. + NotImplementedError + For unsupported expression nodes. 
""" - if isinstance(agg, (expr.NamedExpr, expr.BinOp, expr.Cast)): + if isinstance(agg, (expr.BinOp, expr.Cast)): return max(GroupBy.check_agg(child) for child in agg.children) elif isinstance(agg, expr.Agg): if agg.name == "implode": @@ -358,14 +389,16 @@ def __post_init__(self): raise NotImplementedError("Maintaining order in groupby") if self.options.rolling: raise NotImplementedError("rolling window/groupby") - if any(GroupBy.check_agg(a) > 1 for a in self.agg_requests): + if any(GroupBy.check_agg(a.value) > 1 for a in self.agg_requests): raise NotImplementedError("Nested aggregations in groupby") self.agg_infos = [req.collect_agg(depth=0) for req in self.agg_requests] def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" df = self.df.evaluate(cache=cache) - keys = [k.evaluate(df) for k in self.keys] + keys = broadcast( + *(k.evaluate(df) for k in self.keys), target_length=df.num_rows + ) # TODO: use sorted information, need to expose column_order # and null_precedence in pylibcudf groupby constructor # sorted = ( @@ -379,7 +412,7 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: ) # TODO: uniquify requests = [] - replacements = [] + replacements: list[expr.Expr] = [] for info in self.agg_infos: for pre_eval, req, rep in info.requests: if pre_eval is None: @@ -389,17 +422,20 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: requests.append(plc.groupby.GroupByRequest(col, [req])) replacements.append(rep) group_keys, raw_tables = grouper.aggregate(requests) - raw_columns = [] + # TODO: names + raw_columns: list[NamedColumn] = [] for i, table in enumerate(raw_tables): (column,) = table.columns() - raw_columns.append(Column(column, f"column{i}")) + raw_columns.append(NamedColumn(column, f"tmp{i}")) mapping = dict(zip(replacements, raw_columns)) - result_keys = [Column(gk, k.name) for gk, k in zip(group_keys.columns(), keys)] - result_subs = DataFrame(raw_columns, []) + result_keys = [ + NamedColumn(gk, k.name) for gk, k in zip(group_keys.columns(), keys) + ] + result_subs = DataFrame(raw_columns) results = [ req.evaluate(result_subs, mapping=mapping) for req in self.agg_requests ] - return DataFrame([*result_keys, *results], []).slice(self.options.slice) + return DataFrame([*result_keys, *results]).slice(self.options.slice) @dataclass(slots=True) @@ -410,9 +446,9 @@ class Join(IR): """Left frame.""" right: IR """Right frame.""" - left_on: list[expr.Expr] + left_on: list[expr.NamedExpr] """List of expressions used as keys in the left frame.""" - right_on: list[expr.Expr] + right_on: list[expr.NamedExpr] """List of expressions used as keys in the right frame.""" options: tuple[ Literal["inner", "left", "full", "leftsemi", "leftanti"], @@ -479,8 +515,17 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" left = self.left.evaluate(cache=cache) right = self.right.evaluate(cache=cache) - left_on = DataFrame([e.evaluate(left) for e in self.left_on], []) - right_on = DataFrame([e.evaluate(right) for e in self.right_on], []) + left_on = DataFrame( + broadcast( + *(e.evaluate(left) for e in self.left_on), target_length=left.num_rows + ) + ) + right_on = DataFrame( + broadcast( + *(e.evaluate(right) for e in self.right_on), + target_length=right.num_rows, + ) + ) how, join_nulls, zlice, suffix, coalesce = self.options null_equality = ( plc.types.NullEquality.EQUAL @@ -510,7 +555,7 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: if coalesce 
and how != "inner": left = left.replace_columns( *( - Column( + NamedColumn( plc.replace.replace_nulls(left_col.obj, right_col.obj), left_col.name, ) @@ -538,20 +583,18 @@ class HStack(IR): df: IR """Input dataframe.""" - cse: list[expr.Expr] - """ - List of common subexpressions that will appear in the selected expressions. - - These must be evaluated before the returned expressions. - """ - columns: list[expr.Expr] + columns: list[expr.NamedExpr] """List of expressions to produce new columns.""" def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" df = self.df.evaluate(cache=cache) - ctx = df.copy().with_columns([e.evaluate(df) for e in self.cse]) - return df.with_columns([c.evaluate(ctx) for c in self.columns]) + columns = [c.evaluate(df) for c in self.columns] + # TODO: a bit of a hack, should inherit the should_broadcast + # property of polars' ProjectionOptions on the hstack node. + if not any(e.name.startswith("__POLARS_CSER_0x") for e in self.columns): + columns = broadcast(*columns, target_length=df.num_rows) + return df.with_columns(columns) @dataclass(slots=True) @@ -614,7 +657,10 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: plc.types.NanEquality.ALL_EQUAL, ) result = DataFrame( - [Column(c, old.name) for c, old in zip(table.columns(), df.columns)], [] + [ + NamedColumn(c, old.name).sorted_like(old) + for c, old in zip(table.columns(), df.columns) + ] ) if keys_sorted or self.stable: result = result.sorted_like(df) @@ -627,7 +673,7 @@ class Sort(IR): df: IR """Input.""" - by: list[expr.Expr] + by: list[expr.NamedExpr] """List of expressions to produce sort keys.""" do_sort: Callable[..., plc.Table] """pylibcudf sorting function.""" @@ -642,7 +688,7 @@ def __init__( self, schema: dict, df: IR, - by: list[expr.Expr], + by: list[expr.NamedExpr], options: Any, zlice: tuple[int, int] | None, ): @@ -661,7 +707,9 @@ def __init__( def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" df = self.df.evaluate(cache=cache) - sort_keys = [k.evaluate(df) for k in self.by] + sort_keys = broadcast( + *(k.evaluate(df) for k in self.by), target_length=df.num_rows + ) names = {c.name: i for i, c in enumerate(df.columns)} # TODO: More robust identification here. 
keys_in_result = [ @@ -675,7 +723,9 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: self.order, self.null_order, ) - columns = [Column(c, old.name) for c, old in zip(table.columns(), df.columns)] + columns = [ + NamedColumn(c, old.name) for c, old in zip(table.columns(), df.columns) + ] # If a sort key is in the result table, set the sortedness property for k, i in enumerate(keys_in_result): columns[i] = columns[i].set_sorted( @@ -683,7 +733,7 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: order=self.order[k], null_order=self.null_order[k], ) - return DataFrame(columns, []).slice(self.zlice) + return DataFrame(columns).slice(self.zlice) @dataclass(slots=True) @@ -709,13 +759,14 @@ class Filter(IR): df: IR """Input.""" - mask: expr.Expr + mask: expr.NamedExpr """Expression evaluating to a mask.""" def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" df = self.df.evaluate(cache=cache) - return df.filter(self.mask.evaluate(df)) + (mask,) = broadcast(self.mask.evaluate(df), target_length=df.num_rows) + return df.filter(mask) @dataclass(slots=True) @@ -729,7 +780,10 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" df = self.df.evaluate(cache=cache) # This can reorder things. - return df.select(list(self.schema.keys())) + columns = broadcast( + *df.select(list(self.schema.keys())).columns, target_length=df.num_rows + ) + return DataFrame(columns) @dataclass(slots=True) @@ -856,10 +910,8 @@ class HConcat(IR): def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" dfs = [df.evaluate(cache=cache) for df in self.dfs] - columns, scalars = zip(*((df.columns, df.scalars) for df in dfs)) return DataFrame( - list(itertools.chain.from_iterable(columns)), - list(itertools.chain.from_iterable(scalars)), + list(itertools.chain.from_iterable(df.columns for df in dfs)), ) diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py index 9a301164beb..641176daff4 100644 --- a/python/cudf_polars/cudf_polars/dsl/translate.py +++ b/python/cudf_polars/cudf_polars/dsl/translate.py @@ -18,11 +18,25 @@ from cudf_polars.dsl import expr, ir from cudf_polars.utils import dtypes -__all__ = ["translate_ir", "translate_expr"] +__all__ = ["translate_ir", "translate_named_expr"] class set_node(AbstractContextManager): - """Run a block with current node set in the visitor.""" + """ + Run a block with current node set in the visitor. + + Parameters + ---------- + visitor + The internal Rust visitor object + n + The node to set as the current root. + + Notes + ----- + This is useful for translating expressions with a given node + active, restoring the node when the block exits. 
+ """ __slots__ = ("n", "visitor") @@ -52,7 +66,7 @@ def _(node: pl_ir.PythonScan, visitor: Any, schema: dict[str, plc.DataType]) -> return ir.PythonScan( schema, node.options, - translate_expr(visitor, n=node.predicate) + translate_named_expr(visitor, n=node.predicate) if node.predicate is not None else None, ) @@ -65,7 +79,7 @@ def _(node: pl_ir.Scan, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: node.scan_type, node.paths, node.file_options, - translate_expr(visitor, n=node.predicate) + translate_named_expr(visitor, n=node.predicate) if node.predicate is not None else None, ) @@ -84,7 +98,7 @@ def _( schema, node.df, node.projection, - translate_expr(visitor, n=node.selection) + translate_named_expr(visitor, n=node.selection) if node.selection is not None else None, ) @@ -94,17 +108,16 @@ def _( def _(node: pl_ir.Select, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: with set_node(visitor, node.input): inp = translate_ir(visitor, n=None) - cse_exprs = [translate_expr(visitor, n=e) for e in node.cse_expr] - exprs = [translate_expr(visitor, n=e) for e in node.expr] - return ir.Select(schema, inp, cse_exprs, exprs) + exprs = [translate_named_expr(visitor, n=e) for e in node.expr] + return ir.Select(schema, inp, exprs) @_translate_ir.register def _(node: pl_ir.GroupBy, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: with set_node(visitor, node.input): inp = translate_ir(visitor, n=None) - aggs = [translate_expr(visitor, n=e) for e in node.aggs] - keys = [translate_expr(visitor, n=e) for e in node.keys] + aggs = [translate_named_expr(visitor, n=e) for e in node.aggs] + keys = [translate_named_expr(visitor, n=e) for e in node.keys] return ir.GroupBy( schema, inp, @@ -122,10 +135,10 @@ def _(node: pl_ir.Join, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: # input active. 
    with set_node(visitor, node.input_left):
         inp_left = translate_ir(visitor, n=None)
-        left_on = [translate_expr(visitor, n=e) for e in node.left_on]
+        left_on = [translate_named_expr(visitor, n=e) for e in node.left_on]
     with set_node(visitor, node.input_right):
         inp_right = translate_ir(visitor, n=None)
-        right_on = [translate_expr(visitor, n=e) for e in node.right_on]
+        right_on = [translate_named_expr(visitor, n=e) for e in node.right_on]
     return ir.Join(schema, inp_left, inp_right, left_on, right_on, node.options)
 
 
@@ -133,16 +146,15 @@ def _(node: pl_ir.Join, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR:
 def _(node: pl_ir.HStack, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR:
     with set_node(visitor, node.input):
         inp = translate_ir(visitor, n=None)
-        cse_exprs = [translate_expr(visitor, n=e) for e in node.cse_exprs]
-        exprs = [translate_expr(visitor, n=e) for e in node.exprs]
-    return ir.HStack(schema, inp, cse_exprs, exprs)
+        exprs = [translate_named_expr(visitor, n=e) for e in node.exprs]
+    return ir.HStack(schema, inp, exprs)
 
 
 @_translate_ir.register
 def _(node: pl_ir.Reduce, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR:
     with set_node(visitor, node.input):
         inp = translate_ir(visitor, n=None)
-        exprs = [translate_expr(visitor, n=e) for e in node.expr]
+        exprs = [translate_named_expr(visitor, n=e) for e in node.expr]
     return ir.Reduce(schema, inp, exprs)
 
 
@@ -159,7 +171,7 @@ def _(node: pl_ir.Distinct, visitor: Any, schema: dict[str, plc.DataType]) -> ir
 def _(node: pl_ir.Sort, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR:
     with set_node(visitor, node.input):
         inp = translate_ir(visitor, n=None)
-        by = [translate_expr(visitor, n=e) for e in node.by_column]
+        by = [translate_named_expr(visitor, n=e) for e in node.by_column]
     return ir.Sort(schema, inp, by, node.sort_options, node.slice)
 
 
@@ -172,7 +184,7 @@ def _(node: pl_ir.Slice, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR
 def _(node: pl_ir.Filter, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR:
     with set_node(visitor, node.input):
         inp = translate_ir(visitor, n=None)
-        mask = translate_expr(visitor, n=node.predicate)
+        mask = translate_named_expr(visitor, n=node.predicate)
     return ir.Filter(schema, inp, mask)
 
 
@@ -234,8 +246,8 @@ def translate_ir(visitor: Any, *, n: int | None = None) -> ir.IR:
 
     Raises
     ------
-    NotImplementedError if we can't translate the nodes due to
-    unsupported functionality.
+    NotImplementedError
+        If we can't translate the nodes due to unsupported functionality.
     """
     ctx: AbstractContextManager = (
         set_node(visitor, n) if n is not None else noop_context
@@ -246,17 +258,41 @@
     return _translate_ir(node, visitor, schema)
 
 
+def translate_named_expr(visitor: Any, *, n: pl_expr.PyExprIR) -> expr.NamedExpr:
+    """
+    Translate a polars-internal named expression IR object into our representation.
+
+    Parameters
+    ----------
+    visitor
+        Polars NodeTraverser object
+    n
+        Node to translate, a named expression node.
+
+    Returns
+    -------
+    Translated IR object.
+
+    Notes
+    -----
+    The datatype of the internal expression will be obtained from the
+    visitor by calling ``get_dtype``; for this to work properly, the
+    caller should arrange that the expression is translated with the
+    node that it references "active" for the visitor (see :class:`set_node`).
+
+    Raises
+    ------
+    NotImplementedError
+        If any translation fails due to unsupported functionality.
+ """ + return expr.NamedExpr(n.output_name, translate_expr(visitor, n=n.node)) + + @singledispatch def _translate_expr(node: Any, visitor: Any, dtype: plc.DataType) -> expr.Expr: raise NotImplementedError(f"Translation for {type(node).__name__}") -@_translate_expr.register -def _(node: pl_expr.PyExprIR, visitor: Any, dtype: plc.DataType) -> expr.Expr: - e = translate_expr(visitor, n=node.node) - return expr.NamedExpr(dtype, node.output_name, e) - - @_translate_expr.register def _(node: pl_expr.Function, visitor: Any, dtype: plc.DataType) -> expr.Expr: name, *options = node.function_data @@ -375,7 +411,7 @@ def _(node: pl_expr.Len, visitor: Any, dtype: plc.DataType) -> expr.Expr: return expr.Len(dtype) -def translate_expr(visitor: Any, *, n: int | pl_expr.PyExprIR) -> expr.Expr: +def translate_expr(visitor: Any, *, n: int) -> expr.Expr: """ Translate a polars-internal expression IR into our representation. @@ -384,8 +420,7 @@ def translate_expr(visitor: Any, *, n: int | pl_expr.PyExprIR) -> expr.Expr: visitor Polars NodeTraverser object n - Node to translate, either an integer referencing a polars - internal node, or a named expression node. + Node to translate, an integer referencing a polars internal node. Returns ------- @@ -393,14 +428,9 @@ def translate_expr(visitor: Any, *, n: int | pl_expr.PyExprIR) -> expr.Expr: Raises ------ - NotImplementedError if any translation fails due to unsupported functionality. + NotImplementedError + If any translation fails due to unsupported functionality. """ - if isinstance(n, pl_expr.PyExprIR): - # TODO: type narrowing doesn't rule out int since PyExprIR is Unknown - assert not isinstance(n, int) - node = n - dtype = dtypes.from_polars(visitor.get_dtype(node.node)) - else: - node = visitor.view_expression(n) - dtype = dtypes.from_polars(visitor.get_dtype(n)) + node = visitor.view_expression(n) + dtype = dtypes.from_polars(visitor.get_dtype(n)) return _translate_expr(node, visitor, dtype) diff --git a/python/cudf_polars/cudf_polars/testing/asserts.py b/python/cudf_polars/cudf_polars/testing/asserts.py index a6e26a6425c..2fbfa971fef 100644 --- a/python/cudf_polars/cudf_polars/testing/asserts.py +++ b/python/cudf_polars/cudf_polars/testing/asserts.py @@ -23,7 +23,7 @@ def assert_gpu_result_equal( *, check_row_order: bool = True, check_column_order: bool = True, - check_dtype: bool = True, + check_dtypes: bool = True, check_exact: bool = True, rtol: float = 1e-05, atol: float = 1e-08, @@ -40,7 +40,7 @@ def assert_gpu_result_equal( Expect rows to be in same order check_column_order Expect columns to be in same order - check_dtype + check_dtypes Expect dtypes to match check_exact Require exact equality for floats, if `False` compare using @@ -68,7 +68,7 @@ def assert_gpu_result_equal( got, check_row_order=check_row_order, check_column_order=check_column_order, - check_dtype=check_dtype, + check_dtypes=check_dtypes, check_exact=check_exact, rtol=rtol, atol=atol, diff --git a/python/cudf_polars/cudf_polars/utils/dtypes.py b/python/cudf_polars/cudf_polars/utils/dtypes.py index bede0de3c9f..7b0049daf11 100644 --- a/python/cudf_polars/cudf_polars/utils/dtypes.py +++ b/python/cudf_polars/cudf_polars/utils/dtypes.py @@ -32,7 +32,8 @@ def from_polars(dtype: pl.DataType) -> plc.DataType: Raises ------ - NotImplementedError for unsupported conversions. + NotImplementedError + For unsupported conversions. 
""" if isinstance(dtype, pl.Boolean): return plc.DataType(plc.TypeId.BOOL8) diff --git a/python/cudf_polars/cudf_polars/utils/sorting.py b/python/cudf_polars/cudf_polars/utils/sorting.py index b3ecfdd3dd4..d35459db20d 100644 --- a/python/cudf_polars/cudf_polars/utils/sorting.py +++ b/python/cudf_polars/cudf_polars/utils/sorting.py @@ -14,7 +14,7 @@ def sort_order( - descending: Sequence[bool], *, nulls_last: bool, num_keys: int + descending: Sequence[bool], *, nulls_last: Sequence[bool], num_keys: int ) -> tuple[list[plc.types.Order], list[plc.types.NullOrder]]: """ Produce sort order arguments. @@ -36,14 +36,18 @@ def sort_order( # Mimicking polars broadcast handling of descending if num_keys > (n := len(descending)) and n == 1: descending = [descending[0]] * num_keys + if num_keys > (n := len(nulls_last)) and n == 1: + nulls_last = [nulls_last[0]] * num_keys column_order = [ plc.types.Order.DESCENDING if d else plc.types.Order.ASCENDING for d in descending ] null_precedence = [] - for asc in column_order: - if (asc == plc.types.Order.ASCENDING) ^ (not nulls_last): + # TODO: use strict=True when we drop py39 + assert len(descending) == len(nulls_last) + for asc, null_last in zip(column_order, nulls_last): + if (asc == plc.types.Order.ASCENDING) ^ (not null_last): null_precedence.append(plc.types.NullOrder.AFTER) - elif (asc == plc.types.Order.ASCENDING) ^ nulls_last: + elif (asc == plc.types.Order.ASCENDING) ^ null_last: null_precedence.append(plc.types.NullOrder.BEFORE) return column_order, null_precedence diff --git a/python/cudf_polars/docs/overview.md b/python/cudf_polars/docs/overview.md index cbf012f5881..b50d01c26db 100644 --- a/python/cudf_polars/docs/overview.md +++ b/python/cudf_polars/docs/overview.md @@ -34,6 +34,8 @@ pip install --upgrade uv uv pip install --upgrade -r py-polars/requirements-dev.txt ``` +> ![NOTE] plain `pip install` works fine, but `uv` is _much_ faster! + Now we have the necessary machinery to build polars ```sh cd py-polars @@ -57,7 +59,7 @@ The executor for the polars logical plan lives in the cudf repo, in ```sh cd cudf/python/cudf_polars -pip install --no-deps -e . +uv pip install --no-build-isolation --no-deps -e . ``` You should now be able to run the tests in the `cudf_polars` package: @@ -96,6 +98,21 @@ This should either transparently run on the GPU and deliver a polars dataframe, or else fail (but be handled) and just run the normal CPU execution. +If you want to fail during translation, set the keyword argument +`raise_on_fail` to `True`: + +```python +from functools import partial +from cudf_polars.callback import execute_with_cudf + +result = q.collect( + post_opt_callback=partial(execute_with_cudf, raise_on_fail=True) +) +``` + +This is mostly useful when writing tests, since in that case we want +any failures to propagate, rather than falling back to the CPU mode. + ## Adding a handler for a new plan node Plan node definitions live in `cudf_polars/dsl/ir.py`, these are @@ -153,22 +170,84 @@ the logical plan in any case, so is reasonably natural. # Containers Containers should be constructed as relatively lightweight objects -around their pylibcudf counterparts. We have three (in +around their pylibcudf counterparts. We have four (in `cudf_polars/containers/`): -1. Scalar (a wrapper around a pylibcudf Scalar) -2. Column (a wrapper around a pylibcudf Column) -3. DataFrame (a wrapper around a pylibcudf Table) +1. `Scalar` (a wrapper around a pylibcudf `Scalar`) +2. `Column` (a wrapper around a pylibcudf `Column`) +3. 
`NamedColumn` a `Column` with an additional name +4. `DataFrame` (a wrapper around a pylibcudf `Table`) The interfaces offered by these are somewhat in flux, but broadly -speaking, a `DataFrame` is just a list of `Column`s which each hold -data plus a string `name`, along with a collection of `Scalar`s (this -might go away). +speaking, a `DataFrame` is just a list of `NamedColumn`s which each +hold a `Column` plus a string `name`. `NamedColumn`s are only ever +constructed via `NamedExpr`s, which are the top-level expression node +that lives inside an `IR` node. This means that the expression +evaluator never has to concern itself with column names: columns are +only ever decorated with names when constructing a `DataFrame`. The columns keep track of metadata (for example, whether or not they -are sorted). +are sorted). We could imagine tracking more metadata, like minimum and +maximum, though perhaps that is better left to libcudf itself. We offer some utility methods for transferring metadata when constructing new dataframes and columns, both `DataFrame` and `Column` -offer a `with_metadata(*, like: Self)` call which copies metadata from -the template. +offer a `sorted_like(like: Self)` call which copies metadata from the +template. + +All methods on containers that modify in place should return `self`, +to facilitate use in a ["fluent" +style](https://en.wikipedia.org/wiki/Fluent_interface). It makes it +much easier to write iteration over objects and collect the results if +everyone always returns a value. + +# Writing tests + +We use `pytest`, tests live in the `tests/` subdirectory, +organisationally the top-level test files each handle one of the `IR` +nodes. The goal is that they are parametrized over all the options +each node will handle, to have reasonable coverage. Tests of +expression functionality should live in `tests/expressions/`. + +To write a test an assert correctness, build a lazyframe as a query, +and then use the utility assertion function from +`cudf_polars.testing.asserts`. This runs the query using both the cudf +executor and polars CPU, and checks that they match. So: + +```python +from cudf_polars.testing.asserts import assert_gpu_result_equal + + +def test_whatever(): + query = pl.LazyFrame(...).(...) + + assert_gpu_result_equal(query) +``` + +# Debugging + +If the callback execution fails during the polars `collect` call, we +obtain an error, but are not able to drop into the debugger and +inspect the stack properly: we can't cross the language barrier. + +However, we can drive the translation and execution of the DSL by +hand. Given some `LazyFrame` representing a query, we can first +translate it to our intermediate representation (IR), and then execute +and convert back to polars: + +```python +from cudf_polars.dsl.translate import translate_ir + +q = ... + +# Convert to our IR +ir = translate_ir(q._ldf.visit()) + +# DataFrame living on the device +result = ir.evaluate(cache={}) + +# Polars dataframe +host_result = result.to_polars() +``` + +If we get any exceptions, we can then debug as normal in Python. 
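[Editor's note: a small hedged addition to the debugging recipe above. Because `ir.evaluate` runs as ordinary Python, the standard library's post-mortem debugger can take over once an exception escapes; nothing here beyond the `translate_ir`/`evaluate` entry points shown above comes from the patch itself.]

```python
import pdb

from cudf_polars.dsl.translate import translate_ir

q = ...  # some LazyFrame, as above

ir = translate_ir(q._ldf.visit())
try:
    ir.evaluate(cache={})
except Exception:
    # Drop into the frame that raised; every frame below this point is
    # Python, so no Rust frames block the debugger.
    pdb.post_mortem()
```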
diff --git a/python/cudf_polars/tests/expressions/test_agg.py b/python/cudf_polars/tests/expressions/test_agg.py index 645dbd26140..79018c80bf3 100644 --- a/python/cudf_polars/tests/expressions/test_agg.py +++ b/python/cudf_polars/tests/expressions/test_agg.py @@ -56,8 +56,8 @@ def test_agg(df, agg): q = df.select(expr) # https://github.com/rapidsai/cudf/issues/15852 - check_dtype = agg not in {"n_unique", "median"} - if not check_dtype and q.schema["a"] != pl.Float64: + check_dtypes = agg not in {"n_unique", "median"} + if not check_dtypes and q.schema["a"] != pl.Float64: with pytest.raises(AssertionError): assert_gpu_result_equal(q) - assert_gpu_result_equal(q, check_dtype=check_dtype, check_exact=False) + assert_gpu_result_equal(q, check_dtypes=check_dtypes, check_exact=False) diff --git a/python/cudf_polars/tests/test_select.py b/python/cudf_polars/tests/test_select.py index 503edef152e..037f3ab5428 100644 --- a/python/cudf_polars/tests/test_select.py +++ b/python/cudf_polars/tests/test_select.py @@ -36,3 +36,24 @@ def test_select_reduce(): ) assert_gpu_result_equal(query) + + +def test_select_with_cse_no_agg(): + df = pl.LazyFrame({"a": [1, 2, 3]}) + expr = pl.col("a") + pl.col("a") + + query = df.select(expr, (expr * 2).alias("b"), ((expr * 2) + 10).alias("c")) + + assert_gpu_result_equal(query) + + +def test_select_with_cse_with_agg(): + df = pl.LazyFrame({"a": [1, 2, 3]}) + expr = pl.col("a") + pl.col("a") + asum = pl.col("a").sum() + pl.col("a").sum() + + query = df.select( + expr, (expr * 2).alias("b"), asum.alias("c"), (asum + 10).alias("d") + ) + + assert_gpu_result_equal(query) diff --git a/python/cudf_polars/tests/test_union.py b/python/cudf_polars/tests/test_union.py index 2c85bb15a55..18cf4748692 100644 --- a/python/cudf_polars/tests/test_union.py +++ b/python/cudf_polars/tests/test_union.py @@ -2,14 +2,11 @@ # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations -import pytest - import polars as pl from cudf_polars.testing.asserts import assert_gpu_result_equal -@pytest.mark.xfail(reason="Need handling of null scalars that are cast") def test_union(): ldf = pl.DataFrame( { @@ -19,8 +16,6 @@ def test_union(): ).lazy() ldf2 = ldf.select((pl.col("a") + pl.col("b")).alias("c"), pl.col("a")) query = pl.concat([ldf, ldf2], how="diagonal") - # Plan for this produces a `None`.astype(Int64) which we don't - # handle correctly right now assert_gpu_result_equal(query) From 66895af970c19978e12c242f92f5b5676d91b9e3 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Thu, 6 Jun 2024 11:12:15 -0500 Subject: [PATCH 053/340] Implement chunked parquet reader in cudf-python (#15728) Partially Addresses: #14966 This PR implements chunked parquet bindings in python. 
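[Editor's note: a minimal usage sketch of the reader this PR adds, mirroring the test included below. `ParquetReader`, `chunk_read_limit`, and `read()` are the names introduced in this patch; everything else is illustrative.]

```python
from io import BytesIO

import pandas as pd

import cudf
from cudf._lib.parquet import ParquetReader

buffer = BytesIO()
pd.DataFrame({"a": range(100_000)}).to_parquet(buffer)

# Decode the file in bounded output chunks (the limit is in bytes);
# read() loops over the chunks and concatenates them back together.
reader = ParquetReader([buffer], chunk_read_limit=240)
result = reader.read()
assert result.equals(cudf.read_parquet(buffer))
```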
Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Thomas Li (https://github.com/lithomas1) URL: https://github.com/rapidsai/cudf/pull/15728 --- python/cudf/cudf/_lib/parquet.pyx | 242 +++++++++++++----- .../_lib/pylibcudf/libcudf/io/parquet.pxd | 12 + python/cudf/cudf/tests/test_parquet.py | 27 ++ 3 files changed, 220 insertions(+), 61 deletions(-) diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index ac592cedaac..f6f9cfa9a7c 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -26,6 +26,7 @@ from libc.stdint cimport uint8_t from libcpp cimport bool from libcpp.map cimport map from libcpp.memory cimport make_unique, unique_ptr +from libcpp.pair cimport pair from libcpp.string cimport string from libcpp.unordered_map cimport unordered_map from libcpp.utility cimport move @@ -44,6 +45,7 @@ from cudf._lib.io.utils cimport ( ) from cudf._lib.pylibcudf.libcudf.expressions cimport expression from cudf._lib.pylibcudf.libcudf.io.parquet cimport ( + chunked_parquet_reader as cpp_chunked_parquet_reader, chunked_parquet_writer_options, merge_row_group_metadata as parquet_merge_metadata, parquet_chunked_writer as cpp_parquet_chunked_writer, @@ -60,6 +62,7 @@ from cudf._lib.pylibcudf.libcudf.io.parquet_metadata cimport ( from cudf._lib.pylibcudf.libcudf.io.types cimport ( column_in_metadata, table_input_metadata, + table_metadata, ) from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view from cudf._lib.pylibcudf.libcudf.types cimport data_type, size_type @@ -126,50 +129,22 @@ def _parse_metadata(meta): return file_is_range_index, file_index_cols, file_column_dtype -cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None, - use_pandas_metadata=True, - Expression filters=None): - """ - Cython function to call into libcudf API, see `read_parquet`. - - filters, if not None, should be an Expression that evaluates to a - boolean predicate as a function of columns being read. 
- - See Also - -------- - cudf.io.parquet.read_parquet - cudf.io.parquet.to_parquet - """ - - # Convert NativeFile buffers to NativeFileDatasource, - # but save original buffers in case we need to use - # pyarrow for metadata processing - # (See: https://github.com/rapidsai/cudf/issues/9599) - pa_buffers = [] - for i, datasource in enumerate(filepaths_or_buffers): - if isinstance(datasource, NativeFile): - pa_buffers.append(datasource) - filepaths_or_buffers[i] = NativeFileDatasource(datasource) +cdef pair[parquet_reader_options, bool] _setup_parquet_reader_options( + cudf_io_types.source_info source, + vector[vector[size_type]] row_groups, + bool use_pandas_metadata, + Expression filters, + object columns): - cdef cudf_io_types.source_info source = make_source_info( - filepaths_or_buffers) - - cdef bool cpp_use_pandas_metadata = use_pandas_metadata - - cdef vector[vector[size_type]] cpp_row_groups + cdef parquet_reader_options args + cdef parquet_reader_options_builder builder cdef data_type cpp_timestamp_type = cudf_types.data_type( cudf_types.type_id.EMPTY ) - if row_groups is not None: - cpp_row_groups = row_groups - - # Setup parquet reader arguments - cdef parquet_reader_options args - cdef parquet_reader_options_builder builder builder = ( parquet_reader_options.builder(source) - .row_groups(cpp_row_groups) - .use_pandas_metadata(cpp_use_pandas_metadata) + .row_groups(row_groups) + .use_pandas_metadata(use_pandas_metadata) .use_arrow_schema(True) .timestamp_type(cpp_timestamp_type) ) @@ -185,28 +160,28 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None, for col in columns: cpp_columns.push_back(str(col).encode()) args.set_columns(cpp_columns) - # Filters don't handle the range index correctly allow_range_index &= filters is None - # Read Parquet - cdef cudf_io_types.table_with_metadata c_result - - with nogil: - c_result = move(parquet_reader(args)) - - names = [info.name.decode() for info in c_result.metadata.schema_info] - - # Access the Parquet per_file_user_data to find the index + return pair[parquet_reader_options, bool](args, allow_range_index) + +cdef object _process_metadata(object df, + table_metadata table_meta, + list names, + object row_groups, + object filepaths_or_buffers, + list pa_buffers, + bool allow_range_index, + bool use_pandas_metadata): + update_struct_field_names(df, table_meta.schema_info) index_col = None - cdef vector[unordered_map[string, string]] per_file_user_data = \ - c_result.metadata.per_file_user_data - + is_range_index = True column_index_type = None index_col_names = None - is_range_index = True + meta = None + cdef vector[unordered_map[string, string]] per_file_user_data = \ + table_meta.per_file_user_data for single_file in per_file_user_data: json_str = single_file[b'pandas'].decode('utf-8') - meta = None if json_str != "": meta = json.loads(json_str) file_is_range_index, index_col, column_index_type = _parse_metadata(meta) @@ -220,13 +195,6 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None, if c['field_name'] == idx_col: index_col_names[idx_col] = c['name'] - df = cudf.DataFrame._from_data(*data_from_unique_ptr( - move(c_result.tbl), - column_names=names - )) - - update_struct_field_names(df, c_result.metadata.schema_info) - if meta is not None: # Book keep each column metadata as the order # of `meta["columns"]` and `column_names` are not @@ -319,9 +287,65 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None, if use_pandas_metadata: df.index.names = index_col - # Set column 
dtype for empty types. if len(df._data.names) == 0 and column_index_type is not None: df._data.label_dtype = cudf.dtype(column_index_type) + + return df + + +cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None, + use_pandas_metadata=True, + Expression filters=None): + """ + Cython function to call into libcudf API, see `read_parquet`. + + filters, if not None, should be an Expression that evaluates to a + boolean predicate as a function of columns being read. + + See Also + -------- + cudf.io.parquet.read_parquet + cudf.io.parquet.to_parquet + """ + + # Convert NativeFile buffers to NativeFileDatasource, + # but save original buffers in case we need to use + # pyarrow for metadata processing + # (See: https://github.com/rapidsai/cudf/issues/9599) + pa_buffers = [] + for i, datasource in enumerate(filepaths_or_buffers): + if isinstance(datasource, NativeFile): + pa_buffers.append(datasource) + filepaths_or_buffers[i] = NativeFileDatasource(datasource) + + cdef cudf_io_types.source_info source = make_source_info( + filepaths_or_buffers) + + cdef vector[vector[size_type]] cpp_row_groups + if row_groups is not None: + cpp_row_groups = row_groups + + # Setup parquet reader arguments + cdef parquet_reader_options args + cdef pair[parquet_reader_options, bool] c_res = _setup_parquet_reader_options( + source, cpp_row_groups, use_pandas_metadata, filters, columns) + args, allow_range_index = c_res.first, c_res.second + + # Read Parquet + cdef cudf_io_types.table_with_metadata c_result + + with nogil: + c_result = move(parquet_reader(args)) + + names = [info.name.decode() for info in c_result.metadata.schema_info] + + df = cudf.DataFrame._from_data(*data_from_unique_ptr( + move(c_result.tbl), + column_names=names + )) + df = _process_metadata(df, c_result.metadata, names, row_groups, + filepaths_or_buffers, pa_buffers, + allow_range_index, use_pandas_metadata) return df cpdef read_parquet_metadata(filepaths_or_buffers): @@ -767,6 +791,102 @@ cdef class ParquetWriter: self.initialized = True +cdef class ParquetReader: + cdef bool initialized + cdef unique_ptr[cpp_chunked_parquet_reader] reader + cdef size_t chunk_read_limit + cdef size_t pass_read_limit + cdef size_t row_group_size_bytes + cdef table_metadata result_meta + cdef vector[unordered_map[string, string]] per_file_user_data + cdef object pandas_meta + cdef list pa_buffers + cdef bool allow_range_index + cdef object row_groups + cdef object filepaths_or_buffers + cdef object names + cdef object column_index_type + cdef object index_col_names + cdef bool is_range_index + cdef object index_col + cdef bool cpp_use_pandas_metadata + + def __cinit__(self, filepaths_or_buffers, columns=None, row_groups=None, + use_pandas_metadata=True, + size_t chunk_read_limit=0, + size_t pass_read_limit=1024000000): + + # Convert NativeFile buffers to NativeFileDatasource, + # but save original buffers in case we need to use + # pyarrow for metadata processing + # (See: https://github.com/rapidsai/cudf/issues/9599) + + pa_buffers = [] + for i, datasource in enumerate(filepaths_or_buffers): + if isinstance(datasource, NativeFile): + pa_buffers.append(datasource) + filepaths_or_buffers[i] = NativeFileDatasource(datasource) + self.pa_buffers = pa_buffers + cdef cudf_io_types.source_info source = make_source_info( + filepaths_or_buffers) + + self.cpp_use_pandas_metadata = use_pandas_metadata + + cdef vector[vector[size_type]] cpp_row_groups + if row_groups is not None: + cpp_row_groups = row_groups + cdef parquet_reader_options args + cdef 
pair[parquet_reader_options, bool] c_res = _setup_parquet_reader_options( + source, cpp_row_groups, use_pandas_metadata, None, columns) + args, self.allow_range_index = c_res.first, c_res.second + + with nogil: + self.reader.reset( + new cpp_chunked_parquet_reader( + chunk_read_limit, + pass_read_limit, + args + ) + ) + self.initialized = False + self.row_groups = row_groups + self.filepaths_or_buffers = filepaths_or_buffers + + def _has_next(self): + cdef bool res + with nogil: + res = self.reader.get()[0].has_next() + return res + + def _read_chunk(self): + # Read Parquet + cdef cudf_io_types.table_with_metadata c_result + + with nogil: + c_result = move(self.reader.get()[0].read_chunk()) + + if not self.initialized: + self.names = [info.name.decode() for info in c_result.metadata.schema_info] + self.result_meta = c_result.metadata + + df = cudf.DataFrame._from_data(*data_from_unique_ptr( + move(c_result.tbl), + column_names=self.names, + )) + + self.initialized = True + return df + + def read(self): + dfs = [] + while self._has_next(): + dfs.append(self._read_chunk()) + df = cudf.concat(dfs) + df = _process_metadata(df, self.result_meta, self.names, self.row_groups, + self.filepaths_or_buffers, self.pa_buffers, + self.allow_range_index, self.cpp_use_pandas_metadata) + return df + cpdef merge_filemetadata(object filemetadata_list): """ Cython function to call into libcudf API, see `merge_row_group_metadata`. diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/parquet.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/parquet.pxd index 33a594b432f..fb98650308a 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/parquet.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/parquet.pxd @@ -283,6 +283,18 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: vector[string] column_chunks_file_paths, ) except + + cdef cppclass chunked_parquet_reader: + chunked_parquet_reader() except + + chunked_parquet_reader( + size_t chunk_read_limit, + const parquet_reader_options& options) except + + chunked_parquet_reader( + size_t chunk_read_limit, + size_t pass_read_limit, + const parquet_reader_options& options) except + + bool has_next() except + + cudf_io_types.table_with_metadata read_chunk() except + + cdef unique_ptr[vector[uint8_t]] merge_row_group_metadata( const vector[unique_ptr[vector[uint8_t]]]& metadata_list ) except + diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index e32fdacd8d6..2596fe8cd37 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -22,6 +22,7 @@ from pyarrow import fs as pa_fs, parquet as pq import cudf +from cudf._lib.parquet import ParquetReader from cudf.io.parquet import ( ParquetDatasetWriter, ParquetWriter, @@ -3407,3 +3408,29 @@ def test_parquet_reader_roundtrip_structs_with_arrow_schema(): # Check results assert_eq(expected, got) + + +@pytest.mark.parametrize("chunk_read_limit", [0, 240, 1024000000]) +@pytest.mark.parametrize("pass_read_limit", [0, 240, 1024000000]) +@pytest.mark.parametrize("use_pandas_metadata", [True, False]) +@pytest.mark.parametrize("row_groups", [[[0]], None, [[0, 1]]]) +def test_parquet_chunked_reader( + chunk_read_limit, pass_read_limit, use_pandas_metadata, row_groups +): + df = pd.DataFrame( + {"a": [1, 2, 3, 4] * 1000000, "b": ["av", "qw", "hi", "xyz"] * 1000000} + ) + buffer = BytesIO() + df.to_parquet(buffer) + reader = ParquetReader( + [buffer], + chunk_read_limit=chunk_read_limit, + pass_read_limit=pass_read_limit, + 
use_pandas_metadata=use_pandas_metadata,
+        row_groups=row_groups,
+    )
+    expected = cudf.read_parquet(
+        buffer, use_pandas_metadata=use_pandas_metadata, row_groups=row_groups
+    )
+    actual = reader.read()
+    assert_eq(expected, actual)

From 61da92415f1449f64a4050d2dec47b29344389a9 Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell
Date: Thu, 6 Jun 2024 17:19:28 +0100
Subject: [PATCH 054/340] Document how to use cudf.pandas in tandem with
 multiprocessing (#15940)

We need to arrange that cudf.pandas.install() is run on the workers;
this requires that we programmatically install the metapath loader in
our script. Unfortunately, passing an initializer function to the pool
startup is not sufficient if any part of the script transitively loads
pandas at the top level.

- Closes #15246

Authors:
  - Lawrence Mitchell (https://github.com/wence-)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/15940
---
 docs/cudf/source/cudf_pandas/usage.md | 30 +++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/docs/cudf/source/cudf_pandas/usage.md b/docs/cudf/source/cudf_pandas/usage.md
index b174c606d66..376784439aa 100644
--- a/docs/cudf/source/cudf_pandas/usage.md
+++ b/docs/cudf/source/cudf_pandas/usage.md
@@ -26,6 +26,36 @@ From the command line, run your Python scripts with `-m cudf.pandas`:
 python -m cudf.pandas script.py
 ```
 
+### Usage in tandem with
+[`multiprocessing`](https://docs.python.org/3/library/multiprocessing.html)
+or
+[`concurrent.futures`](https://docs.python.org/3/library/concurrent.futures.html)
+process pools
+
+To use a pool of workers (for example
+[`multiprocessing.Pool`](https://docs.python.org/3/library/multiprocessing.html#multiprocessing.pool.Pool)
+or
+[`concurrent.futures.ProcessPoolExecutor`](https://docs.python.org/3/library/concurrent.futures.html#concurrent.futures.ProcessPoolExecutor))
+in your script with `cudf.pandas`, the `cudf.pandas` module must be
+loaded on the worker processes, as well as by the controlling script.
+The most foolproof way to do this is to programmatically install
+`cudf.pandas` at the top of your script, before anything else.
+For example
+
+```python
+# This is equivalent to python -m cudf.pandas, but will run on the
+# workers too. These two lines must run before pandas is imported,
+# either directly or transitively.
+import cudf.pandas
+cudf.pandas.install()
+
+from multiprocessing import Pool
+
+with Pool(4) as pool:
+    # use pool here
+    ...
+```
+
 ## Understanding performance - the `cudf.pandas` profiler
 
 `cudf.pandas` will attempt to use the GPU whenever possible and fall

From 3468fa1f5b9dfcf83a95bcb09fe5a4d8d3808620 Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell
Date: Thu, 6 Jun 2024 19:30:48 +0100
Subject: [PATCH 055/340] Add more complete type annotations in polars
 interpreter (#15942)

We can check this with:

    pyright --verifytypes cudf_polars --ignoreexternal

Which reports a "type completeness" score of around 94%. This will
improve once pylibcudf gets type stubs.
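[Editor's note: a hedged sketch of wiring that check into a script. pyright's `--outputjson` flag and the `typeCompleteness.completenessScore` field of its report are assumptions to verify against your pyright version.]

```python
import json
import subprocess

proc = subprocess.run(
    ["pyright", "--ignoreexternal", "--outputjson", "--verifytypes", "cudf_polars"],
    capture_output=True,
    text=True,
    check=False,  # pyright may exit non-zero when completeness is below 100%
)
score = json.loads(proc.stdout)["typeCompleteness"]["completenessScore"]
print(f"type completeness: {score:.1%}")
```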
Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - James Lamb (https://github.com/jameslamb) - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/15942 --- .pre-commit-config.yaml | 2 +- python/cudf_polars/cudf_polars/__init__.py | 5 +- python/cudf_polars/cudf_polars/callback.py | 3 +- .../cudf_polars/containers/dataframe.py | 13 +- python/cudf_polars/cudf_polars/dsl/expr.py | 55 +++++--- python/cudf_polars/cudf_polars/dsl/ir.py | 110 +++++++-------- .../cudf_polars/cudf_polars/dsl/translate.py | 127 ++++++++++++------ python/cudf_polars/cudf_polars/py.typed | 0 .../cudf_polars/testing/asserts.py | 2 +- .../cudf_polars/typing/__init__.py | 91 +++++++++++++ python/cudf_polars/pyproject.toml | 2 - 11 files changed, 287 insertions(+), 123 deletions(-) create mode 100644 python/cudf_polars/cudf_polars/py.typed create mode 100644 python/cudf_polars/cudf_polars/typing/__init__.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 8865fb48e0d..4cdcac88091 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -134,7 +134,7 @@ repos: - id: rapids-dependency-file-generator args: ["--clean"] - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.4.3 + rev: v0.4.8 hooks: - id: ruff files: python/.*$ diff --git a/python/cudf_polars/cudf_polars/__init__.py b/python/cudf_polars/cudf_polars/__init__.py index 74547fe2448..b19a282129a 100644 --- a/python/cudf_polars/cudf_polars/__init__.py +++ b/python/cudf_polars/cudf_polars/__init__.py @@ -10,4 +10,7 @@ from __future__ import annotations -__all__: list[str] = [] +from cudf_polars.callback import execute_with_cudf +from cudf_polars.dsl.translate import translate_ir + +__all__: list[str] = ["execute_with_cudf", "translate_ir"] diff --git a/python/cudf_polars/cudf_polars/callback.py b/python/cudf_polars/cudf_polars/callback.py index aabb8498ce2..979087d5273 100644 --- a/python/cudf_polars/cudf_polars/callback.py +++ b/python/cudf_polars/cudf_polars/callback.py @@ -16,6 +16,7 @@ import polars as pl from cudf_polars.dsl.ir import IR + from cudf_polars.typing import NodeTraverser __all__: list[str] = ["execute_with_cudf"] @@ -33,7 +34,7 @@ def _callback( return ir.evaluate(cache={}).to_polars() -def execute_with_cudf(nt, *, raise_on_fail: bool = False) -> None: +def execute_with_cudf(nt: NodeTraverser, *, raise_on_fail: bool = False) -> None: """ A post optimization callback that attempts to execute the plan with cudf. 
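[Editor's note: for context, a hypothetical usage sketch of the `execute_with_cudf` entry point re-exported above, not part of the patch; it assumes a polars build that accepts the `post_opt_callback` keyword to `collect` and a CUDA-capable environment:

    from functools import partial

    import polars as pl

    from cudf_polars import execute_with_cudf

    q = pl.LazyFrame({"a": [1, 2, 3, 4]}).select(pl.col("a").sum())
    # Hand the optimized plan to cudf_polars; with raise_on_fail=False
    # (the default), unsupported plans fall back to polars' CPU engine.
    result = q.collect(post_opt_callback=partial(execute_with_cudf))
]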
diff --git a/python/cudf_polars/cudf_polars/containers/dataframe.py b/python/cudf_polars/cudf_polars/containers/dataframe.py index eeaf181be0c..ac7e748095e 100644 --- a/python/cudf_polars/cudf_polars/containers/dataframe.py +++ b/python/cudf_polars/cudf_polars/containers/dataframe.py @@ -6,7 +6,7 @@ from __future__ import annotations from functools import cached_property -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, cast import polars as pl @@ -17,6 +17,7 @@ if TYPE_CHECKING: from collections.abc import Mapping, Sequence, Set + import pyarrow as pa from typing_extensions import Self import cudf @@ -44,13 +45,13 @@ def copy(self) -> Self: def to_polars(self) -> pl.DataFrame: """Convert to a polars DataFrame.""" - return pl.from_arrow( - plc.interop.to_arrow( - self.table, - [plc.interop.ColumnMetadata(name=c.name) for c in self.columns], - ) + table: pa.Table = plc.interop.to_arrow( + self.table, + [plc.interop.ColumnMetadata(name=c.name) for c in self.columns], ) + return cast(pl.DataFrame, pl.from_arrow(table)) + @cached_property def column_names_set(self) -> frozenset[str]: """Return the column names as a set.""" diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index c7c11cf6c68..6d9435ce373 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -139,14 +139,14 @@ def is_equal(self, other: Any) -> bool: other.children ) - def __eq__(self, other) -> bool: + def __eq__(self, other: Any) -> bool: """Equality of expressions.""" if type(self) != type(other) or hash(self) != hash(other): return False else: return self.is_equal(other) - def __ne__(self, other) -> bool: + def __ne__(self, other: Any) -> bool: """Inequality of expressions.""" return not self.__eq__(other) @@ -285,6 +285,8 @@ class NamedExpr: # when evaluating expressions themselves, only when constructing # named return values in dataframe (IR) nodes. 
__slots__ = ("name", "value") + value: Expr + name: str def __init__(self, name: str, value: Expr) -> None: self.name = name @@ -298,7 +300,7 @@ def __repr__(self) -> str: """Repr of the expression.""" return f"NamedExpr({self.name}, {self.value}" - def __eq__(self, other) -> bool: + def __eq__(self, other: Any) -> bool: """Equality of two expressions.""" return ( type(self) is type(other) @@ -306,7 +308,7 @@ def __eq__(self, other) -> bool: and self.value == other.value ) - def __ne__(self, other) -> bool: + def __ne__(self, other: Any) -> bool: """Inequality of expressions.""" return not self.__eq__(other) @@ -344,9 +346,10 @@ def collect_agg(self, *, depth: int) -> AggInfo: class Literal(Expr): __slots__ = ("value",) _non_child = ("dtype", "value") - value: pa.Scalar + value: pa.Scalar[Any] + children: tuple[()] - def __init__(self, dtype: plc.DataType, value: pa.Scalar) -> None: + def __init__(self, dtype: plc.DataType, value: pa.Scalar[Any]) -> None: super().__init__(dtype) assert value.type == plc.interop.to_arrow(dtype) self.value = value @@ -367,6 +370,7 @@ class Col(Expr): __slots__ = ("name",) _non_child = ("dtype", "name") name: str + children: tuple[()] def __init__(self, dtype: plc.DataType, name: str) -> None: self.dtype = dtype @@ -388,6 +392,8 @@ def collect_agg(self, *, depth: int) -> AggInfo: class Len(Expr): + children: tuple[()] + def do_evaluate( self, df: DataFrame, @@ -410,8 +416,15 @@ def collect_agg(self, *, depth: int) -> AggInfo: class BooleanFunction(Expr): __slots__ = ("name", "options", "children") _non_child = ("dtype", "name", "options") + children: tuple[Expr, ...] - def __init__(self, dtype: plc.DataType, name: str, options: tuple, *children: Expr): + def __init__( + self, + dtype: plc.DataType, + name: pl_expr.BooleanFunction, + options: tuple[Any, ...], + *children: Expr, + ) -> None: super().__init__(dtype) self.options = options self.name = name @@ -610,14 +623,15 @@ def do_evaluate( class StringFunction(Expr): __slots__ = ("name", "options", "children") _non_child = ("dtype", "name", "options") + children: tuple[Expr, ...] def __init__( self, dtype: plc.DataType, name: pl_expr.StringFunction, - options: tuple, + options: tuple[Any, ...], *children: Expr, - ): + ) -> None: super().__init__(dtype) self.options = options self.name = name @@ -661,10 +675,11 @@ def do_evaluate( class Sort(Expr): __slots__ = ("options", "children") _non_child = ("dtype", "options") + children: tuple[Expr] def __init__( self, dtype: plc.DataType, options: tuple[bool, bool, bool], column: Expr - ): + ) -> None: super().__init__(dtype) self.options = options self.children = (column,) @@ -696,6 +711,7 @@ def do_evaluate( class SortBy(Expr): __slots__ = ("options", "children") _non_child = ("dtype", "options") + children: tuple[Expr, ...] 
def __init__( self, @@ -703,7 +719,7 @@ def __init__( options: tuple[bool, tuple[bool], tuple[bool]], column: Expr, *by: Expr, - ): + ) -> None: super().__init__(dtype) self.options = options self.children = (column, *by) @@ -734,8 +750,9 @@ def do_evaluate( class Gather(Expr): __slots__ = ("children",) _non_child = ("dtype",) + children: tuple[Expr, Expr] - def __init__(self, dtype: plc.DataType, values: Expr, indices: Expr): + def __init__(self, dtype: plc.DataType, values: Expr, indices: Expr) -> None: super().__init__(dtype) self.children = (values, indices) @@ -775,6 +792,7 @@ def do_evaluate( class Filter(Expr): __slots__ = ("children",) _non_child = ("dtype",) + children: tuple[Expr, Expr] def __init__(self, dtype: plc.DataType, values: Expr, indices: Expr): super().__init__(dtype) @@ -801,8 +819,9 @@ def do_evaluate( class RollingWindow(Expr): __slots__ = ("options", "children") _non_child = ("dtype", "options") + children: tuple[Expr] - def __init__(self, dtype: plc.DataType, options: Any, agg: Expr): + def __init__(self, dtype: plc.DataType, options: Any, agg: Expr) -> None: super().__init__(dtype) self.options = options self.children = (agg,) @@ -811,8 +830,9 @@ def __init__(self, dtype: plc.DataType, options: Any, agg: Expr): class GroupedRollingWindow(Expr): __slots__ = ("options", "children") _non_child = ("dtype", "options") + children: tuple[Expr, ...] - def __init__(self, dtype: plc.DataType, options: Any, agg: Expr, *by: Expr): + def __init__(self, dtype: plc.DataType, options: Any, agg: Expr, *by: Expr) -> None: super().__init__(dtype) self.options = options self.children = (agg, *by) @@ -821,8 +841,9 @@ def __init__(self, dtype: plc.DataType, options: Any, agg: Expr, *by: Expr): class Cast(Expr): __slots__ = ("children",) _non_child = ("dtype",) + children: tuple[Expr] - def __init__(self, dtype: plc.DataType, value: Expr): + def __init__(self, dtype: plc.DataType, value: Expr) -> None: super().__init__(dtype) self.children = (value,) @@ -848,6 +869,7 @@ def collect_agg(self, *, depth: int) -> AggInfo: class Agg(Expr): __slots__ = ("name", "options", "op", "request", "children") _non_child = ("dtype", "name", "options") + children: tuple[Expr] def __init__( self, dtype: plc.DataType, name: str, options: Any, value: Expr @@ -1007,7 +1029,7 @@ def _last(self, column: Column) -> Column: def do_evaluate( self, - df, + df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME, mapping: Mapping[Expr, Column] | None = None, @@ -1022,6 +1044,7 @@ def do_evaluate( class BinOp(Expr): __slots__ = ("op", "children") _non_child = ("dtype", "op") + children: tuple[Expr, Expr] def __init__( self, diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index 0a72cbd9f83..665bbe5be41 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -1,7 +1,5 @@ # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. # SPDX-License-Identifier: Apache-2.0 -# TODO: remove need for this -# ruff: noqa: D101 """ DSL nodes for the LogicalPlan of polars. 
@@ -15,11 +13,11 @@ from __future__ import annotations +import dataclasses import itertools import types -from dataclasses import dataclass from functools import cache -from typing import TYPE_CHECKING, Any, Callable, ClassVar +from typing import TYPE_CHECKING, Any, Callable, ClassVar, NoReturn import pyarrow as pa from typing_extensions import assert_never @@ -34,8 +32,11 @@ from cudf_polars.utils import sorting if TYPE_CHECKING: + from collections.abc import MutableMapping from typing import Literal + from cudf_polars.typing import Schema + __all__ = [ "IR", @@ -91,14 +92,14 @@ def broadcast( ] -@dataclass(slots=True) +@dataclasses.dataclass(slots=True) class IR: """Abstract plan node, representing an unevaluated dataframe.""" - schema: dict[str, plc.DataType] + schema: Schema """Mapping from column names to their data types.""" - def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: + def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """ Evaluate the node and return a dataframe. @@ -123,7 +124,7 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: raise NotImplementedError -@dataclass(slots=True) +@dataclasses.dataclass(slots=True) class PythonScan(IR): """Representation of input from a python function.""" @@ -133,7 +134,7 @@ class PythonScan(IR): """Filter to apply to the constructed dataframe before returning it.""" -@dataclass(slots=True) +@dataclasses.dataclass(slots=True) class Scan(IR): """Input from files.""" @@ -153,14 +154,14 @@ class Scan(IR): predicate: expr.NamedExpr | None """Mask to apply to the read dataframe.""" - def __post_init__(self): + def __post_init__(self) -> None: """Validate preconditions.""" if self.file_options.n_rows is not None: raise NotImplementedError("row limit in scan") if self.typ not in ("csv", "parquet"): raise NotImplementedError(f"Unhandled scan type: {self.typ}") - def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: + def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" options = self.file_options with_columns = options.with_columns @@ -172,9 +173,9 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: ) ) elif self.typ == "parquet": - df = DataFrame.from_cudf( - cudf.read_parquet(self.paths, columns=with_columns) - ) + cdf = cudf.read_parquet(self.paths, columns=with_columns) + assert isinstance(cdf, cudf.DataFrame) + df = DataFrame.from_cudf(cdf) else: assert_never(self.typ) if row_index is not None: @@ -208,7 +209,7 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: return df.filter(mask) -@dataclass(slots=True) +@dataclasses.dataclass(slots=True) class Cache(IR): """ Return a cached plan node. @@ -221,7 +222,7 @@ class Cache(IR): value: IR """The unevaluated node to cache.""" - def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: + def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" try: return cache[self.key] @@ -229,7 +230,7 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: return cache.setdefault(self.key, self.value.evaluate(cache=cache)) -@dataclass(slots=True) +@dataclasses.dataclass(slots=True) class DataFrameScan(IR): """ Input from an existing polars DataFrame. 
@@ -244,7 +245,7 @@ class DataFrameScan(IR): predicate: expr.NamedExpr | None """Mask to apply.""" - def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: + def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" pdf = pl.DataFrame._from_pydf(self.df) if self.projection is not None: @@ -270,7 +271,7 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: return df -@dataclass(slots=True) +@dataclasses.dataclass(slots=True) class Select(IR): """Produce a new dataframe selecting given expressions from an input.""" @@ -279,7 +280,7 @@ class Select(IR): expr: list[expr.NamedExpr] """List of expressions to evaluate to form the new dataframe.""" - def evaluate(self, *, cache: dict[int, DataFrame]): + def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" df = self.df.evaluate(cache=cache) # Handle any broadcasting @@ -287,7 +288,7 @@ def evaluate(self, *, cache: dict[int, DataFrame]): return DataFrame(columns) -@dataclass(slots=True) +@dataclasses.dataclass(slots=True) class Reduce(IR): """ Produce a new dataframe selecting given expressions from an input. @@ -300,7 +301,7 @@ class Reduce(IR): expr: list[expr.NamedExpr] """List of expressions to evaluate to form the new dataframe.""" - def evaluate(self, *, cache: dict[int, DataFrame]): + def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" df = self.df.evaluate(cache=cache) columns = broadcast(*(e.evaluate(df) for e in self.expr)) @@ -308,7 +309,7 @@ def evaluate(self, *, cache: dict[int, DataFrame]): return DataFrame(columns) -def placeholder_column(n: int): +def placeholder_column(n: int) -> plc.Column: """ Produce a placeholder pylibcudf column with NO BACKING DATA. 
@@ -338,7 +339,7 @@ def placeholder_column(n: int): ) -@dataclass(slots=False) +@dataclasses.dataclass(slots=False) class GroupBy(IR): """Perform a groupby.""" @@ -352,6 +353,7 @@ class GroupBy(IR): """Should the order of the input dataframe be maintained?""" options: Any """Options controlling style of groupby.""" + agg_infos: list[expr.AggInfo] = dataclasses.field(init=False) @staticmethod def check_agg(agg: expr.Expr) -> int: @@ -383,7 +385,7 @@ def check_agg(agg: expr.Expr) -> int: else: raise NotImplementedError(f"No handler for {agg=}") - def __post_init__(self): + def __post_init__(self) -> None: """Check whether all the aggregations are implemented.""" if self.options.rolling is None and self.maintain_order: raise NotImplementedError("Maintaining order in groupby") @@ -393,7 +395,7 @@ def __post_init__(self): raise NotImplementedError("Nested aggregations in groupby") self.agg_infos = [req.collect_agg(depth=0) for req in self.agg_requests] - def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: + def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" df = self.df.evaluate(cache=cache) keys = broadcast( @@ -438,7 +440,7 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: return DataFrame([*result_keys, *results]).slice(self.options.slice) -@dataclass(slots=True) +@dataclasses.dataclass(slots=True) class Join(IR): """A join of two dataframes.""" @@ -466,7 +468,7 @@ class Join(IR): - coalesce: should key columns be coalesced (only makes sense for outer joins) """ - def __post_init__(self): + def __post_init__(self) -> None: """Validate preconditions.""" if self.options[0] == "cross": raise NotImplementedError("cross join not implemented") @@ -511,7 +513,7 @@ def _joiners( else: assert_never(how) - def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: + def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" left = self.left.evaluate(cache=cache) right = self.right.evaluate(cache=cache) @@ -577,7 +579,7 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: return result.slice(zlice) -@dataclass(slots=True) +@dataclasses.dataclass(slots=True) class HStack(IR): """Add new columns to a dataframe.""" @@ -586,7 +588,7 @@ class HStack(IR): columns: list[expr.NamedExpr] """List of expressions to produce new columns.""" - def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: + def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" df = self.df.evaluate(cache=cache) columns = [c.evaluate(df) for c in self.columns] @@ -597,7 +599,7 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: return df.with_columns(columns) -@dataclass(slots=True) +@dataclasses.dataclass(slots=True) class Distinct(IR): """Produce a new dataframe with distinct rows.""" @@ -619,7 +621,7 @@ class Distinct(IR): "any": plc.stream_compaction.DuplicateKeepOption.KEEP_ANY, } - def __init__(self, schema: dict, df: IR, options: Any): + def __init__(self, schema: Schema, df: IR, options: Any) -> None: self.schema = schema self.df = df (keep, subset, maintain_order, zlice) = options @@ -628,7 +630,7 @@ def __init__(self, schema: dict, df: IR, options: Any): self.stable = maintain_order self.zlice = zlice - def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: + def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" 
df = self.df.evaluate(cache=cache) if self.subset is None: @@ -667,7 +669,7 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: return result.slice(self.zlice) -@dataclass(slots=True) +@dataclasses.dataclass(slots=True) class Sort(IR): """Sort a dataframe.""" @@ -686,12 +688,12 @@ class Sort(IR): def __init__( self, - schema: dict, + schema: Schema, df: IR, by: list[expr.NamedExpr], options: Any, zlice: tuple[int, int] | None, - ): + ) -> None: self.schema = schema self.df = df self.by = by @@ -704,7 +706,7 @@ def __init__( plc.sorting.stable_sort_by_key if stable else plc.sorting.sort_by_key ) - def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: + def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" df = self.df.evaluate(cache=cache) sort_keys = broadcast( @@ -736,7 +738,7 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: return DataFrame(columns).slice(self.zlice) -@dataclass(slots=True) +@dataclasses.dataclass(slots=True) class Slice(IR): """Slice a dataframe.""" @@ -747,13 +749,13 @@ class Slice(IR): length: int """Length of the slice.""" - def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: + def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" df = self.df.evaluate(cache=cache) return df.slice((self.offset, self.length)) -@dataclass(slots=True) +@dataclasses.dataclass(slots=True) class Filter(IR): """Filter a dataframe with a boolean mask.""" @@ -762,21 +764,21 @@ class Filter(IR): mask: expr.NamedExpr """Expression evaluating to a mask.""" - def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: + def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" df = self.df.evaluate(cache=cache) (mask,) = broadcast(self.mask.evaluate(df), target_length=df.num_rows) return df.filter(mask) -@dataclass(slots=True) +@dataclasses.dataclass(slots=True) class Projection(IR): """Select a subset of columns from a dataframe.""" df: IR """Input.""" - def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: + def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" df = self.df.evaluate(cache=cache) # This can reorder things. 
@@ -786,7 +788,7 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: return DataFrame(columns) -@dataclass(slots=True) +@dataclasses.dataclass(slots=True) class MapFunction(IR): """Apply some function to a dataframe.""" @@ -807,7 +809,7 @@ class MapFunction(IR): ] ) - def __post_init__(self): + def __post_init__(self) -> None: """Validate preconditions.""" if self.name not in MapFunction._NAMES: raise NotImplementedError(f"Unhandled map function {self.name}") @@ -824,7 +826,7 @@ def __post_init__(self): if key_column not in self.df.dfs[0].schema: raise ValueError(f"Key column {key_column} not found") - def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: + def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" if self.name == "merge_sorted": # merge_sorted operates on Union inputs @@ -876,7 +878,7 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: raise AssertionError("Should never be reached") -@dataclass(slots=True) +@dataclasses.dataclass(slots=True) class Union(IR): """Concatenate dataframes vertically.""" @@ -885,13 +887,13 @@ class Union(IR): zlice: tuple[int, int] | None """Optional slice to apply after concatenation.""" - def __post_init__(self): + def __post_init__(self) -> None: """Validated preconditions.""" schema = self.dfs[0].schema if not all(s.schema == schema for s in self.dfs[1:]): raise ValueError("Schema mismatch") - def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: + def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" # TODO: only evaluate what we need if we have a slice dfs = [df.evaluate(cache=cache) for df in self.dfs] @@ -900,14 +902,14 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: ).slice(self.zlice) -@dataclass(slots=True) +@dataclasses.dataclass(slots=True) class HConcat(IR): """Concatenate dataframes horizontally.""" dfs: list[IR] """List of inputs.""" - def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: + def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" dfs = [df.evaluate(cache=cache) for df in self.dfs] return DataFrame( @@ -915,7 +917,7 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: ) -@dataclass(slots=True) +@dataclasses.dataclass(slots=True) class ExtContext(IR): """ Concatenate dataframes horizontally. @@ -928,7 +930,7 @@ class ExtContext(IR): extra: list[IR] """List of extra inputs.""" - def __post_init__(self): + def __post_init__(self) -> NoReturn: """Validate preconditions.""" raise NotImplementedError( "ExtContext will be deprecated, use horizontal concat instead." diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py index 641176daff4..38107023365 100644 --- a/python/cudf_polars/cudf_polars/dsl/translate.py +++ b/python/cudf_polars/cudf_polars/dsl/translate.py @@ -16,12 +16,13 @@ import cudf._lib.pylibcudf as plc from cudf_polars.dsl import expr, ir +from cudf_polars.typing import NodeTraverser from cudf_polars.utils import dtypes __all__ = ["translate_ir", "translate_named_expr"] -class set_node(AbstractContextManager): +class set_node(AbstractContextManager[None]): """ Run a block with current node set in the visitor. 
@@ -39,30 +40,36 @@ class set_node(AbstractContextManager): """ __slots__ = ("n", "visitor") + visitor: NodeTraverser + n: int - def __init__(self, visitor, n: int): + def __init__(self, visitor: NodeTraverser, n: int) -> None: self.visitor = visitor self.n = n - def __enter__(self): + def __enter__(self) -> None: n = self.visitor.get_node() self.visitor.set_node(self.n) self.n = n - def __exit__(self, *args): + def __exit__(self, *args: Any) -> None: self.visitor.set_node(self.n) -noop_context: nullcontext = nullcontext() +noop_context: nullcontext[None] = nullcontext() @singledispatch -def _translate_ir(node: Any, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: +def _translate_ir( + node: Any, visitor: NodeTraverser, schema: dict[str, plc.DataType] +) -> ir.IR: raise NotImplementedError(f"Translation for {type(node).__name__}") @_translate_ir.register -def _(node: pl_ir.PythonScan, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: +def _( + node: pl_ir.PythonScan, visitor: NodeTraverser, schema: dict[str, plc.DataType] +) -> ir.IR: return ir.PythonScan( schema, node.options, @@ -73,7 +80,9 @@ def _(node: pl_ir.PythonScan, visitor: Any, schema: dict[str, plc.DataType]) -> @_translate_ir.register -def _(node: pl_ir.Scan, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: +def _( + node: pl_ir.Scan, visitor: NodeTraverser, schema: dict[str, plc.DataType] +) -> ir.IR: return ir.Scan( schema, node.scan_type, @@ -86,13 +95,15 @@ def _(node: pl_ir.Scan, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: @_translate_ir.register -def _(node: pl_ir.Cache, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: +def _( + node: pl_ir.Cache, visitor: NodeTraverser, schema: dict[str, plc.DataType] +) -> ir.IR: return ir.Cache(schema, node.id_, translate_ir(visitor, n=node.input)) @_translate_ir.register def _( - node: pl_ir.DataFrameScan, visitor: Any, schema: dict[str, plc.DataType] + node: pl_ir.DataFrameScan, visitor: NodeTraverser, schema: dict[str, plc.DataType] ) -> ir.IR: return ir.DataFrameScan( schema, @@ -105,7 +116,9 @@ def _( @_translate_ir.register -def _(node: pl_ir.Select, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: +def _( + node: pl_ir.Select, visitor: NodeTraverser, schema: dict[str, plc.DataType] +) -> ir.IR: with set_node(visitor, node.input): inp = translate_ir(visitor, n=None) exprs = [translate_named_expr(visitor, n=e) for e in node.expr] @@ -113,7 +126,9 @@ def _(node: pl_ir.Select, visitor: Any, schema: dict[str, plc.DataType]) -> ir.I @_translate_ir.register -def _(node: pl_ir.GroupBy, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: +def _( + node: pl_ir.GroupBy, visitor: NodeTraverser, schema: dict[str, plc.DataType] +) -> ir.IR: with set_node(visitor, node.input): inp = translate_ir(visitor, n=None) aggs = [translate_named_expr(visitor, n=e) for e in node.aggs] @@ -129,7 +144,9 @@ def _(node: pl_ir.GroupBy, visitor: Any, schema: dict[str, plc.DataType]) -> ir. @_translate_ir.register -def _(node: pl_ir.Join, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: +def _( + node: pl_ir.Join, visitor: NodeTraverser, schema: dict[str, plc.DataType] +) -> ir.IR: # Join key dtypes are dependent on the schema of the left and # right inputs, so these must be translated with the relevant # input active. 
@@ -143,7 +160,9 @@ def _(node: pl_ir.Join, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: @_translate_ir.register -def _(node: pl_ir.HStack, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: +def _( + node: pl_ir.HStack, visitor: NodeTraverser, schema: dict[str, plc.DataType] +) -> ir.IR: with set_node(visitor, node.input): inp = translate_ir(visitor, n=None) exprs = [translate_named_expr(visitor, n=e) for e in node.exprs] @@ -151,7 +170,9 @@ def _(node: pl_ir.HStack, visitor: Any, schema: dict[str, plc.DataType]) -> ir.I @_translate_ir.register -def _(node: pl_ir.Reduce, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: +def _( + node: pl_ir.Reduce, visitor: NodeTraverser, schema: dict[str, plc.DataType] +) -> ir.IR: with set_node(visitor, node.input): inp = translate_ir(visitor, n=None) exprs = [translate_named_expr(visitor, n=e) for e in node.expr] @@ -159,7 +180,9 @@ def _(node: pl_ir.Reduce, visitor: Any, schema: dict[str, plc.DataType]) -> ir.I @_translate_ir.register -def _(node: pl_ir.Distinct, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: +def _( + node: pl_ir.Distinct, visitor: NodeTraverser, schema: dict[str, plc.DataType] +) -> ir.IR: return ir.Distinct( schema, translate_ir(visitor, n=node.input), @@ -168,7 +191,9 @@ def _(node: pl_ir.Distinct, visitor: Any, schema: dict[str, plc.DataType]) -> ir @_translate_ir.register -def _(node: pl_ir.Sort, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: +def _( + node: pl_ir.Sort, visitor: NodeTraverser, schema: dict[str, plc.DataType] +) -> ir.IR: with set_node(visitor, node.input): inp = translate_ir(visitor, n=None) by = [translate_named_expr(visitor, n=e) for e in node.by_column] @@ -176,12 +201,16 @@ def _(node: pl_ir.Sort, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: @_translate_ir.register -def _(node: pl_ir.Slice, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: +def _( + node: pl_ir.Slice, visitor: NodeTraverser, schema: dict[str, plc.DataType] +) -> ir.IR: return ir.Slice(schema, translate_ir(visitor, n=node.input), node.offset, node.len) @_translate_ir.register -def _(node: pl_ir.Filter, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: +def _( + node: pl_ir.Filter, visitor: NodeTraverser, schema: dict[str, plc.DataType] +) -> ir.IR: with set_node(visitor, node.input): inp = translate_ir(visitor, n=None) mask = translate_named_expr(visitor, n=node.predicate) @@ -190,13 +219,17 @@ def _(node: pl_ir.Filter, visitor: Any, schema: dict[str, plc.DataType]) -> ir.I @_translate_ir.register def _( - node: pl_ir.SimpleProjection, visitor: Any, schema: dict[str, plc.DataType] + node: pl_ir.SimpleProjection, + visitor: NodeTraverser, + schema: dict[str, plc.DataType], ) -> ir.IR: return ir.Projection(schema, translate_ir(visitor, n=node.input)) @_translate_ir.register -def _(node: pl_ir.MapFunction, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: +def _( + node: pl_ir.MapFunction, visitor: NodeTraverser, schema: dict[str, plc.DataType] +) -> ir.IR: name, *options = node.function return ir.MapFunction( schema, @@ -208,19 +241,25 @@ def _(node: pl_ir.MapFunction, visitor: Any, schema: dict[str, plc.DataType]) -> @_translate_ir.register -def _(node: pl_ir.Union, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: +def _( + node: pl_ir.Union, visitor: NodeTraverser, schema: dict[str, plc.DataType] +) -> ir.IR: return ir.Union( schema, [translate_ir(visitor, n=n) for n in node.inputs], node.options ) @_translate_ir.register -def _(node: pl_ir.HConcat, 
visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: +def _( + node: pl_ir.HConcat, visitor: NodeTraverser, schema: dict[str, plc.DataType] +) -> ir.IR: return ir.HConcat(schema, [translate_ir(visitor, n=n) for n in node.inputs]) @_translate_ir.register -def _(node: pl_ir.ExtContext, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: +def _( + node: pl_ir.ExtContext, visitor: NodeTraverser, schema: dict[str, plc.DataType] +) -> ir.IR: return ir.ExtContext( schema, translate_ir(visitor, n=node.input), @@ -228,7 +267,7 @@ def _(node: pl_ir.ExtContext, visitor: Any, schema: dict[str, plc.DataType]) -> ) -def translate_ir(visitor: Any, *, n: int | None = None) -> ir.IR: +def translate_ir(visitor: NodeTraverser, *, n: int | None = None) -> ir.IR: """ Translate a polars-internal IR node to our representation. @@ -249,7 +288,7 @@ def translate_ir(visitor: Any, *, n: int | None = None) -> ir.IR: NotImplementedError If we can't translate the nodes due to unsupported functionality. """ - ctx: AbstractContextManager = ( + ctx: AbstractContextManager[None] = ( set_node(visitor, n) if n is not None else noop_context ) with ctx: @@ -258,7 +297,9 @@ def translate_ir(visitor: Any, *, n: int | None = None) -> ir.IR: return _translate_ir(node, visitor, schema) -def translate_named_expr(visitor: Any, *, n: pl_expr.PyExprIR) -> expr.NamedExpr: +def translate_named_expr( + visitor: NodeTraverser, *, n: pl_expr.PyExprIR +) -> expr.NamedExpr: """ Translate a polars-internal named expression IR object into our representation. @@ -289,12 +330,14 @@ def translate_named_expr(visitor: Any, *, n: pl_expr.PyExprIR) -> expr.NamedExpr @singledispatch -def _translate_expr(node: Any, visitor: Any, dtype: plc.DataType) -> expr.Expr: +def _translate_expr( + node: Any, visitor: NodeTraverser, dtype: plc.DataType +) -> expr.Expr: raise NotImplementedError(f"Translation for {type(node).__name__}") @_translate_expr.register -def _(node: pl_expr.Function, visitor: Any, dtype: plc.DataType) -> expr.Expr: +def _(node: pl_expr.Function, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr: name, *options = node.function_data options = tuple(options) if isinstance(name, pl_expr.StringFunction): @@ -316,7 +359,7 @@ def _(node: pl_expr.Function, visitor: Any, dtype: plc.DataType) -> expr.Expr: @_translate_expr.register -def _(node: pl_expr.Window, visitor: Any, dtype: plc.DataType) -> expr.Expr: +def _(node: pl_expr.Window, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr: # TODO: raise in groupby? 
if node.partition_by is None: return expr.RollingWindow( @@ -332,19 +375,19 @@ def _(node: pl_expr.Window, visitor: Any, dtype: plc.DataType) -> expr.Expr: @_translate_expr.register -def _(node: pl_expr.Literal, visitor: Any, dtype: plc.DataType) -> expr.Expr: +def _(node: pl_expr.Literal, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr: value = pa.scalar(node.value, type=plc.interop.to_arrow(dtype)) return expr.Literal(dtype, value) @_translate_expr.register -def _(node: pl_expr.Sort, visitor: Any, dtype: plc.DataType) -> expr.Expr: +def _(node: pl_expr.Sort, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr: # TODO: raise in groupby return expr.Sort(dtype, node.options, translate_expr(visitor, n=node.expr)) @_translate_expr.register -def _(node: pl_expr.SortBy, visitor: Any, dtype: plc.DataType) -> expr.Expr: +def _(node: pl_expr.SortBy, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr: return expr.SortBy( dtype, node.sort_options, @@ -354,7 +397,7 @@ def _(node: pl_expr.SortBy, visitor: Any, dtype: plc.DataType) -> expr.Expr: @_translate_expr.register -def _(node: pl_expr.Gather, visitor: Any, dtype: plc.DataType) -> expr.Expr: +def _(node: pl_expr.Gather, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr: return expr.Gather( dtype, translate_expr(visitor, n=node.expr), @@ -363,7 +406,7 @@ def _(node: pl_expr.Gather, visitor: Any, dtype: plc.DataType) -> expr.Expr: @_translate_expr.register -def _(node: pl_expr.Filter, visitor: Any, dtype: plc.DataType) -> expr.Expr: +def _(node: pl_expr.Filter, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr: return expr.Filter( dtype, translate_expr(visitor, n=node.input), @@ -372,7 +415,7 @@ def _(node: pl_expr.Filter, visitor: Any, dtype: plc.DataType) -> expr.Expr: @_translate_expr.register -def _(node: pl_expr.Cast, visitor: Any, dtype: plc.DataType) -> expr.Expr: +def _(node: pl_expr.Cast, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr: inner = translate_expr(visitor, n=node.expr) # Push casts into literals so we can handle Cast(Literal(Null)) if isinstance(inner, expr.Literal): @@ -382,12 +425,12 @@ def _(node: pl_expr.Cast, visitor: Any, dtype: plc.DataType) -> expr.Expr: @_translate_expr.register -def _(node: pl_expr.Column, visitor: Any, dtype: plc.DataType) -> expr.Expr: +def _(node: pl_expr.Column, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr: return expr.Col(dtype, node.name) @_translate_expr.register -def _(node: pl_expr.Agg, visitor: Any, dtype: plc.DataType) -> expr.Expr: +def _(node: pl_expr.Agg, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr: return expr.Agg( dtype, node.name, @@ -397,7 +440,9 @@ def _(node: pl_expr.Agg, visitor: Any, dtype: plc.DataType) -> expr.Expr: @_translate_expr.register -def _(node: pl_expr.BinaryExpr, visitor: Any, dtype: plc.DataType) -> expr.Expr: +def _( + node: pl_expr.BinaryExpr, visitor: NodeTraverser, dtype: plc.DataType +) -> expr.Expr: return expr.BinOp( dtype, expr.BinOp._MAPPING[node.op], @@ -407,11 +452,11 @@ def _(node: pl_expr.BinaryExpr, visitor: Any, dtype: plc.DataType) -> expr.Expr: @_translate_expr.register -def _(node: pl_expr.Len, visitor: Any, dtype: plc.DataType) -> expr.Expr: +def _(node: pl_expr.Len, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr: return expr.Len(dtype) -def translate_expr(visitor: Any, *, n: int) -> expr.Expr: +def translate_expr(visitor: NodeTraverser, *, n: int) -> expr.Expr: """ Translate a polars-internal expression IR into our representation. 
diff --git a/python/cudf_polars/cudf_polars/py.typed b/python/cudf_polars/cudf_polars/py.typed new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/cudf_polars/cudf_polars/testing/asserts.py b/python/cudf_polars/cudf_polars/testing/asserts.py index 2fbfa971fef..2f19b41cc3a 100644 --- a/python/cudf_polars/cudf_polars/testing/asserts.py +++ b/python/cudf_polars/cudf_polars/testing/asserts.py @@ -28,7 +28,7 @@ def assert_gpu_result_equal( rtol: float = 1e-05, atol: float = 1e-08, categorical_as_str: bool = False, -): +) -> None: """ Assert that collection of a lazyframe on GPU produces correct results. diff --git a/python/cudf_polars/cudf_polars/typing/__init__.py b/python/cudf_polars/cudf_polars/typing/__init__.py new file mode 100644 index 00000000000..287c977f4eb --- /dev/null +++ b/python/cudf_polars/cudf_polars/typing/__init__.py @@ -0,0 +1,91 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 + +"""Typing utilities for cudf_polars.""" + +from __future__ import annotations + +from collections.abc import Mapping +from typing import TYPE_CHECKING, Protocol, TypeAlias + +from polars.polars import _expr_nodes as pl_expr, _ir_nodes as pl_ir + +import cudf._lib.pylibcudf as plc + +if TYPE_CHECKING: + from typing import Callable + + import polars as pl + +IR: TypeAlias = ( + pl_ir.PythonScan + | pl_ir.Scan + | pl_ir.Cache + | pl_ir.DataFrameScan + | pl_ir.Select + | pl_ir.GroupBy + | pl_ir.Join + | pl_ir.HStack + | pl_ir.Distinct + | pl_ir.Sort + | pl_ir.Slice + | pl_ir.Filter + | pl_ir.SimpleProjection + | pl_ir.MapFunction + | pl_ir.Union + | pl_ir.HConcat + | pl_ir.ExtContext +) + +Expr: TypeAlias = ( + pl_expr.Function + | pl_expr.Window + | pl_expr.Literal + | pl_expr.Sort + | pl_expr.SortBy + | pl_expr.Gather + | pl_expr.Filter + | pl_expr.Cast + | pl_expr.Column + | pl_expr.Agg + | pl_expr.BinaryExpr + | pl_expr.Len + | pl_expr.PyExprIR +) + +Schema: TypeAlias = Mapping[str, plc.DataType] + + +class NodeTraverser(Protocol): + """Abstract protocol for polars NodeTraverser.""" + + def get_node(self) -> int: + """Return current plan node id.""" + ... + + def set_node(self, n: int) -> None: + """Set the current plan node to n.""" + ... + + def view_current_node(self) -> IR: + """Convert current plan node to python rep.""" + ... + + def get_schema(self) -> Mapping[str, pl.DataType]: + """Get the schema of the current plan node.""" + ... + + def get_dtype(self, n: int) -> pl.DataType: + """Get the datatype of the given expression id.""" + ... + + def view_expression(self, n: int) -> Expr: + """Convert the given expression to python rep.""" + ... + + def set_udf( + self, + callback: Callable[[list[str] | None, str | None, int | None], pl.DataFrame], + ) -> None: + """Set the callback replacing the current node in the plan.""" + ... 
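[Editor's note: because `NodeTraverser` above is a `typing.Protocol`, polars' Rust-backed visitor satisfies it structurally, with no inheritance. The helper below is a hypothetical sketch, not part of the patch, mirroring how `translate_ir` consumes the protocol; it assumes the `from_polars` converter in `cudf_polars.utils.dtypes`:

    import cudf._lib.pylibcudf as plc

    from cudf_polars.typing import NodeTraverser
    from cudf_polars.utils import dtypes

    def current_schema(visitor: NodeTraverser) -> dict[str, plc.DataType]:
        # Any object with a matching get_schema() type-checks here; the
        # polars visitor never needs to import or subclass NodeTraverser.
        return {k: dtypes.from_polars(v) for k, v in visitor.get_schema().items()}
]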
diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml index e50ee76a9b9..2faf8c3193f 100644 --- a/python/cudf_polars/pyproject.toml +++ b/python/cudf_polars/pyproject.toml @@ -62,8 +62,6 @@ target-version = "py39" fix = true [tool.ruff.lint] -# __init__.py must re-export everything it imports -ignore-init-module-imports = false select = [ "E", # pycodestyle "W", # pycodestyle From 5f45803b2a68b49d330d94e2f701791a7590612a Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Thu, 6 Jun 2024 13:00:12 -0700 Subject: [PATCH 056/340] Migrate quantile.pxd to pylibcudf (#15874) xref #15162 Migrate quantile.pxd to use pylibcudf APIs. Authors: - Thomas Li (https://github.com/lithomas1) Approvers: - Lawrence Mitchell (https://github.com/wence-) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/15874 --- cpp/src/quantiles/quantiles.cu | 4 +- cpp/tests/quantiles/quantiles_test.cpp | 9 +- .../user_guide/api_docs/pylibcudf/index.rst | 1 + .../api_docs/pylibcudf/quantiles.rst | 6 + .../cudf/cudf/_lib/pylibcudf/CMakeLists.txt | 1 + python/cudf/cudf/_lib/pylibcudf/__init__.pxd | 2 + python/cudf/cudf/_lib/pylibcudf/__init__.py | 2 + python/cudf/cudf/_lib/pylibcudf/quantiles.pxd | 25 ++ python/cudf/cudf/_lib/pylibcudf/quantiles.pyx | 152 ++++++++++++ python/cudf/cudf/_lib/quantiles.pyx | 102 ++------ python/cudf/cudf/pylibcudf_tests/conftest.py | 29 +++ .../cudf/pylibcudf_tests/test_quantiles.py | 234 ++++++++++++++++++ 12 files changed, 486 insertions(+), 81 deletions(-) create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/quantiles.rst create mode 100644 python/cudf/cudf/_lib/pylibcudf/quantiles.pxd create mode 100644 python/cudf/cudf/_lib/pylibcudf/quantiles.pyx create mode 100644 python/cudf/cudf/pylibcudf_tests/test_quantiles.py diff --git a/cpp/src/quantiles/quantiles.cu b/cpp/src/quantiles/quantiles.cu index c0f536536ce..af3bda2e62e 100644 --- a/cpp/src/quantiles/quantiles.cu +++ b/cpp/src/quantiles/quantiles.cu @@ -34,6 +34,7 @@ #include #include +#include #include namespace cudf { @@ -78,7 +79,8 @@ std::unique_ptr
quantiles(table_view const& input, CUDF_EXPECTS(interp == interpolation::HIGHER || interp == interpolation::LOWER || interp == interpolation::NEAREST, - "multi-column quantiles require a non-arithmetic interpolation strategy."); + "multi-column quantiles require a non-arithmetic interpolation strategy.", + std::invalid_argument); CUDF_EXPECTS(input.num_rows() > 0, "multi-column quantiles require at least one input row."); diff --git a/cpp/tests/quantiles/quantiles_test.cpp b/cpp/tests/quantiles/quantiles_test.cpp index 5b7b6dd2718..b7faa20e8c1 100644 --- a/cpp/tests/quantiles/quantiles_test.cpp +++ b/cpp/tests/quantiles/quantiles_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -25,6 +25,8 @@ #include #include +#include + template struct QuantilesTest : public cudf::test::BaseFixture {}; @@ -104,9 +106,10 @@ TYPED_TEST(QuantilesTest, TestMultiColumnArithmeticInterpolation) cudf::test::fixed_width_column_wrapper input_b({}); auto input = cudf::table_view({input_a}); - EXPECT_THROW(cudf::quantiles(input, {0.0f}, cudf::interpolation::LINEAR), cudf::logic_error); + EXPECT_THROW(cudf::quantiles(input, {0.0f}, cudf::interpolation::LINEAR), std::invalid_argument); - EXPECT_THROW(cudf::quantiles(input, {0.0f}, cudf::interpolation::MIDPOINT), cudf::logic_error); + EXPECT_THROW(cudf::quantiles(input, {0.0f}, cudf::interpolation::MIDPOINT), + std::invalid_argument); } TYPED_TEST(QuantilesTest, TestMultiColumnUnsorted) diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst index 870ed8856d1..1e03fa80bb5 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst @@ -21,6 +21,7 @@ This page provides API documentation for pylibcudf. join lists merge + quantiles reduce reshape rolling diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/quantiles.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/quantiles.rst new file mode 100644 index 00000000000..3417c1ff59d --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/quantiles.rst @@ -0,0 +1,6 @@ +========= +quantiles +========= + +.. automodule:: cudf._lib.pylibcudf.quantiles + :members: diff --git a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt index 6beb7b0f506..ed396208f98 100644 --- a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt +++ b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt @@ -26,6 +26,7 @@ set(cython_sources join.pyx lists.pyx merge.pyx + quantiles.pyx reduce.pyx replace.pyx reshape.pyx diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd index b289d112a90..a628ecdb038 100644 --- a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd @@ -12,6 +12,7 @@ from . 
cimport ( join, lists, merge, + quantiles, reduce, replace, reshape, @@ -48,6 +49,7 @@ __all__ = [ "join", "lists", "merge", + "quantiles", "reduce", "replace", "rolling", diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.py b/python/cudf/cudf/_lib/pylibcudf/__init__.py index 2565332f3ed..46d0fe13cd1 100644 --- a/python/cudf/cudf/_lib/pylibcudf/__init__.py +++ b/python/cudf/cudf/_lib/pylibcudf/__init__.py @@ -12,6 +12,7 @@ join, lists, merge, + quantiles, reduce, replace, reshape, @@ -48,6 +49,7 @@ "join", "lists", "merge", + "quantiles", "reduce", "replace", "rolling", diff --git a/python/cudf/cudf/_lib/pylibcudf/quantiles.pxd b/python/cudf/cudf/_lib/pylibcudf/quantiles.pxd new file mode 100644 index 00000000000..70ff135ca77 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/quantiles.pxd @@ -0,0 +1,25 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from libcpp.vector cimport vector + +from cudf._lib.pylibcudf.libcudf.types cimport interpolation, sorted + +from .column cimport Column +from .table cimport Table + + +cpdef Column quantile( + Column input, + vector[double] q, + interpolation interp = *, + Column ordered_indices = *, + bint exact = * +) + +cpdef Table quantiles( + Table input, + vector[double] q, + interpolation interp = *, + sorted is_input_sorted = *, + list column_order = *, + list null_precedence = *, +) diff --git a/python/cudf/cudf/_lib/pylibcudf/quantiles.pyx b/python/cudf/cudf/_lib/pylibcudf/quantiles.pyx new file mode 100644 index 00000000000..c1f0e30ccd3 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/quantiles.pyx @@ -0,0 +1,152 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp cimport bool +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from libcpp.vector cimport vector + +from cudf._lib.pylibcudf.libcudf.column.column cimport column +from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view +from cudf._lib.pylibcudf.libcudf.quantiles cimport ( + quantile as cpp_quantile, + quantiles as cpp_quantiles, +) +from cudf._lib.pylibcudf.libcudf.table.table cimport table +from cudf._lib.pylibcudf.libcudf.types cimport null_order, order, sorted + +from .column cimport Column +from .table cimport Table +from .types cimport interpolation + + +cpdef Column quantile( + Column input, + vector[double] q, + interpolation interp = interpolation.LINEAR, + Column ordered_indices = None, + bool exact=True +): + """Computes quantiles with interpolation. + + Computes the specified quantiles by interpolating values between which they lie, + using the interpolation strategy specified in interp. + + Parameters + ---------- + input: Column + The Column to calculate quantiles on. + q: array-like that implements buffer-protocol + The quantiles to calculate in range [0,1] + interp: Interpolation, default Interpolation.LINEAR + The strategy used to select between values adjacent to a specified quantile. + ordered_indices: Column, default empty column + The column containing the sorted order of input. + + If empty, all input values are used in existing order. + Indices must be in range [0, input.size()), but are not required to be unique. + Values not indexed by this column will be ignored. + exact: bool, default True + Returns doubles if True. Otherwise, returns same type as input + + For details, see :cpp:func:`quantile`. 
+ + Returns + ------- + Column + A Column containing specified quantiles, with nulls for indeterminable values + """ + cdef: + unique_ptr[column] c_result + column_view ordered_indices_view + + if ordered_indices is None: + ordered_indices_view = column_view() + else: + ordered_indices_view = ordered_indices.view() + + with nogil: + c_result = move( + cpp_quantile( + input.view(), + q, + interp, + ordered_indices_view, + exact, + ) + ) + + return Column.from_libcudf(move(c_result)) + + +cpdef Table quantiles( + Table input, + vector[double] q, + interpolation interp = interpolation.NEAREST, + sorted is_input_sorted = sorted.NO, + list column_order = None, + list null_precedence = None, +): + """Computes row quantiles with interpolation. + + Computes the specified quantiles by retrieving the row corresponding to the + specified quantiles. In the event a quantile lies in between rows, the specified + interpolation strategy is used to pick between the rows. + + Parameters + ---------- + input: Table + The Table to calculate row quantiles on. + q: array-like + The quantiles to calculate in range [0,1] + interp: Interpolation, default Interpolation.NEAREST + The strategy used to select between values adjacent to a specified quantile. + + Must be a non-arithmetic interpolation strategy + (i.e. one of + {`Interpolation.HIGHER`, `Interpolation.LOWER`, `Interpolation.NEAREST`}) + is_input_sorted: Sorted, default Sorted.NO + Whether the input table has been pre-sorted or not. + column_order: list, default None + A list of `Order` enums, + indicating the desired sort order for each column. + By default, will sort all columns so that they are in ascending order. + + Ignored if `is_input_sorted` is `Sorted.YES` + null_precedence: list, default None + A list of `NullOrder` enums, + indicating how nulls should be sorted. + By default, will sort all columns so that nulls appear before + all other elements. + + Ignored if `is_input_sorted` is `Sorted.YES` + + For details, see :cpp:func:`quantiles`. 
+ + Returns + ------- + Table + A Table containing specified quantiles, with nulls for indeterminable values + """ + cdef: + unique_ptr[table] c_result + vector[order] column_order_vec + vector[null_order] null_precedence_vec + + if column_order is not None: + column_order_vec = column_order + if null_precedence is not None: + null_precedence_vec = null_precedence + + with nogil: + c_result = move( + cpp_quantiles( + input.view(), + q, + interp, + is_input_sorted, + column_order_vec, + null_precedence_vec, + ) + ) + + return Table.from_libcudf(move(c_result)) diff --git a/python/cudf/cudf/_lib/quantiles.pyx b/python/cudf/cudf/_lib/quantiles.pyx index 3d20454a7ce..7b50c00919a 100644 --- a/python/cudf/cudf/_lib/quantiles.pyx +++ b/python/cudf/cudf/_lib/quantiles.pyx @@ -3,76 +3,43 @@ from cudf.core.buffer import acquire_spill_lock from libcpp cimport bool -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move from libcpp.vector cimport vector from cudf._lib.column cimport Column from cudf._lib.types cimport ( underlying_type_t_interpolation, - underlying_type_t_null_order, - underlying_type_t_order, underlying_type_t_sorted, ) from cudf._lib.types import Interpolation -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view -from cudf._lib.pylibcudf.libcudf.quantiles cimport ( - quantile as cpp_quantile, - quantiles as cpp_quantile_table, -) -from cudf._lib.pylibcudf.libcudf.table.table cimport table -from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view -from cudf._lib.pylibcudf.libcudf.types cimport ( - interpolation, - null_order, - order, - sorted, -) -from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns +from cudf._lib.pylibcudf.libcudf.types cimport interpolation, sorted +from cudf._lib.utils cimport columns_from_pylibcudf_table + +import cudf._lib.pylibcudf as plc @acquire_spill_lock() def quantile( Column input, - object q, + vector[double] q, str interp, Column ordered_indices, bool exact, - ): - cdef column_view c_input = input.view() - cdef column_view c_ordered_indices = ( - column_view() if ordered_indices is None - else ordered_indices.view() - ) cdef interpolation c_interp = ( Interpolation[interp.upper()] ) - cdef bool c_exact = exact - - cdef vector[double] c_q - c_q.reserve(len(q)) - - for value in q: - c_q.push_back(value) - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_quantile( - c_input, - c_q, - c_interp, - c_ordered_indices, - c_exact, - ) + return Column.from_pylibcudf( + plc.quantiles.quantile( + input.to_pylibcudf(mode="read"), + q, + c_interp, + ordered_indices.to_pylibcudf(mode="read"), + exact ) - - return Column.from_unique_ptr(move(c_result)) + ) def quantile_table( @@ -83,42 +50,23 @@ def quantile_table( list column_order, list null_precedence, ): - cdef table_view c_input = table_view_from_columns(source_columns) - cdef vector[double] c_q = q + cdef interpolation c_interp = ( interp ) cdef sorted c_is_input_sorted = ( is_input_sorted ) - cdef vector[order] c_column_order - cdef vector[null_order] c_null_precedence - - c_column_order.reserve(len(column_order)) - c_null_precedence.reserve(len(null_precedence)) - - for value in column_order: - c_column_order.push_back( - ( value) - ) - for value in null_precedence: - c_null_precedence.push_back( - ( value) + return columns_from_pylibcudf_table( + plc.quantiles.quantiles( + plc.Table([ + c.to_pylibcudf(mode="read") for c in source_columns + ]), + 
q, + c_interp, + c_is_input_sorted, + column_order, + null_precedence ) - - cdef unique_ptr[table] c_result - - with nogil: - c_result = move( - cpp_quantile_table( - c_input, - c_q, - c_interp, - c_is_input_sorted, - c_column_order, - c_null_precedence, - ) - ) - - return columns_from_unique_ptr(move(c_result)) + ) diff --git a/python/cudf/cudf/pylibcudf_tests/conftest.py b/python/cudf/cudf/pylibcudf_tests/conftest.py index 6d8284fb3db..f3c6584ef8c 100644 --- a/python/cudf/cudf/pylibcudf_tests/conftest.py +++ b/python/cudf/cudf/pylibcudf_tests/conftest.py @@ -7,6 +7,8 @@ import pyarrow as pa import pytest +import cudf._lib.pylibcudf as plc + sys.path.insert(0, os.path.join(os.path.dirname(__file__), "common")) from utils import DEFAULT_STRUCT_TESTING_TYPE @@ -29,3 +31,30 @@ ) def pa_type(request): return request.param + + +@pytest.fixture( + scope="session", + params=[ + pa.int64(), + pa.float64(), + pa.uint64(), + ], +) +def numeric_pa_type(request): + return request.param + + +@pytest.fixture( + scope="session", params=[opt for opt in plc.types.Interpolation] +) +def interp_opt(request): + return request.param + + +@pytest.fixture( + scope="session", + params=[opt for opt in plc.types.Sorted], +) +def sorted_opt(request): + return request.param diff --git a/python/cudf/cudf/pylibcudf_tests/test_quantiles.py b/python/cudf/cudf/pylibcudf_tests/test_quantiles.py new file mode 100644 index 00000000000..a5d332a7795 --- /dev/null +++ b/python/cudf/cudf/pylibcudf_tests/test_quantiles.py @@ -0,0 +1,234 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import numpy as np +import pyarrow as pa +import pyarrow.compute as pc +import pytest +from utils import assert_column_eq, assert_table_eq + +import cudf._lib.pylibcudf as plc + +# Map pylibcudf interpolation options to pyarrow options +interp_mapping = { + plc.types.Interpolation.LINEAR: "linear", + plc.types.Interpolation.LOWER: "lower", + plc.types.Interpolation.HIGHER: "higher", + plc.types.Interpolation.MIDPOINT: "midpoint", + plc.types.Interpolation.NEAREST: "nearest", +} + + +@pytest.fixture(scope="module", params=[[1, 2, 3, 4, 5], [5, 4, 3, 2, 1]]) +def pa_col_data(request, numeric_pa_type): + return pa.array(request.param, type=numeric_pa_type) + + +@pytest.fixture(scope="module") +def plc_col_data(pa_col_data): + return plc.interop.from_arrow(pa_col_data) + + +@pytest.fixture( + scope="module", + params=[ + { + "arrays": [[1, 2, 3, 5, 4], [5.0, 6.0, 8.0, 7.0, 9.0]], + "schema": pa.schema( + [ + ("a", pa.int64()), + ("b", pa.int64()), + ] + ), + }, + { + "arrays": [ + [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], + [1, 2.0, 2.2, 2.3, 2.4, None, None, 3.5, 4.5, 5.5], + ], + "schema": pa.schema( + [ + ("a", pa.int64()), + ("b", pa.float64()), + ] + ), + }, + ], +) +def plc_tbl_data(request): + return plc.interop.from_arrow(pa.Table.from_arrays(**request.param)) + + +@pytest.mark.parametrize("q", [[], [0], [0.5], [0.1, 0.5, 0.7, 0.9]]) +@pytest.mark.parametrize("exact", [True, False]) +def test_quantile(pa_col_data, plc_col_data, interp_opt, q, exact): + ordered_indices = plc.interop.from_arrow( + pc.cast(pc.sort_indices(pa_col_data), pa.int32()) + ) + res = plc.quantiles.quantile( + plc_col_data, q, interp_opt, ordered_indices, exact + ) + + pa_interp_opt = interp_mapping[interp_opt] + + if exact: + pa_col_data = pc.cast(pa_col_data, pa.float64()) + + if len(q) > 0: + # pyarrow quantile doesn't support empty q + exp = pc.quantile(pa_col_data, q=q, interpolation=pa_interp_opt) + else: + exp = pa.array([], type=pa.float64()) + + if not exact: + exp = 
pc.cast(exp, pa_col_data.type, safe=False)
+
+    assert_column_eq(exp, res)
+
+
+def _pyarrow_quantiles(
+    pa_tbl_data,
+    q,
+    interp_opt=plc.types.Interpolation.NEAREST,
+    sorted_opt=plc.types.Sorted.NO,
+    column_order=None,
+    null_precedence=None,
+):
+    """
+    The pyarrow equivalent of plc.quantiles.quantiles
+
+    Takes the same arguments (except input should be a pyarrow table instead
+    of a pylibcudf table)
+
+    NOTE: This function doesn't support having different null precedences because of
+    a lack of support in pyarrow.
+    """
+    if len(q) > 0:
+        # pyarrow quantile doesn't support empty q
+        pa_interp_opt = interp_mapping[interp_opt]
+
+        if sorted_opt == plc.types.Sorted.NO:
+            order_mapper = {
+                plc.types.Order.ASCENDING: "ascending",
+                plc.types.Order.DESCENDING: "descending",
+            }
+            if null_precedence is None:
+                null_precedence = [plc.types.NullOrder.BEFORE] * len(
+                    pa_tbl_data.columns
+                )
+            if column_order is None:
+                column_order = [plc.types.Order.ASCENDING] * len(
+                    pa_tbl_data.columns
+                )
+
+            if not all(
+                [
+                    null_prec == null_precedence[0]
+                    for null_prec in null_precedence
+                ]
+            ):
+                raise NotImplementedError(
+                    "Having varying null precedences is not implemented!"
+                )
+
+            pa_tbl_data = pa_tbl_data.sort_by(
+                [
+                    (name, order_mapper[order])
+                    for name, order in zip(
+                        pa_tbl_data.column_names, column_order
+                    )
+                ],
+                null_placement="at_start"
+                if null_precedence[0] == plc.types.NullOrder.BEFORE
+                else "at_end",
+            )
+        row_idxs = pc.quantile(
+            np.arange(0, len(pa_tbl_data)), q=q, interpolation=pa_interp_opt
+        )
+        exp = pa_tbl_data.take(row_idxs)
+    else:
+        exp = pa.Table.from_arrays(
+            [[] for _ in range(len(pa_tbl_data.schema))],
+            schema=pa_tbl_data.schema,
+        )
+    return exp
+
+
+@pytest.mark.parametrize(
+    "q", [[], [0.1], [0.2], [0.3], [0.4], [0.5], [0.1, 0.5, 0.7, 0.9]]
+)
+@pytest.mark.parametrize(
+    "column_order", [[plc.types.Order.ASCENDING, plc.types.Order.ASCENDING]]
+)
+@pytest.mark.parametrize(
+    "null_precedence",
+    [
+        [plc.types.NullOrder.BEFORE, plc.types.NullOrder.BEFORE],
+        [plc.types.NullOrder.AFTER, plc.types.NullOrder.AFTER],
+    ],
+)
+def test_quantiles(
+    plc_tbl_data, interp_opt, q, sorted_opt, column_order, null_precedence
+):
+    if interp_opt in {
+        plc.types.Interpolation.LINEAR,
+        plc.types.Interpolation.MIDPOINT,
+    }:
+        pytest.skip(
+            "interp cannot be an arithmetic interpolation strategy for quantiles"
+        )
+
+    pa_tbl_data = plc.interop.to_arrow(plc_tbl_data, ["a", "b"])
+
+    exp = _pyarrow_quantiles(
+        pa_tbl_data,
+        q=q,
+        interp_opt=interp_opt,
+        sorted_opt=sorted_opt,
+        column_order=column_order,
+        null_precedence=null_precedence,
+    )
+
+    res = plc.quantiles.quantiles(
+        plc_tbl_data, q, interp_opt, sorted_opt, column_order, null_precedence
+    )
+
+    assert_table_eq(exp, res)
+
+
+@pytest.mark.parametrize(
+    "invalid_interp",
+    [plc.types.Interpolation.LINEAR, plc.types.Interpolation.MIDPOINT],
+)
+def test_quantiles_invalid_interp(plc_tbl_data, invalid_interp):
+    with pytest.raises(ValueError):
+        plc.quantiles.quantiles(
+            plc_tbl_data, q=np.array([0.1]), interp=invalid_interp
+        )
+
+
+@pytest.mark.parametrize(
+    "q",
+    [[0.1], (0.1,), np.array([0.1])],
+)
+def test_quantile_q_array_like(pa_col_data, plc_col_data, q):
+    ordered_indices = plc.interop.from_arrow(
+        pc.cast(pc.sort_indices(pa_col_data), pa.int32())
+    )
+    res = plc.quantiles.quantile(
+        plc_col_data,
+        q=q,
+        ordered_indices=ordered_indices,
+    )
+    exp = pc.quantile(pa_col_data, q=q)
+    assert_column_eq(exp, res)
+
+
+@pytest.mark.parametrize(
+    "q",
+    [[0.1], (0.1,), np.array([0.1])],
+)
+def test_quantiles_q_array_like(plc_tbl_data, q):
+    res = plc.quantiles.quantiles(plc_tbl_data, q=q)
+    pa_tbl_data = plc.interop.to_arrow(plc_tbl_data, ["a", "b"])
+    exp = _pyarrow_quantiles(pa_tbl_data, q=q)
+    assert_table_eq(exp, res)

From d4dd474f0db6047b2404c2c98b86cf4446445e1b Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Thu, 6 Jun 2024 17:52:50 -0400
Subject: [PATCH 057/340] Use offsetalator in
 cudf::io::json::detail::parse_string (#15900)

Updates the `cudf::io::json::detail::parse_string` function to use the
offsetalator for building a strings column instead of `size_type` pointers.
The output row sizes are computed in the first pass through the kernels and
then converted to offsets. The offsets are wrapped with an offsetalator on
the 2nd pass to locate each individual row's output position in the chars
data.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Muhammad Haseeb (https://github.com/mhaseeb123)
  - Karthikeyan (https://github.com/karthikeyann)

URL: https://github.com/rapidsai/cudf/pull/15900
---
 cpp/src/io/utilities/data_casting.cu | 56 ++++++++++++++++------------
 cpp/tests/io/json_test.cpp           |  1 -
 2 files changed, 32 insertions(+), 25 deletions(-)

diff --git a/cpp/src/io/utilities/data_casting.cu b/cpp/src/io/utilities/data_casting.cu
index 60cbfbc0dae..288a5690282 100644
--- a/cpp/src/io/utilities/data_casting.cu
+++ b/cpp/src/io/utilities/data_casting.cu
@@ -22,6 +22,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -417,6 +418,7 @@ struct bitfield_block {
  * @param null_mask Null mask
  * @param null_count_data pointer to store null count
  * @param options Settings for controlling string processing behavior
+ * @param d_sizes Output size of each row
  * @param d_offsets Offsets to identify where to store the results for each string
  * @param d_chars Character array to store the characters of strings
  */
@@ -427,7 +429,8 @@ CUDF_KERNEL void parse_fn_string_parallel(str_tuple_it str_tuples,
                                           bitmask_type* null_mask,
                                           size_type* null_count_data,
                                           cudf::io::parse_options_view const options,
-                                          size_type* d_offsets,
+                                          size_type* d_sizes,
+                                          cudf::detail::input_offsetalator d_offsets,
                                           char* d_chars)
 {
   constexpr auto BLOCK_SIZE =
@@ -455,7 +458,7 @@ CUDF_KERNEL void parse_fn_string_parallel(str_tuple_it str_tuples,
        istring = get_next_string()) {
     // skip nulls
     if (null_mask != nullptr && not bit_is_set(null_mask, istring)) {
-      if (!d_chars && lane == 0) d_offsets[istring] = 0;
+      if (!d_chars && lane == 0) { d_sizes[istring] = 0; }
       continue;  // grid-stride return;
     }
@@ -476,7 +479,7 @@ CUDF_KERNEL void parse_fn_string_parallel(str_tuple_it str_tuples,
       if (lane == 0) {
         clear_bit(null_mask, istring);
         atomicAdd(null_count_data, 1);
-        if (!d_chars) d_offsets[istring] = 0;
+        if (!d_chars) { d_sizes[istring] = 0; }
       }
       continue;  // grid-stride return;
     }
@@ -491,7 +494,7 @@ CUDF_KERNEL void parse_fn_string_parallel(str_tuple_it str_tuples,
     // Copy literal/numeric value
     if (not is_string_value) {
       if (!d_chars) {
-        if (lane == 0) { d_offsets[istring] = in_end - in_begin; }
+        if (lane == 0) { d_sizes[istring] = in_end - in_begin; }
       } else {
         for (thread_index_type char_index = lane; char_index < (in_end - in_begin);
              char_index += BLOCK_SIZE) {
@@ -621,8 +624,8 @@ CUDF_KERNEL void parse_fn_string_parallel(str_tuple_it str_tuples,
             clear_bit(null_mask, istring);
             atomicAdd(null_count_data, 1);
           }
-          last_offset = 0;
-          d_offsets[istring] = 0;
+          last_offset      = 0;
+          d_sizes[istring] = 0;
         }
         if constexpr (!is_warp) {
__syncthreads(); }
         break;  // grid-stride return;
@@ -729,7 +732,7 @@ CUDF_KERNEL void parse_fn_string_parallel(str_tuple_it str_tuples,
         }
       }
     }  // char for-loop
-    if (!d_chars && lane == 0) { d_offsets[istring] = last_offset; }
+    if (!d_chars && lane == 0) { d_sizes[istring] = last_offset; }
   }  // grid-stride for-loop
 }

@@ -739,13 +742,14 @@ struct string_parse {
   bitmask_type* null_mask;
   size_type* null_count_data;
   cudf::io::parse_options_view const options;
-  size_type* d_offsets{};
+  size_type* d_sizes{};
+  cudf::detail::input_offsetalator d_offsets;
   char* d_chars{};

   __device__ void operator()(size_type idx)
   {
     if (null_mask != nullptr && not bit_is_set(null_mask, idx)) {
-      if (!d_chars) d_offsets[idx] = 0;
+      if (!d_chars) { d_sizes[idx] = 0; }
       return;
     }
     auto const in_begin = str_tuples[idx].first;
@@ -761,7 +765,7 @@ struct string_parse {
       if (is_null_literal && null_mask != nullptr) {
         clear_bit(null_mask, idx);
         atomicAdd(null_count_data, 1);
-        if (!d_chars) d_offsets[idx] = 0;
+        if (!d_chars) { d_sizes[idx] = 0; }
         return;
       }
     }
@@ -773,9 +777,9 @@ struct string_parse {
         clear_bit(null_mask, idx);
         atomicAdd(null_count_data, 1);
       }
-      if (!d_chars) d_offsets[idx] = 0;
+      if (!d_chars) { d_sizes[idx] = 0; }
     } else {
-      if (!d_chars) d_offsets[idx] = str_process_info.bytes;
+      if (!d_chars) { d_sizes[idx] = str_process_info.bytes; }
     }
   }
 };
@@ -811,13 +815,12 @@ static std::unique_ptr<column> parse_string(string_view_pair_it str_tuples,
                             size_type{0},
                             thrust::maximum{});

-  auto offsets = cudf::make_numeric_column(
-    data_type{type_to_id<size_type>()}, col_size + 1, cudf::mask_state::UNALLOCATED, stream, mr);
-  auto d_offsets = offsets->mutable_view().data<size_type>();
+  auto sizes   = rmm::device_uvector<size_type>(col_size, stream);
+  auto d_sizes = sizes.data();
   auto null_count_data = d_null_count.data();

   auto single_thread_fn = string_parse{
-    str_tuples, static_cast<bitmask_type*>(null_mask.data()), null_count_data, options, d_offsets};
+    str_tuples, static_cast<bitmask_type*>(null_mask.data()), null_count_data, options, d_sizes};
   thrust::for_each_n(rmm::exec_policy(stream),
                      thrust::make_counting_iterator(0),
                      col_size,
@@ -838,7 +841,8 @@ static std::unique_ptr<column> parse_string(string_view_pair_it str_tuples,
                          static_cast<bitmask_type*>(null_mask.data()),
                          null_count_data,
                          options,
-                         d_offsets,
+                         d_sizes,
+                         cudf::detail::input_offsetalator{},
                          nullptr);
   }
@@ -853,20 +857,22 @@ static std::unique_ptr<column> parse_string(string_view_pair_it str_tuples,
                          static_cast<bitmask_type*>(null_mask.data()),
                          null_count_data,
                          options,
-                         d_offsets,
+                         d_sizes,
+                         cudf::detail::input_offsetalator{},
                          nullptr);
   }
-  auto const bytes =
-    cudf::detail::sizes_to_offsets(d_offsets, d_offsets + col_size + 1, d_offsets, stream);
-  CUDF_EXPECTS(bytes <= std::numeric_limits<size_type>::max(),
-               "Size of output exceeds the column size limit",
-               std::overflow_error);
+
+  auto [offsets, bytes] =
+    cudf::strings::detail::make_offsets_child_column(sizes.begin(), sizes.end(), stream, mr);
+  auto d_offsets = cudf::detail::offsetalator_factory::make_input_iterator(offsets->view());

   // CHARS column
   rmm::device_uvector<char> chars(bytes, stream, mr);
   auto d_chars = chars.data();
-  single_thread_fn.d_chars = d_chars;
+  single_thread_fn.d_chars   = d_chars;
+  single_thread_fn.d_offsets = d_offsets;
+
   thrust::for_each_n(rmm::exec_policy(stream),
                      thrust::make_counting_iterator(0),
                      col_size,
@@ -882,6 +888,7 @@ static std::unique_ptr<column> parse_string(string_view_pair_it str_tuples,
                          static_cast<bitmask_type*>(null_mask.data()),
                          null_count_data,
                          options,
+                         d_sizes,
                          d_offsets,
                          d_chars);
   }
@@ -897,6 +904,7 @@ static std::unique_ptr<column> parse_string(string_view_pair_it str_tuples,
                         static_cast<bitmask_type*>(null_mask.data()),
                         null_count_data,
                         options,
+                         d_sizes,
                          d_offsets,
                          d_chars);
   }
diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp
index 5d790e73246..57aa2721756 100644
--- a/cpp/tests/io/json_test.cpp
+++ b/cpp/tests/io/json_test.cpp
@@ -2374,7 +2374,6 @@ TEST_F(JsonReaderTest, MapTypes)
       EXPECT_EQ(col.type().id(), types[i]) << "column[" << i << "].type";
       i++;
     }
-    std::cout << "\n";
   };

   // json
From 582d237e1b07696de86a3f4df16dca2922dda5eb Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Thu, 6 Jun 2024 17:55:06 -0400
Subject: [PATCH 058/340] Fix offsetalator when accessing over 268 million rows
 (#15921)

Fixes an access error when the `offsetalator` wraps an INT64 offsets column
with more than 268,435,455 rows. The row access type is `size_type` and is
used to calculate the appropriate position within the offsets buffer. This
fix promotes the multiplication to int64 to properly resolve the correct
pointer position.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Yunsong Wang (https://github.com/PointKernel)

URL: https://github.com/rapidsai/cudf/pull/15921
---
 cpp/include/cudf/detail/offsets_iterator.cuh  |  6 +-
 cpp/tests/CMakeLists.txt                      |  1 +
 .../large_strings/large_strings_fixture.cpp   | 11 +++
 .../large_strings/large_strings_fixture.hpp   | 11 +++
 .../large_strings/many_strings_tests.cpp      | 67 +++++++++++++++++++
 5 files changed, 93 insertions(+), 3 deletions(-)
 create mode 100644 cpp/tests/large_strings/many_strings_tests.cpp

diff --git a/cpp/include/cudf/detail/offsets_iterator.cuh b/cpp/include/cudf/detail/offsets_iterator.cuh
index 15b334245ff..1ab1fd46230 100644
--- a/cpp/include/cudf/detail/offsets_iterator.cuh
+++ b/cpp/include/cudf/detail/offsets_iterator.cuh
@@ -53,7 +53,7 @@ struct input_offsetalator : base_normalator<input_offsetalator, int64_t> {
    */
   __device__ inline int64_t operator[](size_type idx) const
   {
-    void const* tp = p_ + (idx * this->width_);
+    void const* tp = p_ + (static_cast<int64_t>(idx) * this->width_);
     return this->width_ == sizeof(int32_t) ?
static_cast<int64_t>(*static_cast<int32_t const*>(tp))
                                            : *static_cast<int64_t const*>(tp);
   }
@@ -79,7 +79,7 @@ struct input_offsetalator : base_normalator<input_offsetalator, int64_t> {
     cudf_assert((dtype.id() == type_id::INT32 || dtype.id() == type_id::INT64) &&
                 "Unexpected offsets type");
 #endif
-    p_ += (this->width_ * offset);
+    p_ += (this->width_ * static_cast<int64_t>(offset));
   }

  protected:
@@ -121,7 +121,7 @@ struct output_offsetalator : base_normalator<output_offsetalator, int64_t> {
   __device__ inline output_offsetalator const operator[](size_type idx) const
   {
     output_offsetalator tmp{*this};
-    tmp.p_ += (idx * this->width_);
+    tmp.p_ += (static_cast<int64_t>(idx) * this->width_);
     return tmp;
   }

diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index a0d9083c4a4..826f879ddc0 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -570,6 +570,7 @@ ConfigureTest(
   large_strings/concatenate_tests.cpp
   large_strings/case_tests.cpp
   large_strings/large_strings_fixture.cpp
+  large_strings/many_strings_tests.cpp
   large_strings/merge_tests.cpp
   large_strings/parquet_tests.cpp
   large_strings/reshape_tests.cpp
diff --git a/cpp/tests/large_strings/large_strings_fixture.cpp b/cpp/tests/large_strings/large_strings_fixture.cpp
index 59e0cd43d05..416b106c5a5 100644
--- a/cpp/tests/large_strings/large_strings_fixture.cpp
+++ b/cpp/tests/large_strings/large_strings_fixture.cpp
@@ -95,6 +95,17 @@ cudf::column_view StringsLargeTest::long_column()
   return g_ls_data->get_column(name);
 }

+cudf::column_view StringsLargeTest::very_long_column()
+{
+  std::string name("long2");
+  if (!g_ls_data->has_key(name)) {
+    auto itr   = thrust::constant_iterator("12345");
+    auto input = cudf::test::strings_column_wrapper(itr, itr + 30'000'000);
+    g_ls_data->add_column(name, input.release());
+  }
+  return g_ls_data->get_column(name);
+}
+
 std::unique_ptr<LargeStringsData> StringsLargeTest::get_ls_data()
 {
   CUDF_EXPECTS(g_ls_data == nullptr, "invalid call to get_ls_data");
diff --git a/cpp/tests/large_strings/large_strings_fixture.hpp b/cpp/tests/large_strings/large_strings_fixture.hpp
index 8827b65f1ce..fb7b1cd00b8 100644
--- a/cpp/tests/large_strings/large_strings_fixture.hpp
+++ b/cpp/tests/large_strings/large_strings_fixture.hpp
@@ -33,14 +33,25 @@ class LargeStringsData;
 struct StringsLargeTest : public cudf::test::BaseFixture {
   /**
    * @brief Returns a column of long strings
+   *
+   * This returns 8 rows of 400 bytes
    */
   cudf::column_view wide_column();

   /**
    * @brief Returns a long column of strings
+   *
+   * This returns 5 million rows of 50 bytes
    */
   cudf::column_view long_column();

+  /**
+   * @brief Returns a very long column of strings
+   *
+   * This returns 30 million rows of 5 bytes
+   */
+  cudf::column_view very_long_column();
+
   large_strings_enabler g_ls_enabler;
   static LargeStringsData* g_ls_data;

diff --git a/cpp/tests/large_strings/many_strings_tests.cpp b/cpp/tests/large_strings/many_strings_tests.cpp
new file mode 100644
index 00000000000..73fbb21d014
--- /dev/null
+++ b/cpp/tests/large_strings/many_strings_tests.cpp
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "large_strings_fixture.hpp"
+
+#include 
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+#include 
+
+struct StringsManyTest : public cudf::test::StringsLargeTest {};
+
+TEST_F(StringsManyTest, Replace)
+{
+  auto const expected = this->very_long_column();
+  auto const view     = cudf::column_view(expected);
+  // force addressing (rows > max_size_type/sizeof(int64)) in a 64-bit offsets column
+  int constexpr max_size_type = std::numeric_limits<cudf::size_type>::max();
+  // minimum number of duplicates to achieve large strings (64-bit offsets)
+  int const min_size_multiplier =
+    (max_size_type / cudf::strings_column_view(view).chars_size(cudf::get_default_stream())) + 1;
+  // minimum row multiplier to create max_size_type/sizeof(int64) = 268,435,455 rows
+  int const min_row_multiplier = ((max_size_type / sizeof(int64_t)) / view.size()) + 1;
+  int const multiplier         = std::max(min_size_multiplier, min_row_multiplier);
+
+  std::vector<cudf::column_view> input_cols(multiplier, view);
+  std::vector<cudf::size_type> splits;
+  std::generate_n(std::back_inserter(splits), multiplier - 1, [view, n = 1]() mutable {
+    return view.size() * (n++);
+  });
+
+  auto large_input = cudf::concatenate(input_cols);  // 480 million rows
+  auto const sv    = cudf::strings_column_view(large_input->view());
+  EXPECT_EQ(sv.size(), view.size() * multiplier);
+  EXPECT_EQ(sv.offsets().type(), cudf::data_type{cudf::type_id::INT64});
+
+  // Using replace tests reading large strings as well as creating large strings
+  auto const target = cudf::string_scalar("3");  // fake the actual replace;
+  auto const repl   = cudf::string_scalar("3");  // logic still builds the output
+  auto result       = cudf::strings::replace(sv, target, repl);
+
+  // verify results in sections
+  auto sliced = cudf::split(result->view(), splits);
+  for (auto c : sliced) {
+    CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(c, expected);
+  }
+}
From 451d12a2d8d69f63d2b9491286b8895ace6f87ba Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Thu, 6 Jun 2024 18:57:04 -0500
Subject: [PATCH 059/340] Allow anonymous user in devcontainer name. (#15784)

In https://github.com/rapidsai/cudf/pull/15572, we updated the devcontainer
name to include the current user's name. However, in GitHub Codespaces, the
username is not defined. As a result, the container name starts with a dash.
This is not allowed by GitHub Codespaces, so it fails to launch. This PR adds
a default value of `anon` to the devcontainer username.
Authors: - Bradley Dice (https://github.com/bdice) Approvers: - James Lamb (https://github.com/jameslamb) - Paul Taylor (https://github.com/trxcllnt) URL: https://github.com/rapidsai/cudf/pull/15784 --- .devcontainer/cuda11.8-conda/devcontainer.json | 2 +- .devcontainer/cuda11.8-pip/devcontainer.json | 2 +- .devcontainer/cuda12.2-conda/devcontainer.json | 2 +- .devcontainer/cuda12.2-pip/devcontainer.json | 2 +- .github/CODEOWNERS | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.devcontainer/cuda11.8-conda/devcontainer.json b/.devcontainer/cuda11.8-conda/devcontainer.json index c62e18512a0..8423fe21c29 100644 --- a/.devcontainer/cuda11.8-conda/devcontainer.json +++ b/.devcontainer/cuda11.8-conda/devcontainer.json @@ -11,7 +11,7 @@ "runArgs": [ "--rm", "--name", - "${localEnv:USER}-rapids-${localWorkspaceFolderBasename}-24.08-cuda11.8-conda" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.08-cuda11.8-conda" ], "hostRequirements": {"gpu": "optional"}, "features": { diff --git a/.devcontainer/cuda11.8-pip/devcontainer.json b/.devcontainer/cuda11.8-pip/devcontainer.json index 4ab4bd75643..4945d6cf753 100644 --- a/.devcontainer/cuda11.8-pip/devcontainer.json +++ b/.devcontainer/cuda11.8-pip/devcontainer.json @@ -11,7 +11,7 @@ "runArgs": [ "--rm", "--name", - "${localEnv:USER}-rapids-${localWorkspaceFolderBasename}-24.08-cuda11.8-pip" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.08-cuda11.8-pip" ], "hostRequirements": {"gpu": "optional"}, "features": { diff --git a/.devcontainer/cuda12.2-conda/devcontainer.json b/.devcontainer/cuda12.2-conda/devcontainer.json index 2b50454410f..05bf9173d25 100644 --- a/.devcontainer/cuda12.2-conda/devcontainer.json +++ b/.devcontainer/cuda12.2-conda/devcontainer.json @@ -11,7 +11,7 @@ "runArgs": [ "--rm", "--name", - "${localEnv:USER}-rapids-${localWorkspaceFolderBasename}-24.08-cuda12.2-conda" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.08-cuda12.2-conda" ], "hostRequirements": {"gpu": "optional"}, "features": { diff --git a/.devcontainer/cuda12.2-pip/devcontainer.json b/.devcontainer/cuda12.2-pip/devcontainer.json index fc5abc56094..74420214726 100644 --- a/.devcontainer/cuda12.2-pip/devcontainer.json +++ b/.devcontainer/cuda12.2-pip/devcontainer.json @@ -11,7 +11,7 @@ "runArgs": [ "--rm", "--name", - "${localEnv:USER}-rapids-${localWorkspaceFolderBasename}-24.08-cuda12.2-pip" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.08-cuda12.2-pip" ], "hostRequirements": {"gpu": "optional"}, "features": { diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 9efac3f1904..5e2f46714d9 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -22,7 +22,7 @@ java/ @rapidsai/cudf-java-codeowners /.pre-commit-config.yaml @rapidsai/ci-codeowners #packaging code owners -/.devcontainers/ @rapidsai/packaging-codeowners +/.devcontainer/ @rapidsai/packaging-codeowners /conda/ @rapidsai/packaging-codeowners /dependencies.yaml @rapidsai/packaging-codeowners /build.sh @rapidsai/packaging-codeowners From 9bd16bb719e14ed1e0ee3edbd8c8417c03ac2f25 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Thu, 6 Jun 2024 18:50:23 -0700 Subject: [PATCH 060/340] Reland "Fix docs for IO readers and strings_convert" (#15872)" (#15941) This reverts commit 2b031e06a7fe18eec462db445eea1c596b93a9f1. We got the go ahead to remove the text docs from @taureandyernv. 
Authors:
  - Thomas Li (https://github.com/lithomas1)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/15941
---
 ci/build_docs.sh                                           | 6 ------
 docs/cudf/source/libcudf_docs/api_docs/io_readers.rst      | 2 +-
 docs/cudf/source/libcudf_docs/api_docs/strings_convert.rst | 2 +-
 3 files changed, 2 insertions(+), 8 deletions(-)

diff --git a/ci/build_docs.sh b/ci/build_docs.sh
index db306046667..67a5415f353 100755
--- a/ci/build_docs.sh
+++ b/ci/build_docs.sh
@@ -46,9 +46,6 @@ pushd docs/cudf
 make dirhtml
 mkdir -p "${RAPIDS_DOCS_DIR}/cudf/html"
 mv build/dirhtml/* "${RAPIDS_DOCS_DIR}/cudf/html"
-make text
-mkdir -p "${RAPIDS_DOCS_DIR}/cudf/txt"
-mv build/text/* "${RAPIDS_DOCS_DIR}/cudf/txt"
 popd

 rapids-logger "Build dask-cuDF Sphinx docs"
@@ -56,9 +53,6 @@ pushd docs/dask_cudf
 make dirhtml
 mkdir -p "${RAPIDS_DOCS_DIR}/dask-cudf/html"
 mv build/dirhtml/* "${RAPIDS_DOCS_DIR}/dask-cudf/html"
-make text
-mkdir -p "${RAPIDS_DOCS_DIR}/dask-cudf/txt"
-mv build/text/* "${RAPIDS_DOCS_DIR}/dask-cudf/txt"
 popd

 rapids-upload-docs
diff --git a/docs/cudf/source/libcudf_docs/api_docs/io_readers.rst b/docs/cudf/source/libcudf_docs/api_docs/io_readers.rst
index a835673dee4..f94a5ddb403 100644
--- a/docs/cudf/source/libcudf_docs/api_docs/io_readers.rst
+++ b/docs/cudf/source/libcudf_docs/api_docs/io_readers.rst
@@ -2,4 +2,4 @@ Io Readers
 ==========

 .. doxygengroup:: io_readers
-   :desc-only:
+   :members:
diff --git a/docs/cudf/source/libcudf_docs/api_docs/strings_convert.rst b/docs/cudf/source/libcudf_docs/api_docs/strings_convert.rst
index ae5d78fb1a1..f2f320bd0e4 100644
--- a/docs/cudf/source/libcudf_docs/api_docs/strings_convert.rst
+++ b/docs/cudf/source/libcudf_docs/api_docs/strings_convert.rst
@@ -2,4 +2,4 @@ Strings Convert
 ===============

 .. doxygengroup:: strings_convert
-   :desc-only:
+   :members:
From d83d086afda1d25f5711a0aecf4ecfe6c05f7b9d Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Fri, 7 Jun 2024 07:30:32 -1000
Subject: [PATCH 061/340] Define Column.nans_to_nulls to return self (#15923)

While trying to clean all the `fillna` logic, I needed to have a
`Column.nans_to_nulls` defined to make the `fillna` logic more reusable.
This allows other `nans_to_nulls` usages in cudf to avoid checking whether
it's defined on the column or not.
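As a quick illustration, here is a minimal sketch of the behavior this enables, mirroring the `test_fillna_nan_and_null` test added in this patch (it assumes a CUDA-capable environment with cudf installed):

```python
import pyarrow as pa

import cudf

# nan_as_null=False keeps the NaN, so the series holds both a NaN and a null.
s = cudf.Series(pa.array([float("nan"), None, 1.1]), nan_as_null=False)

# fillna now routes through Column.nans_to_nulls() first (a no-op for
# non-floating columns), so the NaN and the null are both replaced.
print(s.fillna(2.2))  # -> 2.2, 2.2, 1.1
```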
Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Lawrence Mitchell (https://github.com/wence-) - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/15923 --- python/cudf/cudf/core/_base_index.py | 7 +---- python/cudf/cudf/core/column/categorical.py | 6 ++-- python/cudf/cudf/core/column/column.py | 14 +++++---- python/cudf/cudf/core/column/numerical.py | 6 ++-- .../cudf/cudf/core/column/numerical_base.py | 4 +-- python/cudf/cudf/core/indexed_frame.py | 29 ++++++------------- python/cudf/cudf/core/reshape.py | 4 +-- python/cudf/cudf/tests/test_replace.py | 8 +++++ python/cudf/cudf/tests/test_series.py | 7 +++++ 9 files changed, 42 insertions(+), 43 deletions(-) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index baca7b19e58..5d0f7c4ede4 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -2072,12 +2072,7 @@ def dropna(self, how="any"): pass # This is to be consistent with IndexedFrame.dropna to handle nans # as nulls by default - data_columns = [ - col.nans_to_nulls() - if isinstance(col, cudf.core.column.NumericalColumn) - else col - for col in self._columns - ] + data_columns = [col.nans_to_nulls() for col in self._columns] return self._from_columns_like_self( drop_nulls( diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 1828c5ce97b..de20b2ace1d 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -816,10 +816,8 @@ def to_pandas( .values_host ) - cats = col.categories - if cats.dtype.kind in "biuf": - cats = cats.nans_to_nulls().dropna() # type: ignore[attr-defined] - elif not isinstance(cats.dtype, IntervalDtype): + cats = col.categories.nans_to_nulls() + if not isinstance(cats.dtype, IntervalDtype): # leaving out dropna because it temporarily changes an interval # index into a struct and throws off results. # TODO: work on interval index dropna diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 68079371b85..475d52d0fbb 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -281,7 +281,7 @@ def any(self, skipna: bool = True) -> bool: return libcudf.reduce.reduce("any", self, dtype=np.bool_) - def dropna(self) -> ColumnBase: + def dropna(self) -> Self: return drop_nulls([self])[0]._with_type_metadata(self.dtype) def to_arrow(self) -> pa.Array: @@ -695,7 +695,9 @@ def fillna( Returns a copy with null filled. """ return libcudf.replace.replace_nulls( - input_col=self, replacement=fill_value, method=method + input_col=self.nans_to_nulls(), + replacement=fill_value, + method=method, )._with_type_metadata(self.dtype) def isnull(self) -> ColumnBase: @@ -1240,6 +1242,10 @@ def unary_operator(self, unaryop: str): f"Operation {unaryop} not supported for dtype {self.dtype}." 
) + def nans_to_nulls(self: Self) -> Self: + """Convert NaN to NA.""" + return self + def normalize_binop_value( self, other: ScalarLike ) -> Union[ColumnBase, ScalarLike]: @@ -1802,9 +1808,7 @@ def as_column( data = as_buffer(arbitrary, exposed=cudf.get_option("copy_on_write")) col = build_column(data, dtype=arbitrary.dtype, mask=mask) - if ( - nan_as_null or (mask is None and nan_as_null is None) - ) and col.dtype.kind == "f": + if nan_as_null or (mask is None and nan_as_null is None): col = col.nans_to_nulls() if dtype is not None: col = col.astype(dtype) diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index fb413959eb9..6fb4f17b76d 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -536,7 +536,7 @@ def fillna( return col if method is not None: - return super(NumericalColumn, col).fillna(fill_value, method) + return super().fillna(fill_value, method) if fill_value is None: raise ValueError("Must specify either 'fill_value' or 'method'") @@ -545,7 +545,7 @@ def fillna( isinstance(fill_value, cudf.Scalar) and fill_value.dtype == col.dtype ): - return super(NumericalColumn, col).fillna(fill_value, method) + return super().fillna(fill_value, method) if np.isscalar(fill_value): # cast safely to the same dtype as self @@ -572,7 +572,7 @@ def fillna( else: fill_value = fill_value.astype(col.dtype) - return super(NumericalColumn, col).fillna(fill_value, method) + return super().fillna(fill_value, method) def can_cast_safely(self, to_dtype: DtypeObj) -> bool: """ diff --git a/python/cudf/cudf/core/column/numerical_base.py b/python/cudf/cudf/core/column/numerical_base.py index 541c32a2520..d38ec9cf30f 100644 --- a/python/cudf/cudf/core/column/numerical_base.py +++ b/python/cudf/cudf/core/column/numerical_base.py @@ -49,7 +49,7 @@ def kurtosis(self, skipna: Optional[bool] = None) -> float: if len(self) == 0 or self._can_return_nan(skipna=skipna): return cudf.utils.dtypes._get_nan_for_dtype(self.dtype) - self = self.nans_to_nulls().dropna() # type: ignore + self = self.nans_to_nulls().dropna() if len(self) < 4: return cudf.utils.dtypes._get_nan_for_dtype(self.dtype) @@ -74,7 +74,7 @@ def skew(self, skipna: Optional[bool] = None) -> ScalarLike: if len(self) == 0 or self._can_return_nan(skipna=skipna): return cudf.utils.dtypes._get_nan_for_dtype(self.dtype) - self = self.nans_to_nulls().dropna() # type: ignore + self = self.nans_to_nulls().dropna() if len(self) < 3: return cudf.utils.dtypes._get_nan_for_dtype(self.dtype) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index ecfcec15337..d898eb4b9c3 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -420,10 +420,7 @@ def _scan(self, op, axis=None, skipna=True): results = {} for name, col in self._data.items(): if skipna: - try: - result_col = col.nans_to_nulls() - except AttributeError: - result_col = col + result_col = col.nans_to_nulls() else: if col.has_nulls(include_nan=True): first_index = col.isnull().find_first_value(True) @@ -1915,12 +1912,12 @@ def nans_to_nulls(self): 1 3.14 2 """ - result = ( - col.nans_to_nulls() - if isinstance(col, cudf.core.column.NumericalColumn) - else col.copy() - for col in self._data.columns - ) + result = [] + for col in self._data.columns: + converted = col.nans_to_nulls() + if converted is col: + converted = converted.copy() + result.append(converted) return self._from_data_like_self( 
self._data._from_columns_like_self(result) ) @@ -4228,10 +4225,7 @@ def _drop_na_columns(self, how="any", subset=None, thresh=None): thresh = len(df) for name, col in df._data.items(): - try: - check_col = col.nans_to_nulls() - except AttributeError: - check_col = col + check_col = col.nans_to_nulls() no_threshold_valid_count = ( len(col) - check_col.null_count ) < thresh @@ -4261,12 +4255,7 @@ def _drop_na_rows(self, how="any", subset=None, thresh=None): if len(subset) == 0: return self.copy(deep=True) - data_columns = [ - col.nans_to_nulls() - if isinstance(col, cudf.core.column.NumericalColumn) - else col - for col in self._columns - ] + data_columns = [col.nans_to_nulls() for col in self._columns] return self._from_columns_like_self( libcudf.stream_compaction.drop_nulls( diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index d4772d5b4c2..53239cb7ea0 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -1210,9 +1210,7 @@ def _get_unique(column, dummy_na): else: unique = column.unique().sort_values() if not dummy_na: - if np.issubdtype(unique.dtype, np.floating): - unique = unique.nans_to_nulls() - unique = unique.dropna() + unique = unique.nans_to_nulls().dropna() return unique diff --git a/python/cudf/cudf/tests/test_replace.py b/python/cudf/cudf/tests/test_replace.py index d77ec596271..9466398964a 100644 --- a/python/cudf/cudf/tests/test_replace.py +++ b/python/cudf/cudf/tests/test_replace.py @@ -6,6 +6,7 @@ import numpy as np import pandas as pd +import pyarrow as pa import pytest import cudf @@ -1370,3 +1371,10 @@ def test_fillna_columns_multiindex(): actual = gdf.fillna(10) assert_eq(expected, actual) + + +def test_fillna_nan_and_null(): + ser = cudf.Series(pa.array([float("nan"), None, 1.1]), nan_as_null=False) + result = ser.fillna(2.2) + expected = cudf.Series([2.2, 2.2, 1.1]) + assert_eq(result, expected) diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 323716d5fc3..f47c42d9a1d 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -2841,3 +2841,10 @@ def test_series_from_series_index_no_shallow_copy(): ser1 = cudf.Series(range(3), index=list("abc")) ser2 = cudf.Series(ser1) assert ser1.index is ser2.index + + +@pytest.mark.parametrize("value", [1, 1.1]) +def test_nans_to_nulls_noop_copies_column(value): + ser1 = cudf.Series([value]) + ser2 = ser1.nans_to_nulls() + assert ser1._column is not ser2._column From 39c5b86645dc61bf0c59d7bf733ca13872b46a44 Mon Sep 17 00:00:00 2001 From: Nghia Truong <7416935+ttnghia@users.noreply.github.com> Date: Fri, 7 Jun 2024 10:53:53 -0700 Subject: [PATCH 062/340] Handling for `NaN` and `inf` when converting floating point to fixed point types (#15885) This PR adds the ability to check for `NaN` and `inf` values when converting floating point types to fixed point types. For these input values, the corresponding output will be `null`. Closes https://github.com/rapidsai/cudf/issues/15883. 
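As a rough illustration of the new behavior from the Python layer (a sketch, not part of this patch; it assumes the float-to-decimal `astype` routes through `cudf::cast` as modified here):

```python
import cudf

s = cudf.Series([1.729, float("nan"), float("-inf"), 172.9])
d = s.astype(cudf.Decimal64Dtype(precision=9, scale=3))
# NaN and +/-inf have no fixed-point representation, so with this change
# the corresponding output rows become null instead of undefined values.
print(d)  # -> 1.729, <NA>, <NA>, 172.900
```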
Authors:
  - Nghia Truong (https://github.com/ttnghia)

Approvers:
  - Paul Mattione (https://github.com/pmattione-nvidia)
  - Shruti Shivakumar (https://github.com/shrshi)

URL: https://github.com/rapidsai/cudf/pull/15885
---
 cpp/src/unary/cast_ops.cu      | 43 ++++++++++++++++++++++++++++++++--
 cpp/tests/unary/cast_tests.cpp | 21 +++++++++++++++++
 2 files changed, 62 insertions(+), 2 deletions(-)

diff --git a/cpp/src/unary/cast_ops.cu b/cpp/src/unary/cast_ops.cu
index 98c412f805d..64427326d87 100644
--- a/cpp/src/unary/cast_ops.cu
+++ b/cpp/src/unary/cast_ops.cu
@@ -15,11 +15,13 @@
  */

 #include 
+#include 
 #include 
 #include 
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -219,6 +221,28 @@ std::unique_ptr<column> rescale(column_view input,
   }
 };

+/**
+ * @brief Check if a floating point value is convertible to fixed point type.
+ *
+ * A floating point value is convertible if it is not null, not `NaN`, and not `inf`.
+ *
+ * Note that convertible input values may be out of the representable range of the target fixed
+ * point type. Values out of the representable range need to be checked separately.
+ */
+template <typename FloatType>
+struct is_convertible_floating_point {
+  column_device_view d_input;
+
+  bool __device__ operator()(size_type idx) const
+  {
+    static_assert(std::is_floating_point_v<FloatType>);
+
+    if (d_input.is_null(idx)) { return false; }
+    auto const value = d_input.element<FloatType>(idx);
+    return std::isfinite(value);
+  }
+};
+
 template <typename SourceT>
 struct dispatch_unary_cast_to {
   column_view input;
@@ -294,8 +318,8 @@ struct dispatch_unary_cast_to {
     std::make_unique<column>(type,
                              size,
                              rmm::device_buffer{size * cudf::size_of(type), stream, mr},
-                             detail::copy_bitmask(input, stream, mr),
-                             input.null_count());
+                             rmm::device_buffer{},
+                             0);

     mutable_column_view output_mutable = *output;

       output_mutable.begin(),
       fixed_point_unary_cast{scale});

+    if constexpr (cudf::is_floating_point<SourceT>()) {
+      // For floating-point values, besides input nulls, we also need to set nulls for the output
+      // rows corresponding to NaN and inf in the input.
+      auto const d_input_ptr = column_device_view::create(input, stream);
+      auto [null_mask, null_count] =
+        cudf::detail::valid_if(thrust::make_counting_iterator(0),
+                               thrust::make_counting_iterator(size),
+                               is_convertible_floating_point<SourceT>{*d_input_ptr},
+                               stream,
+                               mr);
+      if (null_count > 0) { output->set_null_mask(std::move(null_mask), null_count); }
+    } else {
+      output->set_null_mask(detail::copy_bitmask(input, stream, mr), input.null_count());
+    }
+
     return output;
   }

diff --git a/cpp/tests/unary/cast_tests.cpp b/cpp/tests/unary/cast_tests.cpp
index a82449ffc10..ebeafc82039 100644
--- a/cpp/tests/unary/cast_tests.cpp
+++ b/cpp/tests/unary/cast_tests.cpp
@@ -665,6 +665,27 @@ TYPED_TEST(FixedPointTests, CastFromDouble)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }

+TYPED_TEST(FixedPointTests, CastFromDoubleWithNaNAndInf)
+{
+  using namespace numeric;
+  using decimalXX  = TypeParam;
+  using RepType    = cudf::device_storage_type_t<decimalXX>;
+  using fp_wrapper = cudf::test::fixed_point_column_wrapper<RepType>;
+  using fw_wrapper = cudf::test::fixed_width_column_wrapper<double>;
+
+  auto const NaN  = std::numeric_limits<double>::quiet_NaN();
+  auto const inf  = std::numeric_limits<double>::infinity();
+  auto const null = 0;
+
+  auto const input    = fw_wrapper{1.729, -inf, NaN, 172.9, -inf, NaN, inf, 1.23, inf};
+  auto const expected = fp_wrapper{{1729, null, null, 172900, null, null, null, 1230, null},
+                                   {true, false, false, true, false, false, false, true, false},
+                                   scale_type{-3}};
+  auto const result = cudf::cast(input, make_fixed_point_data_type<decimalXX>(-3));
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
+}
+
 TYPED_TEST(FixedPointTests, CastFromDoubleLarge)
 {
   using namespace numeric;
From 0067444597127f23a09a349f1c97dc33b9ec3958 Mon Sep 17 00:00:00 2001
From: Matthew Murray <41342305+Matt711@users.noreply.github.com>
Date: Fri, 7 Jun 2024 16:10:22 -0400
Subject: [PATCH 063/340] cudf.pandas documentation improvement (#15948)

Added some more detail about the generality of the fast-slow proxy scheme,
based on a suggestion from @wence-

Authors:
  - Matthew Murray (https://github.com/Matt711)

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)

URL: https://github.com/rapidsai/cudf/pull/15948
---
 docs/cudf/source/developer_guide/cudf_pandas.md | 12 ++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/docs/cudf/source/developer_guide/cudf_pandas.md b/docs/cudf/source/developer_guide/cudf_pandas.md
index aeb43f66b2d..827ba18a4a4 100644
--- a/docs/cudf/source/developer_guide/cudf_pandas.md
+++ b/docs/cudf/source/developer_guide/cudf_pandas.md
@@ -3,8 +3,16 @@ The use of the cuDF pandas accelerator mode (`cudf.pandas`) is explained [in the
 The purpose of this document is to explain how the fast-slow proxy mechanism works and document internal environment variables that can be used to debug `cudf.pandas` itself.

 ## fast-slow proxy mechanism
-`cudf.pandas` works by wrapping each Pandas type and its corresponding cuDF type in a new proxy type also known as a fast-slow proxy type.
-The purpose of proxy types is to attempt computations on the fast (cuDF) object first, and then fall back to running on the slow (Pandas) object if the fast version fails.
+`cudf.pandas` works by wrapping each "slow" type and its corresponding "fast" type in a new proxy type, also known as a fast-slow proxy type. +The purpose of these proxy types is so we can first attempt computations on the fast object, and then fall back to the slow object if the fast version fails. +While the core wrapping functionality is generic, the current usage mainly involves providing a proxy pair using cuDF and Pandas. +In the rest of this document, to maintain a concrete pair of libraries in mind, we use cuDF and Pandas interchangeably as names for the "fast" and "slow" libraries, respectively, with the understanding that any pair of API-matching libraries could be used. +For example, future support could include pairs such as CuPy (as the "fast" library) and NumPy (as the "slow" library). + +```{note} +We currently do not wrap the entire NumPy library because it exposes a C API. But we do wrap NumPy's `numpy.ndarray` and CuPy's `cupy.ndarray` in a proxy type. +``` ### Types: #### Wrapped Types and Proxy Types From 139ed6c3085feac8116085e35c7897cad141ce69 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 7 Jun 2024 10:49:05 -1000 Subject: [PATCH 064/340] Add __array_interface__ to cudf.pandas numpy.ndarray proxy (#15936) closes #15926 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Matthew Murray (https://github.com/Matt711) URL: https://github.com/rapidsai/cudf/pull/15936 --- python/cudf/cudf/pandas/_wrappers/common.py | 5 +++++ python/cudf/cudf/pandas/_wrappers/numpy.py | 2 ++ 2 files changed, 7 insertions(+) diff --git a/python/cudf/cudf/pandas/_wrappers/common.py b/python/cudf/cudf/pandas/_wrappers/common.py index 468c5687c15..66a51a83896 100644 --- a/python/cudf/cudf/pandas/_wrappers/common.py +++ b/python/cudf/cudf/pandas/_wrappers/common.py @@ -46,5 +46,10 @@ def cuda_array_interface(self: _FastSlowProxy): return self._fsproxy_fast.__cuda_array_interface__ +@property # type: ignore +def array_interface(self: _FastSlowProxy): + return self._fsproxy_slow.__array_interface__ + + def custom_iter(self: _FastSlowProxy): return iter(self._fsproxy_slow) diff --git a/python/cudf/cudf/pandas/_wrappers/numpy.py b/python/cudf/cudf/pandas/_wrappers/numpy.py index 94298872213..c445be46f58 100644 --- a/python/cudf/cudf/pandas/_wrappers/numpy.py +++ b/python/cudf/cudf/pandas/_wrappers/numpy.py @@ -15,6 +15,7 @@ make_intermediate_proxy_type, ) from .common import ( + array_interface, array_method, arrow_array_method, cuda_array_interface, @@ -115,6 +116,7 @@ def wrap_ndarray(cls, arr: cupy.ndarray | numpy.ndarray, constructor): # So that pa.array(wrapped-numpy-array) works "__arrow_array__": arrow_array_method, "__cuda_array_interface__": cuda_array_interface, + "__array_interface__": array_interface, # ndarrays are unhashable "__hash__": None, # iter(cupy-array) produces an iterable of zero-dim device From 8e40fe7e6b01a399c3ea406a59d4cbcbc9bfce5c Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Fri, 7 Jun 2024 16:08:42 -0700 Subject: [PATCH 065/340] Remove unused parsing utilities (#15955) Some parsing utilities have been unused since legacy JSON removal. This PR removes these functions. 
Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Bradley Dice (https://github.com/bdice) - Nghia Truong (https://github.com/ttnghia) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/15955 --- cpp/CMakeLists.txt | 1 - cpp/src/io/utilities/parsing_utils.cu | 221 ------------------------- cpp/src/io/utilities/parsing_utils.cuh | 76 --------- 3 files changed, 298 deletions(-) delete mode 100644 cpp/src/io/utilities/parsing_utils.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index f637db66c2c..ca85996b990 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -439,7 +439,6 @@ add_library( src/io/utilities/data_sink.cpp src/io/utilities/datasource.cpp src/io/utilities/file_io_utilities.cpp - src/io/utilities/parsing_utils.cu src/io/utilities/row_selection.cpp src/io/utilities/type_inference.cu src/io/utilities/trie.cu diff --git a/cpp/src/io/utilities/parsing_utils.cu b/cpp/src/io/utilities/parsing_utils.cu deleted file mode 100644 index cb8be380c5b..00000000000 --- a/cpp/src/io/utilities/parsing_utils.cu +++ /dev/null @@ -1,221 +0,0 @@ -/* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include - -#include - -#include - -#include - -namespace cudf { -namespace io { -namespace { -// When processing the input in chunks, this is the maximum size of each chunk. -// Only one chunk is loaded on the GPU at a time, so this value is chosen to -// be small enough to fit on the GPU in most cases. -constexpr size_t max_chunk_bytes = 256 * 1024 * 1024; // 256MB - -constexpr int bytes_per_find_thread = 64; - -using pos_key_pair = thrust::pair; - -template -constexpr T divCeil(T dividend, T divisor) noexcept -{ - return (dividend + divisor - 1) / divisor; -} - -/** - * @brief Sets the specified element of the array to the passed value - */ -template -__device__ __forceinline__ void setElement(T* array, cudf::size_type idx, T const& t, V const&) -{ - array[idx] = t; -} - -/** - * @brief Sets the specified element of the array of pairs using the two passed - * parameters. - */ -template -__device__ __forceinline__ void setElement(thrust::pair* array, - cudf::size_type idx, - T const& t, - V const& v) -{ - array[idx] = {t, v}; -} - -/** - * @brief Overloads the setElement() functions for void* arrays. - * Does not do anything, indexing is not allowed with void* arrays. - */ -template -__device__ __forceinline__ void setElement(void*, cudf::size_type, T const&, V const&) -{ -} - -/** - * @brief CUDA kernel that finds all occurrences of a character in the given - * character array. If the 'positions' parameter is not void*, - * positions of all occurrences are stored in the output array. 
- * - * @param[in] data Pointer to the input character array - * @param[in] size Number of bytes in the input array - * @param[in] offset Offset to add to the output positions - * @param[in] key Character to find in the array - * @param[in,out] count Pointer to the number of found occurrences - * @param[out] positions Array containing the output positions - */ -template -CUDF_KERNEL void count_and_set_positions(char const* data, - uint64_t size, - uint64_t offset, - char const key, - cudf::size_type* count, - T* positions) -{ - // thread IDs range per block, so also need the block id - auto const tid = cudf::detail::grid_1d::global_thread_id(); - auto const did = tid * bytes_per_find_thread; - - char const* raw = (data + did); - - long const byteToProcess = - ((did + bytes_per_find_thread) < size) ? bytes_per_find_thread : (size - did); - - // Process the data - for (long i = 0; i < byteToProcess; i++) { - if (raw[i] == key) { - auto const idx = atomicAdd(count, static_cast(1)); - setElement(positions, idx, did + offset + i, key); - } - } -} - -} // namespace - -template -cudf::size_type find_all_from_set(device_span data, - std::vector const& keys, - uint64_t result_offset, - T* positions, - rmm::cuda_stream_view stream) -{ - int block_size = 0; // suggested thread count to use - int min_grid_size = 0; // minimum block count required - CUDF_CUDA_TRY( - cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size, count_and_set_positions)); - int const grid_size = divCeil(data.size(), (size_t)block_size); - - auto d_count = cudf::detail::make_zeroed_device_uvector_async( - 1, stream, rmm::mr::get_current_device_resource()); - for (char key : keys) { - count_and_set_positions<<>>( - data.data(), data.size(), result_offset, key, d_count.data(), positions); - } - - return cudf::detail::make_std_vector_sync(d_count, stream)[0]; -} - -template -cudf::size_type find_all_from_set(host_span data, - std::vector const& keys, - uint64_t result_offset, - T* positions, - rmm::cuda_stream_view stream) -{ - rmm::device_buffer d_chunk(std::min(max_chunk_bytes, data.size()), stream); - auto d_count = cudf::detail::make_zeroed_device_uvector_async( - 1, stream, rmm::mr::get_current_device_resource()); - - int block_size = 0; // suggested thread count to use - int min_grid_size = 0; // minimum block count required - CUDF_CUDA_TRY( - cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size, count_and_set_positions)); - - size_t const chunk_count = divCeil(data.size(), max_chunk_bytes); - for (size_t ci = 0; ci < chunk_count; ++ci) { - auto const chunk_offset = ci * max_chunk_bytes; - auto const h_chunk = data.data() + chunk_offset; - int const chunk_bytes = std::min((size_t)(data.size() - ci * max_chunk_bytes), max_chunk_bytes); - auto const chunk_bits = divCeil(chunk_bytes, bytes_per_find_thread); - int const grid_size = divCeil(chunk_bits, block_size); - - // Copy chunk to device - CUDF_CUDA_TRY( - cudaMemcpyAsync(d_chunk.data(), h_chunk, chunk_bytes, cudaMemcpyDefault, stream.value())); - - for (char key : keys) { - count_and_set_positions - <<>>(static_cast(d_chunk.data()), - chunk_bytes, - chunk_offset + result_offset, - key, - d_count.data(), - positions); - } - } - - return cudf::detail::make_std_vector_sync(d_count, stream)[0]; -} - -template cudf::size_type find_all_from_set(device_span data, - std::vector const& keys, - uint64_t result_offset, - uint64_t* positions, - rmm::cuda_stream_view stream); - -template cudf::size_type find_all_from_set(device_span data, - std::vector const& keys, - 
uint64_t result_offset, - pos_key_pair* positions, - rmm::cuda_stream_view stream); - -template cudf::size_type find_all_from_set(host_span data, - std::vector const& keys, - uint64_t result_offset, - uint64_t* positions, - rmm::cuda_stream_view stream); - -template cudf::size_type find_all_from_set(host_span data, - std::vector const& keys, - uint64_t result_offset, - pos_key_pair* positions, - rmm::cuda_stream_view stream); - -cudf::size_type count_all_from_set(device_span data, - std::vector const& keys, - rmm::cuda_stream_view stream) -{ - return find_all_from_set(data, keys, 0, nullptr, stream); -} - -cudf::size_type count_all_from_set(host_span data, - std::vector const& keys, - rmm::cuda_stream_view stream) -{ - return find_all_from_set(data, keys, 0, nullptr, stream); -} - -} // namespace io -} // namespace cudf diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh index faee05541cc..bc2722441d0 100644 --- a/cpp/src/io/utilities/parsing_utils.cuh +++ b/cpp/src/io/utilities/parsing_utils.cuh @@ -414,82 +414,6 @@ __device__ __inline__ cudf::size_type* infer_integral_field_counter(char const* } // namespace gpu -/** - * @brief Searches the input character array for each of characters in a set. - * Sums up the number of occurrences. If the 'positions' parameter is not void*, - * positions of all occurrences are stored in the output device array. - * - * @param[in] d_data Input character array in device memory - * @param[in] keys Vector containing the keys to count in the buffer - * @param[in] result_offset Offset to add to the output positions - * @param[out] positions Array containing the output positions - * @param[in] stream CUDA stream used for device memory operations and kernel launches - * - * @return cudf::size_type total number of occurrences - */ -template -cudf::size_type find_all_from_set(device_span data, - std::vector const& keys, - uint64_t result_offset, - T* positions, - rmm::cuda_stream_view stream); - -/** - * @brief Searches the input character array for each of characters in a set. - * Sums up the number of occurrences. If the 'positions' parameter is not void*, - * positions of all occurrences are stored in the output device array. - * - * Does not load the entire file into the GPU memory at any time, so it can - * be used to parse large files. Output array needs to be preallocated. - * - * @param[in] h_data Pointer to the input character array - * @param[in] h_size Number of bytes in the input array - * @param[in] keys Vector containing the keys to count in the buffer - * @param[in] result_offset Offset to add to the output positions - * @param[out] positions Array containing the output positions - * @param[in] stream CUDA stream used for device memory operations and kernel launches - * - * @return cudf::size_type total number of occurrences - */ -template -cudf::size_type find_all_from_set(host_span data, - std::vector const& keys, - uint64_t result_offset, - T* positions, - rmm::cuda_stream_view stream); - -/** - * @brief Searches the input character array for each of characters in a set - * and sums up the number of occurrences. 
- * - * @param d_data Input data buffer in device memory - * @param keys Vector containing the keys to count in the buffer - * @param stream CUDA stream used for device memory operations and kernel launches - * - * @return cudf::size_type total number of occurrences - */ -cudf::size_type count_all_from_set(device_span data, - std::vector const& keys, - rmm::cuda_stream_view stream); - -/** - * @brief Searches the input character array for each of characters in a set - * and sums up the number of occurrences. - * - * Does not load the entire buffer into the GPU memory at any time, so it can - * be used with buffers of any size. - * - * @param h_data Pointer to the data in host memory - * @param h_size Size of the input data, in bytes - * @param keys Vector containing the keys to count in the buffer - * @param stream CUDA stream used for device memory operations and kernel launches - * - * @return cudf::size_type total number of occurrences - */ -cudf::size_type count_all_from_set(host_span data, - std::vector const& keys, - rmm::cuda_stream_view stream); - /** * @brief Checks whether the given character is a whitespace character. * From bfad68c66fba06cb87327265b8b74ab329c58e4e Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Sun, 9 Jun 2024 09:17:12 -0400 Subject: [PATCH 066/340] Add an Environment Variable for debugging the fast path in cudf.pandas (#15837) Part of #14975 This PR adds a pandas debugging option to `_fast_slow_function_call` that runs the slow path after the fast and returns a warning if the results differ. Authors: - Matthew Murray (https://github.com/Matt711) - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Bradley Dice (https://github.com/bdice) - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/15837 --- python/cudf/cudf/pandas/fast_slow_proxy.py | 63 ++++++++++++++++-- .../cudf_pandas_tests/test_cudf_pandas.py | 64 ++++++++++++++++++- 2 files changed, 121 insertions(+), 6 deletions(-) diff --git a/python/cudf/cudf/pandas/fast_slow_proxy.py b/python/cudf/cudf/pandas/fast_slow_proxy.py index 169dd80e132..5f4cf2e6cc6 100644 --- a/python/cudf/cudf/pandas/fast_slow_proxy.py +++ b/python/cudf/cudf/pandas/fast_slow_proxy.py @@ -9,6 +9,7 @@ import operator import pickle import types +import warnings from collections.abc import Iterator from enum import IntEnum from typing import ( @@ -23,6 +24,10 @@ Type, ) +import numpy as np + +from ..options import _env_get_bool +from ..testing._utils import assert_eq from .annotation import nvtx @@ -808,7 +813,9 @@ def __get__(self, instance, owner) -> Any: else: # for anything else, use a fast-slow attribute: self._attr, _ = _fast_slow_function_call( - getattr, owner, self._name + getattr, + owner, + self._name, ) if isinstance( @@ -829,9 +836,11 @@ def __get__(self, instance, owner) -> Any: getattr(instance._fsproxy_slow, self._name), None, # type: ignore ) - return _fast_slow_function_call(getattr, instance, self._name)[ - 0 - ] + return _fast_slow_function_call( + getattr, + instance, + self._name, + )[0] return self._attr @@ -866,7 +875,17 @@ def __name__(self, value): setattr(self._fsproxy_slow, "__name__", value) -def _fast_slow_function_call(func: Callable, /, *args, **kwargs) -> Any: +def _assert_fast_slow_eq(left, right): + if _is_final_type(type(left)) or type(left) in NUMPY_TYPES: + assert_eq(left, right) + + +def _fast_slow_function_call( + func: Callable, + /, + *args, + **kwargs, +) -> Any: """ Call `func` with all 
`args` and `kwargs` converted to their respective fast type. If that fails, call `func` with all @@ -890,6 +909,37 @@ def _fast_slow_function_call(func: Callable, /, *args, **kwargs) -> Any: # try slow path raise Exception() fast = True + if _env_get_bool("CUDF_PANDAS_DEBUGGING", False): + try: + with nvtx.annotate( + "EXECUTE_SLOW_DEBUG", + color=_CUDF_PANDAS_NVTX_COLORS["EXECUTE_SLOW"], + domain="cudf_pandas", + ): + slow_args, slow_kwargs = ( + _slow_arg(args), + _slow_arg(kwargs), + ) + with disable_module_accelerator(): + slow_result = func(*slow_args, **slow_kwargs) + except Exception as e: + warnings.warn( + "The result from pandas could not be computed. " + f"The exception was {e}." + ) + else: + try: + _assert_fast_slow_eq(result, slow_result) + except AssertionError as e: + warnings.warn( + "The results from cudf and pandas were different. " + f"The exception was {e}." + ) + except Exception as e: + warnings.warn( + "Pandas debugging mode failed. " + f"The exception was {e}." + ) except Exception: with nvtx.annotate( "EXECUTE_SLOW", @@ -1135,6 +1185,9 @@ def _replace_closurevars( ) +NUMPY_TYPES: Set[str] = set(np.sctypeDict.values()) + + _SPECIAL_METHODS: Set[str] = { "__abs__", "__add__", diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py index fef829b17fc..72e9ad5fca3 100644 --- a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py +++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py @@ -41,8 +41,9 @@ get_calendar, ) -# Accelerated pandas has the real pandas module as an attribute +# Accelerated pandas has the real pandas and cudf modules as attributes pd = xpd._fsproxy_slow +cudf = xpd._fsproxy_fast @pytest.fixture @@ -1424,5 +1425,66 @@ def test_holidays_within_dates(holiday, start, expected): ) == [utc.localize(dt) for dt in expected] +def test_cudf_pandas_debugging_different_results(monkeypatch): + cudf_mean = cudf.Series.mean + + def mock_mean_one(self, *args, **kwargs): + return np.float64(1.0) + + with monkeypatch.context() as monkeycontext: + monkeypatch.setattr(xpd.Series.mean, "_fsproxy_fast", mock_mean_one) + monkeycontext.setenv("CUDF_PANDAS_DEBUGGING", "True") + s = xpd.Series([1, 2]) + with pytest.warns( + UserWarning, + match="The results from cudf and pandas were different.", + ): + assert s.mean() == 1.0 + # Must explicitly undo the patch. Proxy dispatch doesn't work with monkeypatch contexts. + monkeypatch.setattr(xpd.Series.mean, "_fsproxy_fast", cudf_mean) + + +def test_cudf_pandas_debugging_pandas_error(monkeypatch): + pd_mean = pd.Series.mean + + def mock_mean_exception(self, *args, **kwargs): + raise Exception() + + with monkeypatch.context() as monkeycontext: + monkeycontext.setattr( + xpd.Series.mean, "_fsproxy_slow", mock_mean_exception + ) + monkeycontext.setenv("CUDF_PANDAS_DEBUGGING", "True") + s = xpd.Series([1, 2]) + with pytest.warns( + UserWarning, + match="The result from pandas could not be computed.", + ): + s = xpd.Series([1, 2]) + assert s.mean() == 1.5 + # Must explicitly undo the patch. Proxy dispatch doesn't work with monkeypatch contexts. 
+ monkeypatch.setattr(xpd.Series.mean, "_fsproxy_slow", pd_mean) + + +def test_cudf_pandas_debugging_failed(monkeypatch): + pd_mean = pd.Series.mean + + def mock_mean_none(self, *args, **kwargs): + return None + + with monkeypatch.context() as monkeycontext: + monkeycontext.setattr(xpd.Series.mean, "_fsproxy_slow", mock_mean_none) + monkeycontext.setenv("CUDF_PANDAS_DEBUGGING", "True") + s = xpd.Series([1, 2]) + with pytest.warns( + UserWarning, + match="Pandas debugging mode failed.", + ): + s = xpd.Series([1, 2]) + assert s.mean() == 1.5 + # Must explicitly undo the patch. Proxy dispatch doesn't work with monkeypatch contexts. + monkeypatch.setattr(xpd.Series.mean, "_fsproxy_slow", pd_mean) + + def test_excelwriter_pathlike(): assert isinstance(pd.ExcelWriter("foo.xlsx"), os.PathLike) From c02260f2fb1c162eabf0da0604cc6f08f2cc74ff Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Sun, 9 Jun 2024 22:09:44 -0700 Subject: [PATCH 067/340] Refactor Parquet writer options and builders (#15831) Adding options to the Parquet writer is made somewhat tedious by the duplication of code between the two current sets of options/builder classes; one each for the chunked and non-chunked Parquet writers. This PR pulls common options into a parent options class, and common setters into a parent builder class. The builder parent uses CRTP to allow chaining of options. Authors: - Ed Seidl (https://github.com/etseidl) - Vyas Ramasubramani (https://github.com/vyasr) - Mike Wilson (https://github.com/hyperbolic2346) - Muhammad Haseeb (https://github.com/mhaseeb123) - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - Vukasin Milovanovic (https://github.com/vuule) - Muhammad Haseeb (https://github.com/mhaseeb123) - Mike Wilson (https://github.com/hyperbolic2346) URL: https://github.com/rapidsai/cudf/pull/15831 --- cpp/include/cudf/io/parquet.hpp | 906 ++++-------------- cpp/src/io/functions.cpp | 271 ++++-- .../_lib/pylibcudf/libcudf/io/parquet.pxd | 173 ++-- 3 files changed, 410 insertions(+), 940 deletions(-) diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp index b2f949cdcee..51eeed5b721 100644 --- a/cpp/include/cudf/io/parquet.hpp +++ b/cpp/include/cudf/io/parquet.hpp @@ -29,6 +29,7 @@ #include #include #include +#include #include namespace cudf::io { @@ -576,22 +577,16 @@ struct sorting_column { bool is_nulls_first{true}; //!< true if nulls come before non-null values }; -class parquet_writer_options_builder; - /** - * @brief Settings for `write_parquet()`. + * @brief Base settings for `write_parquet()` and `parquet_chunked_writer`. */ -class parquet_writer_options { +class parquet_writer_options_base { // Specify the sink to use for writer output sink_info _sink; // Specify the compression format to use compression_type _compression = compression_type::SNAPPY; // Specify the level of statistics in the output file statistics_freq _stats_level = statistics_freq::STATISTICS_ROWGROUP; - // Sets of columns to output - table_view _table; - // Partitions described as {start_row, num_rows} pairs - std::vector _partitions; // Optional associated metadata std::optional _metadata; // Optional footer key_value_metadata @@ -602,8 +597,6 @@ class parquet_writer_options { // Parquet writer can write timestamps as UTC // Defaults to true because libcudf timestamps are implicitly UTC bool _write_timestamps_as_UTC = true; - // Column chunks file paths to be set in the raw output metadata. 
One per output file - std::vector _column_chunks_file_paths; // Maximum size of each row group (unless smaller than a single page) size_t _row_group_size_bytes = default_row_group_size_bytes; // Maximum number of rows in row group (unless smaller than a single page) @@ -627,18 +620,13 @@ class parquet_writer_options { // Which columns in _table are used for sorting std::optional> _sorting_columns; + protected: /** - * @brief Constructor from sink and table. + * @brief Constructor from sink. * * @param sink The sink used for writer output - * @param table Table to be written to output */ - explicit parquet_writer_options(sink_info const& sink, table_view const& table) - : _sink(sink), _table(table) - { - } - - friend parquet_writer_options_builder; + explicit parquet_writer_options_base(sink_info const& sink) : _sink(sink) {} public: /** @@ -646,24 +634,7 @@ class parquet_writer_options { * * This has been added since Cython requires a default constructor to create objects on stack. */ - parquet_writer_options() = default; - - /** - * @brief Create builder to create `parquet_writer_options`. - * - * @param sink The sink used for writer output - * @param table Table to be written to output - * - * @return Builder to build parquet_writer_options - */ - static parquet_writer_options_builder builder(sink_info const& sink, table_view const& table); - - /** - * @brief Create builder to create `parquet_writer_options`. - * - * @return parquet_writer_options_builder - */ - static parquet_writer_options_builder builder(); + parquet_writer_options_base() = default; /** * @brief Returns sink info. @@ -686,20 +657,6 @@ class parquet_writer_options { */ [[nodiscard]] statistics_freq get_stats_level() const { return _stats_level; } - /** - * @brief Returns table_view. - * - * @return Table view - */ - [[nodiscard]] table_view get_table() const { return _table; } - - /** - * @brief Returns partitions. - * - * @return Partitions - */ - [[nodiscard]] std::vector const& get_partitions() const { return _partitions; } - /** * @brief Returns associated metadata. * @@ -712,7 +669,8 @@ class parquet_writer_options { * * @return Key-Value footer metadata information */ - std::vector> const& get_key_value_metadata() const + [[nodiscard]] std::vector> const& get_key_value_metadata() + const { return _user_data; } @@ -722,7 +680,7 @@ class parquet_writer_options { * * @return `true` if timestamps will be written as INT96 */ - bool is_enabled_int96_timestamps() const { return _write_timestamps_as_int96; } + [[nodiscard]] bool is_enabled_int96_timestamps() const { return _write_timestamps_as_int96; } /** * @brief Returns `true` if timestamps will be written as UTC @@ -731,29 +689,19 @@ class parquet_writer_options { */ [[nodiscard]] auto is_enabled_utc_timestamps() const { return _write_timestamps_as_UTC; } - /** - * @brief Returns Column chunks file paths to be set in the raw output metadata. - * - * @return Column chunks file paths to be set in the raw output metadata - */ - std::vector const& get_column_chunks_file_paths() const - { - return _column_chunks_file_paths; - } - /** * @brief Returns maximum row group size, in bytes. * * @return Maximum row group size, in bytes */ - auto get_row_group_size_bytes() const { return _row_group_size_bytes; } + [[nodiscard]] auto get_row_group_size_bytes() const { return _row_group_size_bytes; } /** * @brief Returns maximum row group size, in rows. 
* * @return Maximum row group size, in rows */ - auto get_row_group_size_rows() const { return _row_group_size_rows; } + [[nodiscard]] auto get_row_group_size_rows() const { return _row_group_size_rows; } /** * @brief Returns the maximum uncompressed page size, in bytes. @@ -762,7 +710,7 @@ class parquet_writer_options { * * @return Maximum uncompressed page size, in bytes */ - auto get_max_page_size_bytes() const + [[nodiscard]] auto get_max_page_size_bytes() const { return std::min(_max_page_size_bytes, get_row_group_size_bytes()); } @@ -774,7 +722,7 @@ class parquet_writer_options { * * @return Maximum page size, in rows */ - auto get_max_page_size_rows() const + [[nodiscard]] auto get_max_page_size_rows() const { return std::min(_max_page_size_rows, get_row_group_size_rows()); } @@ -784,7 +732,10 @@ class parquet_writer_options { * * @return length min/max will be truncated to */ - auto get_column_index_truncate_length() const { return _column_index_truncate_length; } + [[nodiscard]] auto get_column_index_truncate_length() const + { + return _column_index_truncate_length; + } /** * @brief Returns policy for dictionary use. @@ -831,20 +782,12 @@ class parquet_writer_options { */ [[nodiscard]] auto const& get_sorting_columns() const { return _sorting_columns; } - /** - * @brief Sets partitions. - * - * @param partitions Partitions of input table in {start_row, num_rows} pairs. If specified, must - * be same size as number of sinks in sink_info - */ - void set_partitions(std::vector partitions); - /** * @brief Sets metadata. * * @param metadata Associated metadata */ - void set_metadata(table_input_metadata metadata) { _metadata = std::move(metadata); } + void set_metadata(table_input_metadata metadata); /** * @brief Sets metadata. @@ -858,14 +801,13 @@ class parquet_writer_options { * * @param sf Level of statistics requested in the output file */ - void set_stats_level(statistics_freq sf) { _stats_level = sf; } - + void set_stats_level(statistics_freq sf); /** * @brief Sets compression type. * * @param compression The compression type to use */ - void set_compression(compression_type compression) { _compression = compression; } + void set_compression(compression_type compression); /** * @brief Sets timestamp writing preferences. INT96 timestamps will be written @@ -873,22 +815,14 @@ class parquet_writer_options { * * @param req Boolean value to enable/disable writing of INT96 timestamps */ - void enable_int96_timestamps(bool req) { _write_timestamps_as_int96 = req; } + void enable_int96_timestamps(bool req); /** * @brief Sets preference for writing timestamps as UTC. Write timestamps as UTC if set to `true`. * * @param val Boolean value to enable/disable writing of timestamps as UTC. */ - void enable_utc_timestamps(bool val) { _write_timestamps_as_UTC = val; } - - /** - * @brief Sets column chunks file path to be set in the raw output metadata. - * - * @param file_paths Vector of Strings which indicates file path. Must be same size as number of - * data sinks in sink info - */ - void set_column_chunks_file_paths(std::vector file_paths); + void enable_utc_timestamps(bool val); /** * @brief Sets the maximum row group size, in bytes. 
@@ -951,116 +885,84 @@ class parquet_writer_options { * * @param comp_stats Pointer to compression statistics to be updated after writing */ - void set_compression_statistics(std::shared_ptr comp_stats) - { - _compression_stats = std::move(comp_stats); - } + void set_compression_statistics(std::shared_ptr comp_stats); /** * @brief Sets preference for V2 page headers. Write V2 page headers if set to `true`. * * @param val Boolean value to enable/disable writing of V2 page headers. */ - void enable_write_v2_headers(bool val) { _v2_page_headers = val; } + void enable_write_v2_headers(bool val); /** * @brief Sets sorting columns. * * @param sorting_columns Column sort order metadata */ - void set_sorting_columns(std::vector sorting_columns) - { - _sorting_columns = std::move(sorting_columns); - } + void set_sorting_columns(std::vector sorting_columns); }; /** - * @brief Class to build `parquet_writer_options`. + * @brief Base class for Parquet options builders. */ -class parquet_writer_options_builder { - parquet_writer_options options; +template +class parquet_writer_options_builder_base { + OptionsT _options; - public: + protected: /** - * @brief Default constructor. + * @brief Return reference to the options object being built * - * This has been added since Cython requires a default constructor to create objects on stack. + * @return the options object */ - explicit parquet_writer_options_builder() = default; + inline OptionsT& get_options() { return _options; } /** - * @brief Constructor from sink and table. + * @brief Constructor from options. * - * @param sink The sink used for writer output - * @param table Table to be written to output + * @param options Options object to build */ - explicit parquet_writer_options_builder(sink_info const& sink, table_view const& table) - : options(sink, table) - { - } + explicit parquet_writer_options_builder_base(OptionsT options); + public: /** - * @brief Sets partitions in parquet_writer_options. + * @brief Default constructor. * - * @param partitions Partitions of input table in {start_row, num_rows} pairs. If specified, must - * be same size as number of sinks in sink_info - * @return this for chaining + * This has been added since Cython requires a default constructor to create objects on stack. */ - parquet_writer_options_builder& partitions(std::vector partitions); + explicit parquet_writer_options_builder_base() = default; /** - * @brief Sets metadata in parquet_writer_options. + * @brief Sets metadata. * * @param metadata Associated metadata * @return this for chaining */ - parquet_writer_options_builder& metadata(table_input_metadata metadata) - { - options._metadata = std::move(metadata); - return *this; - } + BuilderT& metadata(table_input_metadata metadata); /** - * @brief Sets Key-Value footer metadata in parquet_writer_options. + * @brief Sets Key-Value footer metadata. * * @param metadata Key-Value footer metadata * @return this for chaining */ - parquet_writer_options_builder& key_value_metadata( - std::vector> metadata); + BuilderT& key_value_metadata(std::vector> metadata); /** - * @brief Sets the level of statistics in parquet_writer_options. + * @brief Sets the level of statistics. * * @param sf Level of statistics requested in the output file * @return this for chaining */ - parquet_writer_options_builder& stats_level(statistics_freq sf) - { - options._stats_level = sf; - return *this; - } + BuilderT& stats_level(statistics_freq sf); /** - * @brief Sets compression type in parquet_writer_options. 
+ * @brief Sets compression type. * * @param compression The compression type to use * @return this for chaining */ - parquet_writer_options_builder& compression(compression_type compression) - { - options._compression = compression; - return *this; - } - - /** - * @brief Sets column chunks file path to be set in the raw output metadata. - * - * @param file_paths Vector of Strings which indicates file path. Must be same size as number of - * data sinks - * @return this for chaining - */ - parquet_writer_options_builder& column_chunks_file_paths(std::vector file_paths); + BuilderT& compression(compression_type compression); /** * @brief Sets the maximum row group size, in bytes. @@ -1068,11 +970,7 @@ class parquet_writer_options_builder { * @param val maximum row group size * @return this for chaining */ - parquet_writer_options_builder& row_group_size_bytes(size_t val) - { - options.set_row_group_size_bytes(val); - return *this; - } + BuilderT& row_group_size_bytes(size_t val); /** * @brief Sets the maximum number of rows in output row groups. @@ -1080,11 +978,7 @@ class parquet_writer_options_builder { * @param val maximum number or rows * @return this for chaining */ - parquet_writer_options_builder& row_group_size_rows(size_type val) - { - options.set_row_group_size_rows(val); - return *this; - } + BuilderT& row_group_size_rows(size_type val); /** * @brief Sets the maximum uncompressed page size, in bytes. @@ -1096,11 +990,7 @@ class parquet_writer_options_builder { * @param val maximum page size * @return this for chaining */ - parquet_writer_options_builder& max_page_size_bytes(size_t val) - { - options.set_max_page_size_bytes(val); - return *this; - } + BuilderT& max_page_size_bytes(size_t val); /** * @brief Sets the maximum page size, in rows. Counts only top-level rows, ignoring any nesting. @@ -1109,11 +999,7 @@ class parquet_writer_options_builder { * @param val maximum rows per page * @return this for chaining */ - parquet_writer_options_builder& max_page_size_rows(size_type val) - { - options.set_max_page_size_rows(val); - return *this; - } + BuilderT& max_page_size_rows(size_type val); /** * @brief Sets the desired maximum size in bytes for min and max values in the column index. @@ -1128,11 +1014,7 @@ class parquet_writer_options_builder { * @param val length min/max will be truncated to, with 0 indicating no truncation * @return this for chaining */ - parquet_writer_options_builder& column_index_truncate_length(int32_t val) - { - options.set_column_index_truncate_length(val); - return *this; - } + BuilderT& column_index_truncate_length(int32_t val); /** * @brief Sets the policy for dictionary use. @@ -1151,7 +1033,7 @@ class parquet_writer_options_builder { * @param val policy for dictionary use * @return this for chaining */ - parquet_writer_options_builder& dictionary_policy(enum dictionary_policy val); + BuilderT& dictionary_policy(enum dictionary_policy val); /** * @brief Sets the maximum dictionary size, in bytes. @@ -1164,7 +1046,7 @@ class parquet_writer_options_builder { * @param val maximum dictionary size * @return this for chaining */ - parquet_writer_options_builder& max_dictionary_size(size_t val); + BuilderT& max_dictionary_size(size_t val); /** * @brief Sets the maximum page fragment size, in rows. 
@@ -1176,7 +1058,7 @@ class parquet_writer_options_builder { * @param val maximum page fragment size * @return this for chaining */ - parquet_writer_options_builder& max_page_fragment_size(size_type val); + BuilderT& max_page_fragment_size(size_type val); /** * @brief Sets the pointer to the output compression statistics. @@ -1184,24 +1066,16 @@ class parquet_writer_options_builder { * @param comp_stats Pointer to compression statistics to be filled once writer is done * @return this for chaining */ - parquet_writer_options_builder& compression_statistics( - std::shared_ptr const& comp_stats) - { - options._compression_stats = comp_stats; - return *this; - } + BuilderT& compression_statistics( + std::shared_ptr const& comp_stats); /** - * @brief Sets whether int96 timestamps are written or not in parquet_writer_options. + * @brief Sets whether int96 timestamps are written or not. * * @param enabled Boolean value to enable/disable int96 timestamps * @return this for chaining */ - parquet_writer_options_builder& int96_timestamps(bool enabled) - { - options._write_timestamps_as_int96 = enabled; - return *this; - } + BuilderT& int96_timestamps(bool enabled); /** * @brief Set to true if timestamps are to be written as UTC. @@ -1209,126 +1083,60 @@ class parquet_writer_options_builder { * @param enabled Boolean value to enable/disable writing of timestamps as UTC. * @return this for chaining */ - parquet_writer_options_builder& utc_timestamps(bool enabled) - { - options._write_timestamps_as_UTC = enabled; - return *this; - } - + BuilderT& utc_timestamps(bool enabled); /** * @brief Set to true if V2 page headers are to be written. * * @param enabled Boolean value to enable/disable writing of V2 page headers. * @return this for chaining */ - parquet_writer_options_builder& write_v2_headers(bool enabled); + BuilderT& write_v2_headers(bool enabled); /** - * @brief Sets column sorting metadata to chunked_parquet_writer_options. + * @brief Sets column sorting metadata. * * @param sorting_columns Column sort order metadata * @return this for chaining */ - parquet_writer_options_builder& sorting_columns(std::vector sorting_columns); + BuilderT& sorting_columns(std::vector sorting_columns); /** - * @brief move parquet_writer_options member once it's built. + * @brief move options member once it's built. */ - operator parquet_writer_options&&() { return std::move(options); } + operator OptionsT&&(); /** - * @brief move parquet_writer_options member once it's built. + * @brief move options member once it's built. * * This has been added since Cython does not support overloading of conversion operators. * * @return Built `parquet_writer_options` object's r-value reference */ - parquet_writer_options&& build() { return std::move(options); } + OptionsT&& build(); }; -/** - * @brief Writes a set of columns to parquet format. - * - * The following code snippet demonstrates how to write columns to a file: - * @code - * auto destination = cudf::io::sink_info("dataset.parquet"); - * auto options = cudf::io::parquet_writer_options::builder(destination, table->view()); - * cudf::io::write_parquet(options); - * @endcode - * - * @param options Settings for controlling writing behavior - * @param stream CUDA stream used for device memory operations and kernel launches - * @return A blob that contains the file metadata (parquet FileMetadata thrift message) if - * requested in parquet_writer_options (empty blob otherwise). 
- */ - -std::unique_ptr> write_parquet( - parquet_writer_options const& options, rmm::cuda_stream_view stream = cudf::get_default_stream()); +class parquet_writer_options_builder; /** - * @brief Merges multiple raw metadata blobs that were previously created by write_parquet - * into a single metadata blob. - * - * @ingroup io_writers - * - * @param[in] metadata_list List of input file metadata - * @return A parquet-compatible blob that contains the data for all row groups in the list + * @brief Settings for `write_parquet()`. */ -std::unique_ptr> merge_row_group_metadata( - std::vector>> const& metadata_list); - -class chunked_parquet_writer_options_builder; +class parquet_writer_options : public parquet_writer_options_base { + // Sets of columns to output + table_view _table; + // Partitions described as {start_row, num_rows} pairs + std::vector _partitions; + // Column chunks file paths to be set in the raw output metadata. One per output file + std::vector _column_chunks_file_paths; -/** - * @brief Settings for `write_parquet_chunked()`. - */ -class chunked_parquet_writer_options { - // Specify the sink to use for writer output - sink_info _sink; - // Specify the compression format to use - compression_type _compression = compression_type::AUTO; - // Specify the level of statistics in the output file - statistics_freq _stats_level = statistics_freq::STATISTICS_ROWGROUP; - // Optional associated metadata. - std::optional _metadata; - // Optional footer key_value_metadata - std::vector> _user_data; - // Parquet writer can write INT96 or TIMESTAMP_MICROS. Defaults to TIMESTAMP_MICROS. - // If true then overrides any per-column setting in _metadata. - bool _write_timestamps_as_int96 = false; - // Parquet writer can write timestamps as UTC. Defaults to true. - bool _write_timestamps_as_UTC = true; - // Maximum size of each row group (unless smaller than a single page) - size_t _row_group_size_bytes = default_row_group_size_bytes; - // Maximum number of rows in row group (unless smaller than a single page) - size_type _row_group_size_rows = default_row_group_size_rows; - // Maximum size of each page (uncompressed) - size_t _max_page_size_bytes = default_max_page_size_bytes; - // Maximum number of rows in a page - size_type _max_page_size_rows = default_max_page_size_rows; - // Maximum size of min or max values in column index - int32_t _column_index_truncate_length = default_column_index_truncate_length; - // When to use dictionary encoding for data - dictionary_policy _dictionary_policy = dictionary_policy::ADAPTIVE; - // Maximum size of column chunk dictionary (in bytes) - size_t _max_dictionary_size = default_max_dictionary_size; - // Maximum number of rows in a page fragment - std::optional _max_page_fragment_size; - // Optional compression statistics - std::shared_ptr _compression_stats; - // write V2 page headers? - bool _v2_page_headers = false; - // Which columns in _table are used for sorting - std::optional> _sorting_columns; + friend parquet_writer_options_builder; /** - * @brief Constructor from sink. + * @brief Constructor from sink and table. 
* - * @param sink Sink used for writer output + * @param sink The sink used for writer output + * @param table Table to be written to output */ - explicit chunked_parquet_writer_options(sink_info const& sink) : _sink(sink) {} - - friend chunked_parquet_writer_options_builder; + explicit parquet_writer_options(sink_info const& sink, table_view const& table); public: /** @@ -1336,277 +1144,160 @@ class chunked_parquet_writer_options { * * This has been added since Cython requires a default constructor to create objects on stack. */ - chunked_parquet_writer_options() = default; + parquet_writer_options() = default; /** - * @brief Returns sink info. + * @brief Create builder to create `parquet_writer_options`. * - * @return Sink info + * @param sink The sink used for writer output + * @param table Table to be written to output + * + * @return Builder to build parquet_writer_options */ - [[nodiscard]] sink_info const& get_sink() const { return _sink; } + static parquet_writer_options_builder builder(sink_info const& sink, table_view const& table); /** - * @brief Returns compression format used. + * @brief Create builder to create `parquet_writer_options`. * - * @return Compression format + * @return parquet_writer_options_builder */ - [[nodiscard]] compression_type get_compression() const { return _compression; } + static parquet_writer_options_builder builder(); /** - * @brief Returns level of statistics requested in output file. + * @brief Returns table_view. * - * @return Level of statistics requested in output file + * @return Table view */ - [[nodiscard]] statistics_freq get_stats_level() const { return _stats_level; } + [[nodiscard]] table_view get_table() const { return _table; } /** - * @brief Returns metadata information. + * @brief Returns partitions. * - * @return Metadata information + * @return Partitions */ - [[nodiscard]] auto const& get_metadata() const { return _metadata; } + [[nodiscard]] std::vector const& get_partitions() const { return _partitions; } /** - * @brief Returns Key-Value footer metadata information. + * @brief Returns Column chunks file paths to be set in the raw output metadata. * - * @return Key-Value footer metadata information + * @return Column chunks file paths to be set in the raw output metadata */ - std::vector> const& get_key_value_metadata() const + [[nodiscard]] std::vector const& get_column_chunks_file_paths() const { - return _user_data; - } - - /** - * @brief Returns `true` if timestamps will be written as INT96 - * - * @return `true` if timestamps will be written as INT96 - */ - bool is_enabled_int96_timestamps() const { return _write_timestamps_as_int96; } - - /** - * @brief Returns `true` if timestamps will be written as UTC - * - * @return `true` if timestamps will be written as UTC - */ - [[nodiscard]] auto is_enabled_utc_timestamps() const { return _write_timestamps_as_UTC; } - - /** - * @brief Returns maximum row group size, in bytes. - * - * @return Maximum row group size, in bytes - */ - auto get_row_group_size_bytes() const { return _row_group_size_bytes; } - - /** - * @brief Returns maximum row group size, in rows. - * - * @return Maximum row group size, in rows - */ - auto get_row_group_size_rows() const { return _row_group_size_rows; } - - /** - * @brief Returns maximum uncompressed page size, in bytes. - * - * If set larger than the row group size, then this will return the - * row group size. 
- * - * @return Maximum uncompressed page size, in bytes - */ - auto get_max_page_size_bytes() const - { - return std::min(_max_page_size_bytes, get_row_group_size_bytes()); - } - - /** - * @brief Returns maximum page size, in rows. - * - * If set larger than the row group size, then this will return the row group size. - * - * @return Maximum page size, in rows - */ - auto get_max_page_size_rows() const - { - return std::min(_max_page_size_rows, get_row_group_size_rows()); - } - - /** - * @brief Returns maximum length of min or max values in column index, in bytes. - * - * @return length min/max will be truncated to - */ - auto get_column_index_truncate_length() const { return _column_index_truncate_length; } - - /** - * @brief Returns policy for dictionary use. - * - * @return policy for dictionary use - */ - [[nodiscard]] dictionary_policy get_dictionary_policy() const { return _dictionary_policy; } - - /** - * @brief Returns maximum dictionary size, in bytes. - * - * @return Maximum dictionary size, in bytes. - */ - [[nodiscard]] auto get_max_dictionary_size() const { return _max_dictionary_size; } - - /** - * @brief Returns maximum page fragment size, in rows. - * - * @return Maximum page fragment size, in rows. - */ - [[nodiscard]] auto get_max_page_fragment_size() const { return _max_page_fragment_size; } - - /** - * @brief Returns a shared pointer to the user-provided compression statistics. - * - * @return Compression statistics - */ - [[nodiscard]] std::shared_ptr get_compression_statistics() const - { - return _compression_stats; + return _column_chunks_file_paths; } /** - * @brief Returns `true` if V2 page headers should be written. - * - * @return `true` if V2 page headers should be written. - */ - [[nodiscard]] auto is_enabled_write_v2_headers() const { return _v2_page_headers; } - - /** - * @brief Returns the sorting_columns. - * - * @return Column sort order metadata - */ - [[nodiscard]] auto const& get_sorting_columns() const { return _sorting_columns; } - - /** - * @brief Sets metadata. - * - * @param metadata Associated metadata - */ - void set_metadata(table_input_metadata metadata) { _metadata = std::move(metadata); } - - /** - * @brief Sets Key-Value footer metadata. - * - * @param metadata Key-Value footer metadata - */ - void set_key_value_metadata(std::vector> metadata); - - /** - * @brief Sets the level of statistics in parquet_writer_options. - * - * @param sf Level of statistics requested in the output file - */ - void set_stats_level(statistics_freq sf) { _stats_level = sf; } - - /** - * @brief Sets compression type. - * - * @param compression The compression type to use - */ - void set_compression(compression_type compression) { _compression = compression; } - - /** - * @brief Sets timestamp writing preferences. - * - * INT96 timestamps will be written if `true` and TIMESTAMP_MICROS will be written if `false`. + * @brief Sets partitions. * - * @param req Boolean value to enable/disable writing of INT96 timestamps + * @param partitions Partitions of input table in {start_row, num_rows} pairs. If specified, must + * be same size as number of sinks in sink_info */ - void enable_int96_timestamps(bool req) { _write_timestamps_as_int96 = req; } + void set_partitions(std::vector partitions); /** - * @brief Sets preference for writing timestamps as UTC. Write timestamps as UTC if set to `true`. + * @brief Sets column chunks file path to be set in the raw output metadata. * - * @param val Boolean value to enable/disable writing of timestamps as UTC. 
+ * @param file_paths Vector of Strings which indicates file path. Must be same size as number of + * data sinks in sink info */ - void enable_utc_timestamps(bool val) { _write_timestamps_as_UTC = val; } + void set_column_chunks_file_paths(std::vector file_paths); +}; +/** + * @brief Class to build `parquet_writer_options`. + */ +class parquet_writer_options_builder + : public parquet_writer_options_builder_base { + public: /** - * @brief Sets the maximum row group size, in bytes. + * @brief Default constructor. * - * @param size_bytes Maximum row group size, in bytes to set + * This has been added since Cython requires a default constructor to create objects on stack. */ - void set_row_group_size_bytes(size_t size_bytes); + explicit parquet_writer_options_builder() = default; /** - * @brief Sets the maximum row group size, in rows. + * @brief Constructor from sink and table. * - * @param size_rows The maximum row group size, in rows to set + * @param sink The sink used for writer output + * @param table Table to be written to output */ - void set_row_group_size_rows(size_type size_rows); + explicit parquet_writer_options_builder(sink_info const& sink, table_view const& table); /** - * @brief Sets the maximum uncompressed page size, in bytes. + * @brief Sets partitions in parquet_writer_options. * - * @param size_bytes Maximum uncompressed page size, in bytes to set + * @param partitions Partitions of input table in {start_row, num_rows} pairs. If specified, must + * be same size as number of sinks in sink_info + * @return this for chaining */ - void set_max_page_size_bytes(size_t size_bytes); + parquet_writer_options_builder& partitions(std::vector partitions); /** - * @brief Sets the maximum page size, in rows. + * @brief Sets column chunks file path to be set in the raw output metadata. * - * @param size_rows The maximum page size, in rows to set + * @param file_paths Vector of Strings which indicates file path. Must be same size as number of + * data sinks + * @return this for chaining */ - void set_max_page_size_rows(size_type size_rows); + parquet_writer_options_builder& column_chunks_file_paths(std::vector file_paths); +}; - /** - * @brief Sets the maximum length of min or max values in column index, in bytes. - * - * @param size_bytes length min/max will be truncated to - */ - void set_column_index_truncate_length(int32_t size_bytes); +/** + * @brief Writes a set of columns to parquet format. + * + * The following code snippet demonstrates how to write columns to a file: + * @code + * auto destination = cudf::io::sink_info("dataset.parquet"); + * auto options = cudf::io::parquet_writer_options::builder(destination, table->view()); + * cudf::io::write_parquet(options); + * @endcode + * + * @param options Settings for controlling writing behavior + * @param stream CUDA stream used for device memory operations and kernel launches + * @return A blob that contains the file metadata (parquet FileMetadata thrift message) if + * requested in parquet_writer_options (empty blob otherwise). + */ - /** - * @brief Sets the policy for dictionary use. - * - * @param policy Policy for dictionary use - */ - void set_dictionary_policy(dictionary_policy policy); +std::unique_ptr> write_parquet( + parquet_writer_options const& options, rmm::cuda_stream_view stream = cudf::get_default_stream()); - /** - * @brief Sets the maximum dictionary size, in bytes. 
- * - * @param size_bytes Maximum dictionary size, in bytes - */ - void set_max_dictionary_size(size_t size_bytes); +/** + * @brief Merges multiple raw metadata blobs that were previously created by write_parquet + * into a single metadata blob. + * + * @ingroup io_writers + * + * @param[in] metadata_list List of input file metadata + * @return A parquet-compatible blob that contains the data for all row groups in the list + */ +std::unique_ptr> merge_row_group_metadata( + std::vector>> const& metadata_list); - /** - * @brief Sets the maximum page fragment size, in rows. - * - * @param size_rows Maximum page fragment size, in rows. - */ - void set_max_page_fragment_size(size_type size_rows); +class chunked_parquet_writer_options_builder; +/** + * @brief Settings for `parquet_chunked_writer`. + */ +class chunked_parquet_writer_options : public parquet_writer_options_base { /** - * @brief Sets the pointer to the output compression statistics. + * @brief Constructor from sink. * - * @param comp_stats Pointer to compression statistics to be updated after writing + * @param sink Sink used for writer output */ - void set_compression_statistics(std::shared_ptr comp_stats) - { - _compression_stats = std::move(comp_stats); - } + explicit chunked_parquet_writer_options(sink_info const& sink); - /** - * @brief Sets preference for V2 page headers. Write V2 page headers if set to `true`. - * - * @param val Boolean value to enable/disable writing of V2 page headers. - */ - void enable_write_v2_headers(bool val) { _v2_page_headers = val; } + friend chunked_parquet_writer_options_builder; + public: /** - * @brief Sets sorting columns. + * @brief Default constructor. * - * @param sorting_columns Column sort order metadata + * This has been added since Cython requires a default constructor to create objects on stack. */ - void set_sorting_columns(std::vector sorting_columns) - { - _sorting_columns = std::move(sorting_columns); - } + chunked_parquet_writer_options() = default; /** * @brief creates builder to build chunked_parquet_writer_options. @@ -1619,11 +1310,11 @@ class chunked_parquet_writer_options { }; /** - * @brief Builds options for chunked_parquet_writer_options. + * @brief Class to build `chunked_parquet_writer_options`. */ -class chunked_parquet_writer_options_builder { - chunked_parquet_writer_options options; - +class chunked_parquet_writer_options_builder + : public parquet_writer_options_builder_base { public: /** * @brief Default constructor. @@ -1637,238 +1328,7 @@ class chunked_parquet_writer_options_builder { * * @param sink The sink used for writer output */ - chunked_parquet_writer_options_builder(sink_info const& sink) : options(sink){}; - - /** - * @brief Sets metadata to chunked_parquet_writer_options. - * - * @param metadata Associated metadata - * @return this for chaining - */ - chunked_parquet_writer_options_builder& metadata(table_input_metadata metadata) - { - options._metadata = std::move(metadata); - return *this; - } - - /** - * @brief Sets Key-Value footer metadata in parquet_writer_options. - * - * @param metadata Key-Value footer metadata - * @return this for chaining - */ - chunked_parquet_writer_options_builder& key_value_metadata( - std::vector> metadata); - - /** - * @brief Sets the level of statistics in chunked_parquet_writer_options. 
- * - * @param sf Level of statistics requested in the output file - * @return this for chaining - */ - chunked_parquet_writer_options_builder& stats_level(statistics_freq sf) - { - options._stats_level = sf; - return *this; - } - - /** - * @brief Sets compression type to chunked_parquet_writer_options. - * - * @param compression The compression type to use - * @return this for chaining - */ - chunked_parquet_writer_options_builder& compression(compression_type compression) - { - options._compression = compression; - return *this; - } - - /** - * @brief Set to true if timestamps should be written as - * int96 types instead of int64 types. Even though int96 is deprecated and is - * not an internal type for cudf, it needs to be written for backwards - * compatibility reasons. - * - * @param enabled Boolean value to enable/disable int96 timestamps - * @return this for chaining - */ - chunked_parquet_writer_options_builder& int96_timestamps(bool enabled) - { - options._write_timestamps_as_int96 = enabled; - return *this; - } - - /** - * @brief Set to true if timestamps are to be written as UTC. - * - * @param enabled Boolean value to enable/disable writing of timestamps as UTC. - * @return this for chaining - */ - chunked_parquet_writer_options_builder& utc_timestamps(bool enabled) - { - options._write_timestamps_as_UTC = enabled; - return *this; - } - - /** - * @brief Set to true if V2 page headers are to be written. - * - * @param enabled Boolean value to enable/disable writing of V2 page headers. - * @return this for chaining - */ - chunked_parquet_writer_options_builder& write_v2_headers(bool enabled); - - /** - * @brief Sets the maximum row group size, in bytes. - * - * @param val maximum row group size - * @return this for chaining - */ - chunked_parquet_writer_options_builder& row_group_size_bytes(size_t val) - { - options.set_row_group_size_bytes(val); - return *this; - } - - /** - * @brief Sets the maximum number of rows in output row groups. - * - * @param val maximum number or rows - * @return this for chaining - */ - chunked_parquet_writer_options_builder& row_group_size_rows(size_type val) - { - options.set_row_group_size_rows(val); - return *this; - } - - /** - * @brief Sets the maximum uncompressed page size, in bytes. - * - * Serves as a hint to the writer, and can be exceeded under certain circumstances. Cannot be - * larger than the row group size in bytes, and will be adjusted to match if it is. - * - * @param val maximum page size - * @return this for chaining - */ - chunked_parquet_writer_options_builder& max_page_size_bytes(size_t val) - { - options.set_max_page_size_bytes(val); - return *this; - } - - /** - * @brief Sets the maximum page size, in rows. Counts only top-level rows, ignoring any nesting. - * Cannot be larger than the row group size in rows, and will be adjusted to match if it is. - * - * @param val maximum rows per page - * @return this for chaining - */ - chunked_parquet_writer_options_builder& max_page_size_rows(size_type val) - { - options.set_max_page_size_rows(val); - return *this; - } - - /** - * @brief Sets the desired maximum size in bytes for min and max values in the column index. - * - * Values exceeding this limit will be truncated, but modified such that they will still - * be valid lower and upper bounds. This only applies to variable length types, such as string. - * Maximum values will not be truncated if there is no suitable truncation that results in - * a valid upper bound. - * - * Default value is 64. 
- * - * @param val length min/max will be truncated to, with 0 indicating no truncation - * @return this for chaining - */ - chunked_parquet_writer_options_builder& column_index_truncate_length(int32_t val) - { - options.set_column_index_truncate_length(val); - return *this; - } - - /** - * @brief Sets the policy for dictionary use. - * - * Certain compression algorithms (e.g Zstandard) have limits on how large of a buffer can - * be compressed. In some circumstances, the dictionary can grow beyond this limit, which - * will prevent the column from being compressed. This setting controls how the writer - * should act in these circumstances. A setting of dictionary_policy::ADAPTIVE will disable - * dictionary encoding for columns where the dictionary exceeds the limit. A setting of - * dictionary_policy::NEVER will disable the use of dictionary encoding globally. A setting of - * dictionary_policy::ALWAYS will allow the use of dictionary encoding even if it will result in - * the disabling of compression for columns that would otherwise be compressed. - * - * The default value is dictionary_policy::ADAPTIVE. - * - * @param val policy for dictionary use - * @return this for chaining - */ - chunked_parquet_writer_options_builder& dictionary_policy(enum dictionary_policy val); - - /** - * @brief Sets the maximum dictionary size, in bytes. - * - * Disables dictionary encoding for any column chunk where the dictionary will - * exceed this limit. Only used when the dictionary_policy is set to 'ADAPTIVE'. - * - * Default value is 1048576 (1MiB). - * - * @param val maximum dictionary size - * @return this for chaining - */ - chunked_parquet_writer_options_builder& max_dictionary_size(size_t val); - - /** - * @brief Sets the maximum page fragment size, in rows. - * - * Files with nested schemas or very long strings may need a page fragment size - * smaller than the default value of 5000 to ensure a single fragment will not - * exceed the desired maximum page size in bytes. - * - * @param val maximum page fragment size - * @return this for chaining - */ - chunked_parquet_writer_options_builder& max_page_fragment_size(size_type val); - - /** - * @brief Sets the pointer to the output compression statistics. - * - * @param comp_stats Pointer to compression statistics to be filled once writer is done - * @return this for chaining - */ - chunked_parquet_writer_options_builder& compression_statistics( - std::shared_ptr const& comp_stats) - { - options._compression_stats = comp_stats; - return *this; - } - - /** - * @brief Sets column sorting metadata to chunked_parquet_writer_options. - * - * @param sorting_columns Column sort order metadata - * @return this for chaining - */ - chunked_parquet_writer_options_builder& sorting_columns( - std::vector sorting_columns); - - /** - * @brief move chunked_parquet_writer_options member once it's built. - */ - operator chunked_parquet_writer_options&&() { return std::move(options); } - - /** - * @brief move chunked_parquet_writer_options member once it's is built. - * - * This has been added since Cython does not support overloading of conversion operators. 
- * - * @return Built `chunked_parquet_writer_options` object's r-value reference - */ - chunked_parquet_writer_options&& build() { return std::move(options); } + chunked_parquet_writer_options_builder(sink_info const& sink); }; /** diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index 3ba2facf276..1ed8ee5ce06 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -115,7 +115,7 @@ parquet_writer_options_builder parquet_writer_options::builder() chunked_parquet_writer_options_builder chunked_parquet_writer_options::builder( sink_info const& sink) { - return chunked_parquet_writer_options_builder(sink); + return chunked_parquet_writer_options_builder{sink}; } namespace { @@ -740,29 +740,37 @@ void parquet_reader_options::set_num_rows(size_type val) _num_rows = val; } -void parquet_writer_options::set_partitions(std::vector partitions) +void parquet_writer_options_base::set_metadata(table_input_metadata metadata) { - CUDF_EXPECTS(partitions.size() == _sink.num_sinks(), - "Mismatch between number of sinks and number of partitions"); - _partitions = std::move(partitions); + _metadata = std::move(metadata); } -void parquet_writer_options::set_key_value_metadata( +void parquet_writer_options_base::set_key_value_metadata( std::vector> metadata) { - CUDF_EXPECTS(metadata.size() == _sink.num_sinks(), + CUDF_EXPECTS(metadata.size() == get_sink().num_sinks(), "Mismatch between number of sinks and number of metadata maps"); _user_data = std::move(metadata); } -void parquet_writer_options::set_column_chunks_file_paths(std::vector file_paths) +void parquet_writer_options_base::set_stats_level(statistics_freq sf) { _stats_level = sf; } + +void parquet_writer_options_base::set_compression(compression_type compression) { - CUDF_EXPECTS(file_paths.size() == _sink.num_sinks(), - "Mismatch between number of sinks and number of chunk paths to set"); - _column_chunks_file_paths = std::move(file_paths); + _compression = compression; +} + +void parquet_writer_options_base::enable_int96_timestamps(bool req) +{ + _write_timestamps_as_int96 = req; +} + +void parquet_writer_options_base::enable_utc_timestamps(bool val) +{ + _write_timestamps_as_UTC = val; } -void parquet_writer_options::set_row_group_size_bytes(size_t size_bytes) +void parquet_writer_options_base::set_row_group_size_bytes(size_t size_bytes) { CUDF_EXPECTS( size_bytes >= 1024, @@ -770,13 +778,13 @@ void parquet_writer_options::set_row_group_size_bytes(size_t size_bytes) _row_group_size_bytes = size_bytes; } -void parquet_writer_options::set_row_group_size_rows(size_type size_rows) +void parquet_writer_options_base::set_row_group_size_rows(size_type size_rows) { CUDF_EXPECTS(size_rows > 0, "The maximum row group row count must be a positive integer."); _row_group_size_rows = size_rows; } -void parquet_writer_options::set_max_page_size_bytes(size_t size_bytes) +void parquet_writer_options_base::set_max_page_size_bytes(size_t size_bytes) { CUDF_EXPECTS(size_bytes >= 1024, "The maximum page size cannot be smaller than 1KB."); CUDF_EXPECTS(size_bytes <= static_cast(std::numeric_limits::max()), @@ -784,190 +792,249 @@ void parquet_writer_options::set_max_page_size_bytes(size_t size_bytes) _max_page_size_bytes = size_bytes; } -void parquet_writer_options::set_max_page_size_rows(size_type size_rows) +void parquet_writer_options_base::set_max_page_size_rows(size_type size_rows) { CUDF_EXPECTS(size_rows > 0, "The maximum page row count must be a positive integer."); _max_page_size_rows = size_rows; } -void 
parquet_writer_options::set_column_index_truncate_length(int32_t size_bytes) +void parquet_writer_options_base::set_column_index_truncate_length(int32_t size_bytes) { CUDF_EXPECTS(size_bytes >= 0, "Column index truncate length cannot be negative."); _column_index_truncate_length = size_bytes; } -void parquet_writer_options::set_dictionary_policy(dictionary_policy policy) +void parquet_writer_options_base::set_dictionary_policy(dictionary_policy policy) { _dictionary_policy = policy; } -void parquet_writer_options::set_max_dictionary_size(size_t size_bytes) +void parquet_writer_options_base::set_max_dictionary_size(size_t size_bytes) { CUDF_EXPECTS(size_bytes <= static_cast(std::numeric_limits::max()), "The maximum dictionary size cannot exceed 2GB."); _max_dictionary_size = size_bytes; } -void parquet_writer_options::set_max_page_fragment_size(size_type size_rows) +void parquet_writer_options_base::set_max_page_fragment_size(size_type size_rows) { CUDF_EXPECTS(size_rows > 0, "Page fragment size must be a positive integer."); _max_page_fragment_size = size_rows; } -parquet_writer_options_builder& parquet_writer_options_builder::partitions( - std::vector partitions) +void parquet_writer_options_base::set_compression_statistics( + std::shared_ptr comp_stats) { - options.set_partitions(std::move(partitions)); - return *this; + _compression_stats = std::move(comp_stats); +} + +void parquet_writer_options_base::enable_write_v2_headers(bool val) { _v2_page_headers = val; } + +void parquet_writer_options_base::set_sorting_columns(std::vector sorting_columns) +{ + _sorting_columns = std::move(sorting_columns); +} + +parquet_writer_options::parquet_writer_options(sink_info const& sink, table_view const& table) + : parquet_writer_options_base(sink), _table(table) +{ +} + +void parquet_writer_options::set_partitions(std::vector partitions) +{ + CUDF_EXPECTS(partitions.size() == get_sink().num_sinks(), + "Mismatch between number of sinks and number of partitions"); + _partitions = std::move(partitions); +} + +void parquet_writer_options::set_column_chunks_file_paths(std::vector file_paths) +{ + CUDF_EXPECTS(file_paths.size() == get_sink().num_sinks(), + "Mismatch between number of sinks and number of chunk paths to set"); + _column_chunks_file_paths = std::move(file_paths); +} + +template +parquet_writer_options_builder_base::parquet_writer_options_builder_base( + OptionsT options) + : _options(std::move(options)) +{ +} + +template +BuilderT& parquet_writer_options_builder_base::metadata( + table_input_metadata metadata) +{ + _options.set_metadata(std::move(metadata)); + return static_cast(*this); } -parquet_writer_options_builder& parquet_writer_options_builder::key_value_metadata( +template +BuilderT& parquet_writer_options_builder_base::key_value_metadata( std::vector> metadata) { - options.set_key_value_metadata(std::move(metadata)); - return *this; + _options.set_key_value_metadata(std::move(metadata)); + return static_cast(*this); } -parquet_writer_options_builder& parquet_writer_options_builder::column_chunks_file_paths( - std::vector file_paths) +template +BuilderT& parquet_writer_options_builder_base::stats_level(statistics_freq sf) { - options.set_column_chunks_file_paths(std::move(file_paths)); - return *this; + _options.set_stats_level(sf); + return static_cast(*this); } -parquet_writer_options_builder& parquet_writer_options_builder::dictionary_policy( - enum dictionary_policy val) +template +BuilderT& parquet_writer_options_builder_base::compression( + compression_type compression) { - 
options.set_dictionary_policy(val); - return *this; + _options.set_compression(compression); + return static_cast(*this); } -parquet_writer_options_builder& parquet_writer_options_builder::max_dictionary_size(size_t val) +template +BuilderT& parquet_writer_options_builder_base::row_group_size_bytes(size_t val) { - options.set_max_dictionary_size(val); - return *this; + _options.set_row_group_size_bytes(val); + return static_cast(*this); } -parquet_writer_options_builder& parquet_writer_options_builder::max_page_fragment_size( +template +BuilderT& parquet_writer_options_builder_base::row_group_size_rows( size_type val) { - options.set_max_page_fragment_size(val); - return *this; + _options.set_row_group_size_rows(val); + return static_cast(*this); } -parquet_writer_options_builder& parquet_writer_options_builder::write_v2_headers(bool enabled) +template +BuilderT& parquet_writer_options_builder_base::max_page_size_bytes(size_t val) { - options.enable_write_v2_headers(enabled); - return *this; + _options.set_max_page_size_bytes(val); + return static_cast(*this); } -parquet_writer_options_builder& parquet_writer_options_builder::sorting_columns( - std::vector sorting_columns) +template +BuilderT& parquet_writer_options_builder_base::max_page_size_rows(size_type val) { - options._sorting_columns = std::move(sorting_columns); - return *this; + _options.set_max_page_size_rows(val); + return static_cast(*this); } -void chunked_parquet_writer_options::set_key_value_metadata( - std::vector> metadata) +template +BuilderT& parquet_writer_options_builder_base::column_index_truncate_length( + int32_t val) { - CUDF_EXPECTS(metadata.size() == _sink.num_sinks(), - "Mismatch between number of sinks and number of metadata maps"); - _user_data = std::move(metadata); + _options.set_column_index_truncate_length(val); + return static_cast(*this); } -void chunked_parquet_writer_options::set_row_group_size_bytes(size_t size_bytes) +template +BuilderT& parquet_writer_options_builder_base::dictionary_policy( + enum dictionary_policy val) { - CUDF_EXPECTS( - size_bytes >= 1024, - "The maximum row group size cannot be smaller than the minimum page size, which is 1KB."); - _row_group_size_bytes = size_bytes; + _options.set_dictionary_policy(val); + return static_cast(*this); } -void chunked_parquet_writer_options::set_row_group_size_rows(size_type size_rows) +template +BuilderT& parquet_writer_options_builder_base::max_dictionary_size(size_t val) { - CUDF_EXPECTS(size_rows > 0, "The maximum row group row count must be a positive integer."); - _row_group_size_rows = size_rows; + _options.set_max_dictionary_size(val); + return static_cast(*this); } -void chunked_parquet_writer_options::set_max_page_size_bytes(size_t size_bytes) +template +BuilderT& parquet_writer_options_builder_base::max_page_fragment_size( + size_type val) { - CUDF_EXPECTS(size_bytes >= 1024, "The maximum page size cannot be smaller than 1KB."); - CUDF_EXPECTS(size_bytes <= static_cast(std::numeric_limits::max()), - "The maximum page size cannot exceed 2GB."); - _max_page_size_bytes = size_bytes; + _options.set_max_page_fragment_size(val); + return static_cast(*this); } -void chunked_parquet_writer_options::set_max_page_size_rows(size_type size_rows) +template +BuilderT& parquet_writer_options_builder_base::compression_statistics( + std::shared_ptr const& comp_stats) { - CUDF_EXPECTS(size_rows > 0, "The maximum page row count must be a positive integer."); - _max_page_size_rows = size_rows; + _options.set_compression_statistics(comp_stats); + return 
static_cast(*this); } -void chunked_parquet_writer_options::set_column_index_truncate_length(int32_t size_bytes) +template +BuilderT& parquet_writer_options_builder_base::int96_timestamps(bool enabled) { - CUDF_EXPECTS(size_bytes >= 0, "Column index truncate length cannot be negative."); - _column_index_truncate_length = size_bytes; + _options.enable_int96_timestamps(enabled); + return static_cast(*this); } -void chunked_parquet_writer_options::set_dictionary_policy(dictionary_policy policy) +template +BuilderT& parquet_writer_options_builder_base::utc_timestamps(bool enabled) { - _dictionary_policy = policy; + _options.enable_utc_timestamps(enabled); + return static_cast(*this); } -void chunked_parquet_writer_options::set_max_dictionary_size(size_t size_bytes) +template +BuilderT& parquet_writer_options_builder_base::write_v2_headers(bool enabled) { - CUDF_EXPECTS(size_bytes <= static_cast(std::numeric_limits::max()), - "The maximum dictionary size cannot exceed 2GB."); - _max_dictionary_size = size_bytes; + _options.enable_write_v2_headers(enabled); + return static_cast(*this); } -void chunked_parquet_writer_options::set_max_page_fragment_size(size_type size_rows) +template +BuilderT& parquet_writer_options_builder_base::sorting_columns( + std::vector sorting_columns) { - CUDF_EXPECTS(size_rows > 0, "Page fragment size must be a positive integer."); - _max_page_fragment_size = size_rows; + _options.set_sorting_columns(std::move(sorting_columns)); + return static_cast(*this); } -chunked_parquet_writer_options_builder& chunked_parquet_writer_options_builder::key_value_metadata( - std::vector> metadata) +template +parquet_writer_options_builder_base::operator OptionsT&&() { - options.set_key_value_metadata(std::move(metadata)); - return *this; + return std::move(_options); } -chunked_parquet_writer_options_builder& chunked_parquet_writer_options_builder::dictionary_policy( - enum dictionary_policy val) +template +OptionsT&& parquet_writer_options_builder_base::build() { - options.set_dictionary_policy(val); - return *this; + return std::move(_options); } -chunked_parquet_writer_options_builder& chunked_parquet_writer_options_builder::max_dictionary_size( - size_t val) +template class parquet_writer_options_builder_base; +template class parquet_writer_options_builder_base; + +parquet_writer_options_builder::parquet_writer_options_builder(sink_info const& sink, + table_view const& table) + : parquet_writer_options_builder_base(parquet_writer_options{sink, table}) { - options.set_max_dictionary_size(val); - return *this; } -chunked_parquet_writer_options_builder& chunked_parquet_writer_options_builder::write_v2_headers( - bool enabled) +parquet_writer_options_builder& parquet_writer_options_builder::partitions( + std::vector partitions) { - options.enable_write_v2_headers(enabled); + get_options().set_partitions(std::move(partitions)); return *this; } -chunked_parquet_writer_options_builder& chunked_parquet_writer_options_builder::sorting_columns( - std::vector sorting_columns) +parquet_writer_options_builder& parquet_writer_options_builder::column_chunks_file_paths( + std::vector file_paths) { - options._sorting_columns = std::move(sorting_columns); + get_options().set_column_chunks_file_paths(std::move(file_paths)); return *this; } -chunked_parquet_writer_options_builder& -chunked_parquet_writer_options_builder::max_page_fragment_size(size_type val) +chunked_parquet_writer_options::chunked_parquet_writer_options(sink_info const& sink) + : parquet_writer_options_base(sink) +{ +} + 
+chunked_parquet_writer_options_builder::chunked_parquet_writer_options_builder( + sink_info const& sink) + : parquet_writer_options_builder_base(chunked_parquet_writer_options{sink}) { - options.set_max_page_fragment_size(val); - return *this; } } // namespace cudf::io diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/parquet.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/parquet.pxd index fb98650308a..36654457995 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/parquet.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/parquet.pxd @@ -66,24 +66,19 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: cdef cudf_io_types.table_with_metadata read_parquet( parquet_reader_options args) except + - cdef cppclass parquet_writer_options: - parquet_writer_options() except + + cdef cppclass parquet_writer_options_base: + parquet_writer_options_base() except + cudf_io_types.sink_info get_sink_info() except + cudf_io_types.compression_type get_compression() except + cudf_io_types.statistics_freq get_stats_level() except + - cudf_table_view.table_view get_table() except + const optional[cudf_io_types.table_input_metadata]& get_metadata( ) except + - string get_column_chunks_file_paths() except + size_t get_row_group_size_bytes() except + size_type get_row_group_size_rows() except + size_t get_max_page_size_bytes() except + size_type get_max_page_size_rows() except + size_t get_max_dictionary_size() except + - void set_partitions( - vector[cudf_io_types.partition_info] partitions - ) except + void set_metadata( cudf_io_types.table_input_metadata m ) except + @@ -96,9 +91,6 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: void set_compression( cudf_io_types.compression_type compression ) except + - void set_column_chunks_file_paths( - vector[string] column_chunks_file_paths - ) except + void set_int96_timestamps( bool enabled ) except + @@ -113,161 +105,112 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: void enable_write_v2_headers(bool val) except + void set_dictionary_policy(cudf_io_types.dictionary_policy policy) except + + cdef cppclass parquet_writer_options(parquet_writer_options_base): + parquet_writer_options() except + + cudf_table_view.table_view get_table() except + + string get_column_chunks_file_paths() except + + void set_partitions( + vector[cudf_io_types.partition_info] partitions + ) except + + void set_column_chunks_file_paths( + vector[string] column_chunks_file_paths + ) except + + @staticmethod parquet_writer_options_builder builder( cudf_io_types.sink_info sink_, cudf_table_view.table_view table_ ) except + - cdef cppclass parquet_writer_options_builder: - + cdef cppclass parquet_writer_options_builder_base[BuilderT, OptionsT]: parquet_writer_options_builder() except + - parquet_writer_options_builder( - cudf_io_types.sink_info sink_, - cudf_table_view.table_view table_ - ) except + - parquet_writer_options_builder& partitions( - vector[cudf_io_types.partition_info] partitions - ) except + - parquet_writer_options_builder& metadata( + + BuilderT& metadata( cudf_io_types.table_input_metadata m ) except + - parquet_writer_options_builder& key_value_metadata( + BuilderT& key_value_metadata( vector[map[string, string]] kvm ) except + - parquet_writer_options_builder& stats_level( + BuilderT& stats_level( cudf_io_types.statistics_freq sf ) except + - parquet_writer_options_builder& compression( + BuilderT& compression( cudf_io_types.compression_type compression ) except + - parquet_writer_options_builder& 
column_chunks_file_paths(
-            vector[string] column_chunks_file_paths
-        ) except +
-        parquet_writer_options_builder& int96_timestamps(
+        BuilderT& int96_timestamps(
             bool enabled
         ) except +
-        parquet_writer_options_builder& utc_timestamps(
+        BuilderT& utc_timestamps(
             bool enabled
         ) except +
-        parquet_writer_options_builder& row_group_size_bytes(
+        BuilderT& row_group_size_bytes(
             size_t val
         ) except +
-        parquet_writer_options_builder& row_group_size_rows(
+        BuilderT& row_group_size_rows(
             size_type val
         ) except +
-        parquet_writer_options_builder& max_page_size_bytes(
+        BuilderT& max_page_size_bytes(
             size_t val
         ) except +
-        parquet_writer_options_builder& max_page_size_rows(
+        BuilderT& max_page_size_rows(
             size_type val
         ) except +
-        parquet_writer_options_builder& max_dictionary_size(
+        BuilderT& max_dictionary_size(
             size_t val
         ) except +
-        parquet_writer_options_builder& write_v2_headers(
+        BuilderT& write_v2_headers(
             bool val
         ) except +
-        parquet_writer_options_builder& dictionary_policy(
+        BuilderT& dictionary_policy(
             cudf_io_types.dictionary_policy val
         ) except +
+        # FIXME: the following two functions actually belong in
+        # parquet_writer_options_builder, but placing them there yields a
+        # "'parquet_writer_options_builder' is not a type identifier" error.
+        # This is probably a bug in cython since a simpler CRTP example that
+        # has methods returning references to a child class seems to work.
+        # Calling these from the chunked options builder will fail at compile
+        # time, so this should be safe.
+        # NOTE: these two are never actually called from libcudf. Instead these
+        # properties are set in the options after calling build(), so perhaps
+        # they can be removed.
+        BuilderT& partitions(
+            vector[cudf_io_types.partition_info] partitions
+        ) except +
+        BuilderT& column_chunks_file_paths(
+            vector[string] column_chunks_file_paths
+        ) except +
+        OptionsT build() except +

-        parquet_writer_options build() except +
+    cdef cppclass parquet_writer_options_builder(
+            parquet_writer_options_builder_base[parquet_writer_options_builder,
+                                                parquet_writer_options]):
+        parquet_writer_options_builder() except +
+        parquet_writer_options_builder(
+            cudf_io_types.sink_info sink_,
+            cudf_table_view.table_view table_
+        ) except +

     cdef unique_ptr[vector[uint8_t]] write_parquet(
         parquet_writer_options args
     ) except +

-    cdef cppclass chunked_parquet_writer_options:
+    cdef cppclass chunked_parquet_writer_options(parquet_writer_options_base):
         chunked_parquet_writer_options() except +
-        cudf_io_types.sink_info get_sink() except +
-        cudf_io_types.compression_type get_compression() except +
-        cudf_io_types.statistics_freq get_stats_level() except +
-        const optional[cudf_io_types.table_input_metadata]& get_metadata(
-        ) except +
-        size_t get_row_group_size_bytes() except +
-        size_type get_row_group_size_rows() except +
-        size_t get_max_page_size_bytes() except +
-        size_type get_max_page_size_rows() except +
-        size_t get_max_dictionary_size() except +
-
-        void set_metadata(
-            cudf_io_types.table_input_metadata m
-        ) except +
-        void set_key_value_metadata(
-            vector[map[string, string]] kvm
-        ) except +
-        void set_stats_level(
-            cudf_io_types.statistics_freq sf
-        ) except +
-        void set_compression(
-            cudf_io_types.compression_type compression
-        ) except +
-        void set_int96_timestamps(
-            bool enabled
-        ) except +
-        void set_utc_timestamps(
-            bool enabled
-        ) except +
-        void set_row_group_size_bytes(size_t val) except +
-        void set_row_group_size_rows(size_type val) except +
-        void set_max_page_size_bytes(size_t val) except +
-
void set_max_page_size_rows(size_type val) except + - void set_max_dictionary_size(size_t val) except + - void enable_write_v2_headers(bool val) except + - void set_dictionary_policy(cudf_io_types.dictionary_policy policy) except + @staticmethod chunked_parquet_writer_options_builder builder( cudf_io_types.sink_info sink_, ) except + - cdef cppclass chunked_parquet_writer_options_builder: + cdef cppclass chunked_parquet_writer_options_builder( + parquet_writer_options_builder_base[chunked_parquet_writer_options_builder, + chunked_parquet_writer_options] + ): chunked_parquet_writer_options_builder() except + chunked_parquet_writer_options_builder( cudf_io_types.sink_info sink_, ) except + - chunked_parquet_writer_options_builder& metadata( - cudf_io_types.table_input_metadata m - ) except + - chunked_parquet_writer_options_builder& key_value_metadata( - vector[map[string, string]] kvm - ) except + - chunked_parquet_writer_options_builder& stats_level( - cudf_io_types.statistics_freq sf - ) except + - chunked_parquet_writer_options_builder& compression( - cudf_io_types.compression_type compression - ) except + - chunked_parquet_writer_options_builder& int96_timestamps( - bool enabled - ) except + - chunked_parquet_writer_options_builder& utc_timestamps( - bool enabled - ) except + - chunked_parquet_writer_options_builder& row_group_size_bytes( - size_t val - ) except + - chunked_parquet_writer_options_builder& row_group_size_rows( - size_type val - ) except + - chunked_parquet_writer_options_builder& max_page_size_bytes( - size_t val - ) except + - chunked_parquet_writer_options_builder& max_page_size_rows( - size_type val - ) except + - chunked_parquet_writer_options_builder& max_dictionary_size( - size_t val - ) except + - parquet_writer_options_builder& write_v2_headers( - bool val - ) except + - parquet_writer_options_builder& dictionary_policy( - cudf_io_types.dictionary_policy val - ) except + - - chunked_parquet_writer_options build() except + cdef cppclass parquet_chunked_writer: parquet_chunked_writer() except + From ae12634c834a82d3d8884110c9de07d91877c828 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Mon, 10 Jun 2024 09:51:28 -0400 Subject: [PATCH 068/340] Fix large strings handling in nvtext::character_tokenize (#15829) Fix logic for `nvtext::character_tokenize` to handle large strings input. The output for > 2GB input strings column will turn characters into rows and so will likely overflow the `size_type` rows as expected. The `thrust::count_if` is replaced with a raw kernel to produce the appropriate count that can be checked against max row size. Also changed the API to not accept null rows since the code does not check for them and can return invalid results for inputs with unsanitized-null rows. 
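For illustration, a minimal Python-side sketch of the new contract (assuming
the public `Series.str.character_tokenize` accessor; the example values are
hypothetical):

```python
import cudf

s = cudf.Series(["hello", None, "world"])

# nvtext::character_tokenize no longer accepts nulls, so callers are
# expected to sanitize them first (the cudf accessor change below fills
# nulls with "" before tokenizing):
tokens = s.fillna("").str.character_tokenize()
# one output row per character: h, e, l, l, o, w, o, r, l, d
```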
Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Yunsong Wang (https://github.com/PointKernel)
  - Lawrence Mitchell (https://github.com/wence-)

URL: https://github.com/rapidsai/cudf/pull/15829
---
 cpp/benchmarks/text/tokenize.cpp              |  6 +-
 cpp/include/nvtext/tokenize.hpp               |  3 +-
 cpp/src/text/tokenize.cu                      | 66 ++++++++++++++-----
 cpp/tests/text/tokenize_tests.cpp             | 10 +--
 python/cudf/cudf/core/column/string.py        | 13 ++--
 .../cudf/cudf/tests/text/test_text_methods.py |  2 -
 6 files changed, 66 insertions(+), 34 deletions(-)

diff --git a/cpp/benchmarks/text/tokenize.cpp b/cpp/benchmarks/text/tokenize.cpp
index 2151b28d637..e83310e0343 100644
--- a/cpp/benchmarks/text/tokenize.cpp
+++ b/cpp/benchmarks/text/tokenize.cpp
@@ -39,8 +39,10 @@ static void bench_tokenize(nvbench::state& state)
     state.skip("Skip benchmarks greater than size_type limit");
   }

-  data_profile const profile = data_profile_builder().distribution(
-    cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width);
+  data_profile const profile =
+    data_profile_builder()
+      .distribution(cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width)
+      .no_validity();

   auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile);
   cudf::strings_column_view input(column->view());
diff --git a/cpp/include/nvtext/tokenize.hpp b/cpp/include/nvtext/tokenize.hpp
index ea1b9c716f0..29fed0759c7 100644
--- a/cpp/include/nvtext/tokenize.hpp
+++ b/cpp/include/nvtext/tokenize.hpp
@@ -176,7 +176,8 @@ std::unique_ptr<cudf::column> count_tokens(
 * t is now ["h","e","l","l","o"," ","w","o","r","l","d","g","o","o","d","b","y","e"]
 * @endcode
 *
- * All null row entries are ignored and the output contains all valid rows.
+ * @throw std::invalid_argument if `input` contains nulls
+ * @throw std::overflow_error if the output would produce more than max size_type rows
 *
 * @param input Strings column to tokenize
 * @param stream CUDA stream used for device memory operations and kernel launches
diff --git a/cpp/src/text/tokenize.cu b/cpp/src/text/tokenize.cu
index 0b16305a81a..25406bce759 100644
--- a/cpp/src/text/tokenize.cu
+++ b/cpp/src/text/tokenize.cu
@@ -21,6 +21,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -35,6 +36,7 @@
 #include
 #include
+#include
 #include
 #include
 #include
@@ -99,6 +101,31 @@ std::unique_ptr<cudf::column> tokenize_fn(cudf::size_type strings_count,
   return cudf::strings::detail::make_strings_column(tokens.begin(), tokens.end(), stream, mr);
 }

+constexpr int64_t block_size       = 512;  // number of threads per block
+constexpr int64_t bytes_per_thread = 4;    // bytes processed per thread
+
+CUDF_KERNEL void count_characters(uint8_t const* d_chars, int64_t chars_bytes, int64_t* d_output)
+{
+  auto const idx      = cudf::detail::grid_1d::global_thread_id();
+  auto const byte_idx = static_cast<int64_t>(idx) * bytes_per_thread;
+  auto const lane_idx = static_cast<cudf::size_type>(threadIdx.x);
+
+  using block_reduce = cub::BlockReduce<int64_t, block_size>;
+  __shared__ typename block_reduce::TempStorage temp_storage;
+
+  int64_t count = 0;
+  // each thread processes multiple bytes
+  for (auto i = byte_idx; (i < (byte_idx + bytes_per_thread)) && (i < chars_bytes); ++i) {
+    count += cudf::strings::detail::is_begin_utf8_char(d_chars[i]);
+  }
+  auto const total = block_reduce(temp_storage).Reduce(count, cub::Sum());
+
+  if ((lane_idx == 0) && (total > 0)) {
+    cuda::atomic_ref<int64_t, cuda::thread_scope_device> ref{*d_output};
+    ref.fetch_add(total, cuda::std::memory_order_relaxed);
+  }
+}
+
 }  // namespace

 // detail APIs

@@ -176,11 +203,17 @@ std::unique_ptr<cudf::column> character_tokenize(cudf::strings_column_view const
     return cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING});
   }

-  auto offsets     = strings_column.offsets();
-  auto offset      = cudf::strings::detail::get_offset_value(offsets, strings_column.offset(), stream);
-  auto chars_bytes = cudf::strings::detail::get_offset_value(
-                       offsets, strings_column.offset() + strings_count, stream) -
-                     offset;
+  CUDF_EXPECTS(
+    strings_column.null_count() == 0, "input must not contain nulls", std::invalid_argument);
+
+  auto const offsets = strings_column.offsets();
+  auto const offset =
+    cudf::strings::detail::get_offset_value(offsets, strings_column.offset(), stream);
+  auto const chars_bytes = cudf::strings::detail::get_offset_value(
+                             offsets, strings_column.offset() + strings_count, stream) -
+                           offset;
+  // no bytes -- this could happen in an all-empty column
+  if (chars_bytes == 0) { return cudf::make_empty_column(cudf::type_id::STRING); }

   auto d_chars = strings_column.parent().data<uint8_t>();  // unsigned is necessary for checking bits
   d_chars += offset;
@@ -188,23 +221,26 @@ std::unique_ptr<cudf::column> character_tokenize(cudf::strings_column_view const
   // To minimize memory, count the number of characters so we can
   // build the output offsets without an intermediate buffer.
   // In the worst case each byte is a character so the output is 4x the input.
-  cudf::size_type num_characters = thrust::count_if(
-    rmm::exec_policy(stream), d_chars, d_chars + chars_bytes, [] __device__(uint8_t byte) {
-      return cudf::strings::detail::is_begin_utf8_char(byte);
-    });
+  rmm::device_scalar<int64_t> d_count(0, stream);
+  auto const num_blocks = cudf::util::div_rounding_up_safe(
+    cudf::util::div_rounding_up_safe(chars_bytes, static_cast<int64_t>(bytes_per_thread)),
+    block_size);
+  count_characters<<<num_blocks, block_size, 0, stream.value()>>>(
+    d_chars, chars_bytes, d_count.data());
+  auto const num_characters = d_count.value(stream);

-  // no characters check -- this could happen in all-empty or all-null strings column
-  if (num_characters == 0) {
-    return cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING});
-  }
+  // number of characters becomes the number of rows so need to check the row limit
+  CUDF_EXPECTS(
+    num_characters + 1 < static_cast<int64_t>(std::numeric_limits<cudf::size_type>::max()),
+    "output exceeds the column size limit",
+    std::overflow_error);

   // create output offsets column
-  // -- conditionally copy a counting iterator where
-  // the first byte of each character is located
   auto offsets_column = cudf::make_numeric_column(
     offsets.type(), num_characters + 1, cudf::mask_state::UNALLOCATED, stream, mr);
   auto d_new_offsets =
     cudf::detail::offsetalator_factory::make_output_iterator(offsets_column->mutable_view());
+  // offsets are at the beginning byte of each character
   cudf::detail::copy_if_safe(
     thrust::counting_iterator<int64_t>(0),
     thrust::counting_iterator<int64_t>(chars_bytes + 1),
diff --git a/cpp/tests/text/tokenize_tests.cpp b/cpp/tests/text/tokenize_tests.cpp
index 6a6bcda87cc..a59a54169d7 100644
--- a/cpp/tests/text/tokenize_tests.cpp
+++ b/cpp/tests/text/tokenize_tests.cpp
@@ -111,17 +111,13 @@ TEST_F(TextTokenizeTest, TokenizeErrorTest)

 TEST_F(TextTokenizeTest, CharacterTokenize)
 {
-  std::vector<char const*> h_strings{"the mousé ate the cheese", nullptr, ""};
-  cudf::test::strings_column_wrapper strings(
-    h_strings.begin(),
-    h_strings.end(),
-    thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }));
+  cudf::test::strings_column_wrapper input({"the mousé ate the cheese", ""});

   cudf::test::strings_column_wrapper expected{"t", "h", "e", " ", "m", "o",
"u", "s", "é", " ", "a", "t", "e", " ", "t", "h", "e", " ", "c", "h", "e", "e", "s", "e"}; - auto results = nvtext::character_tokenize(cudf::strings_column_view(strings)); + auto results = nvtext::character_tokenize(cudf::strings_column_view(input)); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } @@ -151,8 +147,6 @@ TEST_F(TextTokenizeTest, TokenizeEmptyTest) EXPECT_EQ(results->size(), 0); results = nvtext::character_tokenize(all_empty); EXPECT_EQ(results->size(), 0); - results = nvtext::character_tokenize(all_null); - EXPECT_EQ(results->size(), 0); auto const delimiter = cudf::string_scalar{""}; results = nvtext::tokenize_with_vocabulary(view, all_empty, delimiter); EXPECT_EQ(results->size(), 0); diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index d12aa80e9a3..ad7dbe5e52e 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -552,16 +552,17 @@ def join( return self._return_or_inplace(data) def _split_by_character(self): - result_col = libstrings.character_tokenize(self._column) + col = self._column.fillna("") # sanitize nulls + result_col = libstrings.character_tokenize(col) - offset_col = self._column.children[0] + offset_col = col.children[0] return cudf.core.column.ListColumn( - size=len(self._column), - dtype=cudf.ListDtype(self._column.dtype), - mask=self._column.mask, + size=len(col), + dtype=cudf.ListDtype(col.dtype), + mask=col.mask, offset=0, - null_count=self._column.null_count, + null_count=0, children=(offset_col, result_col), ) diff --git a/python/cudf/cudf/tests/text/test_text_methods.py b/python/cudf/cudf/tests/text/test_text_methods.py index 6bd3b99bae1..36f7f3de828 100644 --- a/python/cudf/cudf/tests/text/test_text_methods.py +++ b/python/cudf/cudf/tests/text/test_text_methods.py @@ -426,7 +426,6 @@ def test_character_tokenize_series(): [ "hello world", "sdf", - None, ( "goodbye, one-two:three~four+five_six@sev" "en#eight^nine heŒŽ‘•™œ$µ¾ŤƠé DŽ" @@ -543,7 +542,6 @@ def test_character_tokenize_index(): [ "hello world", "sdf", - None, ( "goodbye, one-two:three~four+five_six@sev" "en#eight^nine heŒŽ‘•™œ$µ¾ŤƠé DŽ" From 9b2c35f346b91b598238cbf54e40a463820708c0 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Mon, 10 Jun 2024 11:40:08 -0500 Subject: [PATCH 069/340] Support arbitrary CUDA versions in UDF code (#15950) This PR eliminates the manual mapping from PTX versions to CUDA versions, to help support CUDA 12.5 and newer without requiring a manual update to `_numba.py` for every CUDA release. This also updates the minimum compute capability PTX file from arch 60 to arch 70, since that is now the minimum required by RAPIDS. 
Authors: - Bradley Dice (https://github.com/bdice) Approvers: - Graham Markall (https://github.com/gmarkall) - https://github.com/brandon-b-miller URL: https://github.com/rapidsai/cudf/pull/15950 --- .../_lib/pylibcudf/libcudf/strings_udf.pxd | 1 + python/cudf/cudf/_lib/strings_udf.pyx | 5 ++ python/cudf/cudf/utils/_numba.py | 84 +++---------------- python/cudf/udf_cpp/CMakeLists.txt | 2 +- .../include/cudf/strings/udf/udf_apis.hpp | 9 +- .../strings/src/strings/udf/udf_apis.cu | 2 + 6 files changed, 30 insertions(+), 73 deletions(-) diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings_udf.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings_udf.pxd index b895d5e6925..804ad30dfb1 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings_udf.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings_udf.pxd @@ -18,6 +18,7 @@ cdef extern from "cudf/strings/udf/udf_string.hpp" namespace \ cdef extern from "cudf/strings/udf/udf_apis.hpp" namespace \ "cudf::strings::udf" nogil: + cdef int get_cuda_build_version() except + cdef unique_ptr[device_buffer] to_string_view_array(column_view) except + cdef unique_ptr[column] column_from_udf_string_array( udf_string* strings, size_type size, diff --git a/python/cudf/cudf/_lib/strings_udf.pyx b/python/cudf/cudf/_lib/strings_udf.pyx index e952492c45d..7610cad0b40 100644 --- a/python/cudf/cudf/_lib/strings_udf.pyx +++ b/python/cudf/cudf/_lib/strings_udf.pyx @@ -22,11 +22,16 @@ from cudf._lib.pylibcudf.libcudf.column.column cimport column, column_view from cudf._lib.pylibcudf.libcudf.strings_udf cimport ( column_from_udf_string_array as cpp_column_from_udf_string_array, free_udf_string_array as cpp_free_udf_string_array, + get_cuda_build_version as cpp_get_cuda_build_version, to_string_view_array as cpp_to_string_view_array, udf_string, ) +def get_cuda_build_version(): + return cpp_get_cuda_build_version() + + def column_to_string_view_array(Column strings_col): cdef unique_ptr[device_buffer] c_buffer cdef column_view input_view = strings_col.view() diff --git a/python/cudf/cudf/utils/_numba.py b/python/cudf/cudf/utils/_numba.py index 494b48b3cfd..d9dde58d998 100644 --- a/python/cudf/cudf/utils/_numba.py +++ b/python/cudf/cudf/utils/_numba.py @@ -12,16 +12,14 @@ # strings_udf. 
This is the easiest way to break an otherwise circular import # loop of _lib.*->cudautils->_numba->_lib.strings_udf @lru_cache -def _get_cc_60_ptx_file(): +def _get_cuda_build_version(): from cudf._lib import strings_udf - return os.path.join( - os.path.dirname(strings_udf.__file__), - "..", - "core", - "udf", - "shim_60.ptx", - ) + # The version is an integer, parsed as 1000 * major + 10 * minor + cuda_build_version = strings_udf.get_cuda_build_version() + cuda_major_version = cuda_build_version // 1000 + cuda_minor_version = (cuda_build_version % 1000) // 10 + return (cuda_major_version, cuda_minor_version) def _get_best_ptx_file(archs, max_compute_capability): @@ -38,8 +36,8 @@ def _get_best_ptx_file(archs, max_compute_capability): def _get_ptx_file(path, prefix): if "RAPIDS_NO_INITIALIZE" in os.environ: - # cc=60 ptx is always built - cc = int(os.environ.get("STRINGS_UDF_CC", "60")) + # cc=70 ptx is always built + cc = int(os.environ.get("STRINGS_UDF_CC", "70")) else: from numba import cuda @@ -120,15 +118,13 @@ def _setup_numba(): versions = safe_get_versions() if versions != NO_DRIVER: driver_version, runtime_version = versions - ptx_toolkit_version = _get_cuda_version_from_ptx_file( - _get_cc_60_ptx_file() - ) + shim_ptx_cuda_version = _get_cuda_build_version() # MVC is required whenever any PTX is newer than the driver - # This could be the shipped PTX file or the PTX emitted by - # the version of NVVM on the user system, the latter aligning - # with the runtime version - if (driver_version < ptx_toolkit_version) or ( + # This could be the shipped shim PTX file (determined by the CUDA + # version used at build time) or the PTX emitted by the version of NVVM + # on the user system (determined by the user's CUDA runtime version) + if (driver_version < shim_ptx_cuda_version) or ( driver_version < runtime_version ): if driver_version < (12, 0): @@ -139,60 +135,6 @@ def _setup_numba(): patch_numba_linker() -def _get_cuda_version_from_ptx_file(path): - """ - https://docs.nvidia.com/cuda/parallel-thread-execution/ - Each PTX module must begin with a .version - directive specifying the PTX language version - - example header: - // - // Generated by NVIDIA NVVM Compiler - // - // Compiler Build ID: CL-31057947 - // Cuda compilation tools, release 11.6, V11.6.124 - // Based on NVVM 7.0.1 - // - - .version 7.6 - .target sm_52 - .address_size 64 - - """ - with open(path) as ptx_file: - for line in ptx_file: - if line.startswith(".version"): - ver_line = line - break - else: - raise ValueError("Could not read CUDA version from ptx file.") - version = ver_line.strip("\n").split(" ")[1] - # This dictionary maps from supported versions of NVVM to the - # PTX version it produces. The lowest value should be the minimum - # CUDA version required to compile the library. Currently CUDA 11.5 - # or higher is required to build cudf. New CUDA versions should - # be added to this dictionary when officially supported. 
- ver_map = { - "7.5": (11, 5), - "7.6": (11, 6), - "7.7": (11, 7), - "7.8": (11, 8), - "8.0": (12, 0), - "8.1": (12, 1), - "8.2": (12, 2), - "8.3": (12, 3), - "8.4": (12, 4), - } - - cuda_ver = ver_map.get(version) - if cuda_ver is None: - raise ValueError( - f"Could not map PTX version {version} to a CUDA version" - ) - - return cuda_ver - - class _CUDFNumbaConfig: def __enter__(self): self.CUDA_LOW_OCCUPANCY_WARNINGS = ( diff --git a/python/cudf/udf_cpp/CMakeLists.txt b/python/cudf/udf_cpp/CMakeLists.txt index fe7f9d0b00d..fa7855cfc65 100644 --- a/python/cudf/udf_cpp/CMakeLists.txt +++ b/python/cudf/udf_cpp/CMakeLists.txt @@ -60,7 +60,7 @@ set(SHIM_CUDA_FLAGS --expt-relaxed-constexpr -rdc=true) # always build a default PTX file in case RAPIDS_NO_INITIALIZE is set and the device cc can't be # safely queried through a context -list(INSERT CMAKE_CUDA_ARCHITECTURES 0 "60") +list(INSERT CMAKE_CUDA_ARCHITECTURES 0 "70") list(TRANSFORM CMAKE_CUDA_ARCHITECTURES REPLACE "-real" "") list(TRANSFORM CMAKE_CUDA_ARCHITECTURES REPLACE "-virtual" "") diff --git a/python/cudf/udf_cpp/strings/include/cudf/strings/udf/udf_apis.hpp b/python/cudf/udf_cpp/strings/include/cudf/strings/udf/udf_apis.hpp index 219dbe27682..8635b1280de 100644 --- a/python/cudf/udf_cpp/strings/include/cudf/strings/udf/udf_apis.hpp +++ b/python/cudf/udf_cpp/strings/include/cudf/strings/udf/udf_apis.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -27,6 +27,13 @@ namespace cudf { namespace strings { namespace udf { +/** + * @brief Get the CUDA version used at build time. + * + * @return The CUDA version as an integer, parsed as major * 1000 + minor * 10. 
+ */ +int get_cuda_build_version(); + class udf_string; /** diff --git a/python/cudf/udf_cpp/strings/src/strings/udf/udf_apis.cu b/python/cudf/udf_cpp/strings/src/strings/udf/udf_apis.cu index 9cf86b5ea48..941e61e6787 100644 --- a/python/cudf/udf_cpp/strings/src/strings/udf/udf_apis.cu +++ b/python/cudf/udf_cpp/strings/src/strings/udf/udf_apis.cu @@ -101,6 +101,8 @@ void free_udf_string_array(cudf::strings::udf::udf_string* d_strings, // external APIs +int get_cuda_build_version() { return CUDA_VERSION; } + std::unique_ptr to_string_view_array(cudf::column_view const input) { return detail::to_string_view_array(input, cudf::get_default_stream()); From e3ba131baf340dfcf575abc99a872cdb36671307 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 10 Jun 2024 06:48:41 -1000 Subject: [PATCH 070/340] Support timezone aware pandas inputs in cudf (#15935) closes #13611 (This technically does not support pandas objects have interval types that are timezone aware) @rjzamora let me know if the test I adapted from your PR in https://github.com/rapidsai/cudf/pull/15929 is adequate Authors: - Matthew Roeschke (https://github.com/mroeschke) - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/15935 --- python/cudf/cudf/core/column/column.py | 27 +++++-------------- python/cudf/cudf/core/index.py | 11 +++----- .../cudf/tests/series/test_datetimelike.py | 13 +++++++++ python/cudf/cudf/tests/test_datetime.py | 26 +++--------------- .../dask_cudf/io/tests/test_parquet.py | 20 ++++++++++++++ 5 files changed, 48 insertions(+), 49 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 475d52d0fbb..f87797a1fa3 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -332,10 +332,6 @@ def from_arrow(cls, array: pa.Array) -> ColumnBase: "yet supported in pyarrow, see: " "https://github.com/apache/arrow/issues/20213" ) - elif pa.types.is_timestamp(array.type) and array.type.tz is not None: - raise NotImplementedError( - "cuDF does not yet support timezone-aware datetimes" - ) elif isinstance(array.type, ArrowIntervalType): return cudf.core.column.IntervalColumn.from_arrow(array) elif pa.types.is_large_string(array.type): @@ -992,9 +988,9 @@ def astype(self, dtype: Dtype, copy: bool = False) -> ColumnBase: return col elif isinstance(dtype, cudf.core.dtypes.DecimalDtype): return col.as_decimal_column(dtype) - elif np.issubdtype(cast(Any, dtype), np.datetime64): + elif dtype.kind == "M": return col.as_datetime_column(dtype) - elif np.issubdtype(cast(Any, dtype), np.timedelta64): + elif dtype.kind == "m": return col.as_timedelta_column(dtype) elif dtype.kind == "O": if cudf.get_option("mode.pandas_compatible") and was_object: @@ -1846,21 +1842,11 @@ def as_column( and arbitrary.freq is not None ): raise NotImplementedError("freq is not implemented yet") - elif ( - isinstance(arbitrary.dtype, pd.DatetimeTZDtype) - or ( - isinstance(arbitrary.dtype, pd.IntervalDtype) - and isinstance(arbitrary.dtype.subtype, pd.DatetimeTZDtype) - ) - or ( - isinstance(arbitrary.dtype, pd.CategoricalDtype) - and isinstance( - arbitrary.dtype.categories.dtype, pd.DatetimeTZDtype - ) - ) + elif isinstance(arbitrary.dtype, pd.IntervalDtype) and isinstance( + arbitrary.dtype.subtype, pd.DatetimeTZDtype ): raise NotImplementedError( - "cuDF does not yet support timezone-aware datetimes" + "cuDF 
does not yet support Intervals with timezone-aware datetimes" ) elif _is_pandas_nullable_extension_dtype(arbitrary.dtype): if cudf.get_option("mode.pandas_compatible"): @@ -1876,7 +1862,8 @@ def as_column( length=length, ) elif isinstance( - arbitrary.dtype, (pd.CategoricalDtype, pd.IntervalDtype) + arbitrary.dtype, + (pd.CategoricalDtype, pd.IntervalDtype, pd.DatetimeTZDtype), ): return as_column( pa.array(arbitrary, from_pandas=True), diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 7297ac4e929..732e5cdb01a 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -1757,13 +1757,10 @@ def __init__( name = _getdefault_name(data, name=name) data = column.as_column(data) - # TODO: Remove this if statement and fix tests now that - # there's timezone support - if isinstance(data.dtype, pd.DatetimeTZDtype): - raise NotImplementedError( - "cuDF does not yet support timezone-aware datetimes" - ) - data = data.astype(dtype) + # TODO: if data.dtype.kind == "M" (i.e. data is already datetime type) + # We probably shouldn't always astype to datetime64[ns] + if not isinstance(data.dtype, pd.DatetimeTZDtype): + data = data.astype(dtype) if copy: data = data.copy() diff --git a/python/cudf/cudf/tests/series/test_datetimelike.py b/python/cudf/cudf/tests/series/test_datetimelike.py index 7ef55761b2b..58ffc610c3c 100644 --- a/python/cudf/cudf/tests/series/test_datetimelike.py +++ b/python/cudf/cudf/tests/series/test_datetimelike.py @@ -223,3 +223,16 @@ def test_contains_tz_aware(item, expected): def test_tz_convert_naive_typeerror(): with pytest.raises(TypeError): cudf.date_range("2020", periods=2, freq="D").tz_convert(None) + + +@pytest.mark.parametrize( + "klass", ["Series", "DatetimeIndex", "Index", "CategoricalIndex"] +) +def test_from_pandas_obj_tz_aware(klass): + tz_aware_data = [ + pd.Timestamp("2020-01-01", tz="UTC").tz_convert("US/Pacific") + ] + pandas_obj = getattr(pd, klass)(tz_aware_data) + result = cudf.from_pandas(pandas_obj) + expected = getattr(cudf, klass)(tz_aware_data) + assert_eq(result, expected) diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index 4186fff038a..e3ecaafae5b 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -2088,25 +2088,6 @@ def test_datetime_constructor(data, dtype): assert_eq(expected, actual) -@pytest.mark.parametrize( - "data", - [ - [pd.Timestamp("2001-01-01", tz="America/New_York")], - pd.Series(["2001-01-01"], dtype="datetime64[ns, America/New_York]"), - pd.Index(["2001-01-01"], dtype="datetime64[ns, America/New_York]"), - ], -) -def test_construction_from_tz_timestamps(data): - with pytest.raises(NotImplementedError): - _ = cudf.Series(data) - with pytest.raises(NotImplementedError): - _ = cudf.Index(data) - with pytest.raises(NotImplementedError): - _ = cudf.DatetimeIndex(data) - with pytest.raises(NotImplementedError): - cudf.CategoricalIndex(data) - - @pytest.mark.parametrize("op", _cmpops) def test_datetime_binop_tz_timestamp(op): s = cudf.Series([1, 2, 3], dtype="datetime64[ns]") @@ -2391,13 +2372,14 @@ def test_datetime_raise_warning(freqstr): t.dt.ceil(freqstr) -def test_timezone_array_notimplemented(): +def test_timezone_pyarrow_array(): pa_array = pa.array( [datetime.datetime(2020, 1, 1, tzinfo=datetime.timezone.utc)], type=pa.timestamp("ns", "UTC"), ) - with pytest.raises(NotImplementedError): - cudf.Series(pa_array) + result = cudf.Series(pa_array) + expected = pa_array.to_pandas() + 
assert_eq(result, expected)


 def test_to_datetime_errors_ignore_deprecated():
diff --git a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py
index 39800145585..f3e3911e6c7 100644
--- a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py
+++ b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py
@@ -596,3 +596,23 @@ def test_parquet_read_filter_and_project(tmpdir):
     # Check result
     expected = df[(df.a == 5) & (df.c > 20)][columns].reset_index(drop=True)
     dd.assert_eq(got, expected)
+
+
+def test_timezone_column(tmpdir):
+    path = str(tmpdir.join("test.parquet"))
+    pdf = pd.DataFrame(
+        {
+            "time": pd.to_datetime(
+                ["1996-01-02", "1996-12-01"],
+                utc=True,
+            ),
+            "x": [1, 2],
+        }
+    )
+    pdf.to_parquet(path)
+    got = dask_cudf.read_parquet(path)
+    # cudf.read_parquet does not support reading timezone aware types yet
+    assert got["time"].dtype == pd.DatetimeTZDtype("ns", "UTC")
+    got["time"] = got["time"].astype("datetime64[ns]")
+    expected = cudf.read_parquet(path)
+    dd.assert_eq(got, expected)

From f9b0fc3d1986d5ac8994c09229d62063854c0856 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Mon, 10 Jun 2024 08:34:15 -1000
Subject: [PATCH 071/340] Preserve column type and class information in more
 DataFrame operations (#15949)

This narrows down to a pattern of using
`ColumnAccessor._from_columns_like_self` to preserve the column information
and then calling `Frame._from_data_like_self` to preserve the
`.index`/`.name` information. This is specifically for operations that
operate column-wise, where the result should have the same shape as the
input.

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Lawrence Mitchell (https://github.com/wence-)

URL: https://github.com/rapidsai/cudf/pull/15949
---
 python/cudf/cudf/core/dataframe.py       |   3 +-
 python/cudf/cudf/core/indexed_frame.py   | 131 +++++++++++------------
 python/cudf/cudf/core/window/rolling.py  |  41 ++-----
 python/cudf/cudf/tests/test_dataframe.py |  12 ++-
 4 files changed, 83 insertions(+), 104 deletions(-)

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 9307267b227..e1b6cc45dd3 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -2688,6 +2688,7 @@ def _set_columns_like(self, other: ColumnAccessor) -> None:
         self._data = ColumnAccessor(
             data=dict(zip(other.names, self._data.columns)),
             multiindex=other.multiindex,
+            rangeindex=other.rangeindex,
             level_names=other.level_names,
             label_dtype=other.label_dtype,
             verify=False,
@@ -7534,7 +7535,7 @@ def _sample_axis_1(
     def _from_columns_like_self(
         self,
         columns: List[ColumnBase],
-        column_names: abc.Iterable[str],
+        column_names: Optional[abc.Iterable[str]] = None,
         index_names: Optional[List[str]] = None,
         *,
         override_dtypes: Optional[abc.Iterable[Optional[Dtype]]] = None,
diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
index d898eb4b9c3..fdc78005996 100644
--- a/python/cudf/cudf/core/indexed_frame.py
+++ b/python/cudf/cudf/core/indexed_frame.py
@@ -40,8 +40,6 @@
 from cudf.api.extensions import no_default
 from cudf.api.types import (
     _is_non_decimal_numeric_dtype,
-    is_bool_dtype,
-    is_decimal_dtype,
     is_dict_like,
     is_list_like,
     is_scalar,
@@ -372,7 +370,6 @@ def _mimic_inplace(
             self._index = result.index
         return super()._mimic_inplace(result, inplace)

-    # Scans
     @_cudf_nvtx_annotate
     def _scan(self, op, axis=None, skipna=True):
         """
@@ -417,8 +414,8 @@ def _scan(self, op, 
axis=None, skipna=True): cast_to_int = op in ("cumsum", "cumprod") skipna = True if skipna is None else skipna - results = {} - for name, col in self._data.items(): + results = [] + for col in self._columns: if skipna: result_col = col.nans_to_nulls() else: @@ -429,19 +426,14 @@ def _scan(self, op, axis=None, skipna=True): else: result_col = col - if ( - cast_to_int - and not is_decimal_dtype(result_col.dtype) - and ( - np.issubdtype(result_col.dtype, np.integer) - or np.issubdtype(result_col.dtype, np.bool_) - ) - ): + if cast_to_int and result_col.dtype.kind in "uib": # For reductions that accumulate a value (e.g. sum, not max) # pandas returns an int64 dtype for all int or bool dtypes. result_col = result_col.astype(np.int64) - results[name] = getattr(result_col, op)() - return self._from_data(results, self.index) + results.append(getattr(result_col, op)()) + return self._from_data_like_self( + self._data._from_columns_like_self(results) + ) def _check_data_index_length_match(self) -> None: # Validate that the number of rows in the data matches the index if the @@ -880,7 +872,6 @@ def replace( FutureWarning, ) if not (to_replace is None and value is no_default): - copy_data = {} ( all_na_per_column, to_replace_per_column, @@ -890,10 +881,10 @@ def replace( value=value, columns_dtype_map=dict(self._dtypes), ) - + copy_data = [] for name, col in self._data.items(): try: - copy_data[name] = col.find_and_replace( + replaced = col.find_and_replace( to_replace_per_column[name], replacements_per_column[name], all_na_per_column[name], @@ -906,11 +897,13 @@ def replace( # that exists in `copy_data`. # ii. There is an OverflowError while trying to cast # `to_replace_per_column` to `replacements_per_column`. - copy_data[name] = col.copy(deep=True) + replaced = col.copy(deep=True) + copy_data.append(replaced) + result = self._from_data_like_self( + self._data._from_columns_like_self(copy_data) + ) else: - copy_data = self._data.copy(deep=True) - - result = self._from_data(copy_data, self.index) + result = self.copy() return self._mimic_inplace(result, inplace=inplace) @@ -1031,12 +1024,13 @@ def clip(self, lower=None, upper=None, inplace=False, axis=1): ): lower[0], upper[0] = upper[0], lower[0] - data = { - name: col.clip(lower[i], upper[i]) - for i, (name, col) in enumerate(self._data.items()) - } - output = self._from_data(data, self.index) - output._copy_type_metadata(self, include_index=False) + data = ( + col.clip(low, high) + for col, low, high in zip(self._columns, lower, upper) + ) + output = self._from_data_like_self( + self._data._from_columns_like_self(data) + ) return self._mimic_inplace(output, inplace=inplace) @_cudf_nvtx_annotate @@ -1913,7 +1907,7 @@ def nans_to_nulls(self): 2 """ result = [] - for col in self._data.columns: + for col in self._columns: converted = col.nans_to_nulls() if converted is col: converted = converted.copy() @@ -2028,8 +2022,8 @@ def interpolate( ) interpolator = cudf.core.algorithms.get_column_interpolator(method) - columns = {} - for colname, col in data._data.items(): + columns = [] + for col in data._columns: if isinstance(col, cudf.core.column.StringColumn): warnings.warn( f"{type(self).__name__}.interpolate with object dtype is " @@ -2040,9 +2034,12 @@ def interpolate( col = col.astype("float64").fillna(np.nan) # Interpolation methods may or may not need the index - columns[colname] = interpolator(col, index=data.index) + columns.append(interpolator(col, index=data.index)) - result = self._from_data(columns, index=data.index) + result = 
self._from_data_like_self( + self._data._from_columns_like_self(columns) + ) + result.index = data.index return ( result @@ -2069,8 +2066,8 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None): data_columns = ( col.shift(periods, fill_value) for col in self._columns ) - return self.__class__._from_data( - zip(self._column_names, data_columns), self.index + return self._from_data_like_self( + self._data._from_columns_like_self(data_columns) ) @_cudf_nvtx_annotate @@ -3011,8 +3008,6 @@ def _slice(self, arg: slice, keep_index: bool = True) -> Self: self._column_names, None if has_range_index or not keep_index else self.index.names, ) - result._data.label_dtype = self._data.label_dtype - result._data.rangeindex = self._data.rangeindex if keep_index and has_range_index: result.index = self.index[start:stop] @@ -3561,11 +3556,6 @@ def sort_values( ), keep_index=not ignore_index, ) - if ( - isinstance(self, cudf.core.dataframe.DataFrame) - and self._data.multiindex - ): - out.columns = self._data.to_pandas_index() return out def _n_largest_or_smallest( @@ -3659,14 +3649,12 @@ def _align_to_index( result = result.sort_values(sort_col_id) del result[sort_col_id] - result = self.__class__._from_data( - data=result._data, index=result.index + out = self._from_data( + self._data._from_columns_like_self(result._columns) ) - result._data.multiindex = self._data.multiindex - result._data._level_names = self._data._level_names - result.index.names = self.index.names - - return result + out.index = result.index + out.index.names = self.index.names + return out @_cudf_nvtx_annotate def _reindex( @@ -3898,24 +3886,14 @@ def round(self, decimals=0, how="half_even"): "decimals must be an integer, a dict-like or a Series" ) - cols = { - name: col.round(decimals[name], how=how) - if ( - name in decimals - and _is_non_decimal_numeric_dtype(col.dtype) - and not is_bool_dtype(col.dtype) - ) + cols = ( + col.round(decimals[name], how=how) + if name in decimals and col.dtype.kind in "fiu" else col.copy(deep=True) for name, col in self._data.items() - } - - return self.__class__._from_data( - data=cudf.core.column_accessor.ColumnAccessor( - cols, - multiindex=self._data.multiindex, - level_names=self._data.level_names, - ), - index=self.index, + ) + return self._from_data_like_self( + self._data._from_columns_like_self(cols) ) def resample( @@ -6238,6 +6216,8 @@ def rank( f"axis={axis} is not yet supported in rank" ) + num_cols = self._num_columns + dropped_cols = False source = self if numeric_only: if isinstance( @@ -6255,15 +6235,28 @@ def rank( source = self._get_columns_by_label(numeric_cols) if source.empty: return source.astype("float64") + elif source._num_columns != num_cols: + dropped_cols = True result_columns = libcudf.sort.rank_columns( [*source._columns], method_enum, na_option, ascending, pct ) - return self.__class__._from_data( - dict(zip(source._column_names, result_columns)), - index=source.index, - ).astype(np.float64) + if dropped_cols: + result = type(source)._from_data( + ColumnAccessor( + dict(zip(source._column_names, result_columns)), + multiindex=self._data.multiindex, + level_names=self._data.level_names, + label_dtype=self._data.label_dtype, + ), + ) + else: + result = source._from_data_like_self( + self._data._from_columns_like_self(result_columns) + ) + result.index = source.index + return result.astype(np.float64) def convert_dtypes( self, diff --git a/python/cudf/cudf/core/window/rolling.py b/python/cudf/cudf/core/window/rolling.py index 2037b1682db..7d140a1ffa5 100644 --- 
a/python/cudf/cudf/core/window/rolling.py +++ b/python/cudf/cudf/core/window/rolling.py @@ -1,7 +1,5 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION -import itertools - import numba import pandas as pd from pandas.api.indexers import BaseIndexer @@ -251,27 +249,13 @@ def _apply_agg_column(self, source_column, agg_name): agg_params=self.agg_params, ) - def _apply_agg_dataframe(self, df, agg_name): - return cudf.DataFrame._from_data( - { - col_name: self._apply_agg_column(col, agg_name) - for col_name, col in df._data.items() - }, - index=df.index, - ) - def _apply_agg(self, agg_name): - if isinstance(self.obj, cudf.Series): - return cudf.Series._from_data( - { - self.obj.name: self._apply_agg_column( - self.obj._column, agg_name - ) - }, - index=self.obj.index, - ) - else: - return self._apply_agg_dataframe(self.obj, agg_name) + applied = ( + self._apply_agg_column(col, agg_name) for col in self.obj._columns + ) + return self.obj._from_data_like_self( + self.obj._data._from_columns_like_self(applied) + ) def _reduce( self, @@ -533,18 +517,9 @@ def _window_to_window_sizes(self, window): ) def _apply_agg(self, agg_name): - index = cudf.MultiIndex.from_frame( - cudf.DataFrame( - { - key: value - for key, value in itertools.chain( - self._group_keys._data.items(), - self.obj.index._data.items(), - ) - } - ) + index = cudf.MultiIndex._from_data( + {**self._group_keys._data, **self.obj.index._data} ) - result = super()._apply_agg(agg_name) result.index = index return result diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index d76d5eb8065..98e9f9881c7 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -10980,7 +10980,7 @@ def test_squeeze(axis, data): assert_eq(result, expected) -@pytest.mark.parametrize("column", [range(1), np.array([1], dtype=np.int8)]) +@pytest.mark.parametrize("column", [range(1, 2), np.array([1], dtype=np.int8)]) @pytest.mark.parametrize( "operation", [ @@ -10991,6 +10991,16 @@ def test_squeeze(axis, data): lambda df: abs(df), lambda df: -df, lambda df: ~df, + lambda df: df.cumsum(), + lambda df: df.replace(1, 2), + lambda df: df.replace(10, 20), + lambda df: df.clip(0, 10), + lambda df: df.rolling(1).mean(), + lambda df: df.interpolate(), + lambda df: df.shift(), + lambda df: df.sort_values(1), + lambda df: df.round(), + lambda df: df.rank(), ], ) def test_op_preserves_column_metadata(column, operation): From 58a15a84078c42b331ced4fd4384724d42328258 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 10 Jun 2024 11:42:11 -0700 Subject: [PATCH 072/340] Explicitly build for all GPU architectures (#15959) The libcudf conda package is not specifying to build for all supported architectures and is instead letting build.sh fall back to NATIVE. However, because the default behavior of rapids-cmake is to build SASS for all supported architectures if NATIVE is specified but no local architecture is detected, we're still ending up with all of the RAPIDS architectures having SASS built for them. The problem is that we are failing to build PTX for the latest version, which would be produced if we used RAPIDS instead of NATIVE. This PR should resolve that issue. 
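A rough model of the two modes described above (illustrative only, not
rapids-cmake itself; the architecture list is hypothetical):

```python
SUPPORTED_ARCHS = [70, 75, 80, 86, 90]  # hypothetical supported list

def embedded_targets(mode: str) -> list[str]:
    # With no local GPU detected at build time, both modes emit SASS for
    # every supported architecture...
    targets = [f"sm_{arch}" for arch in SUPPORTED_ARCHS]
    if mode == "RAPIDS":
        # ...but only RAPIDS also embeds PTX for the newest architecture,
        # which is what provides forward compatibility on future GPUs.
        targets.append(f"compute_{SUPPORTED_ARCHS[-1]}")
    return targets
```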
Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Robert Maynard (https://github.com/robertmaynard)
  - James Lamb (https://github.com/jameslamb)

URL: https://github.com/rapidsai/cudf/pull/15959
---
 conda/recipes/libcudf/build.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/conda/recipes/libcudf/build.sh b/conda/recipes/libcudf/build.sh
index fef3dabd733..a3a0415575b 100644
--- a/conda/recipes/libcudf/build.sh
+++ b/conda/recipes/libcudf/build.sh
@@ -5,5 +5,5 @@ export cudf_ROOT="$(realpath ./cpp/build)"

 ./build.sh -n -v \
     libcudf libcudf_kafka benchmarks tests \
-    --build_metrics --incl_cache_stats \
+    --build_metrics --incl_cache_stats --allgpuarch \
     --cmake-args=\"-DCMAKE_INSTALL_LIBDIR=lib -DCUDF_ENABLE_ARROW_S3=ON\"

From 719a8a6934ae5eaeb22764d1bfdeb75893750bae Mon Sep 17 00:00:00 2001
From: Ray Bell
Date: Mon, 10 Jun 2024 15:57:17 -0400
Subject: [PATCH 073/340] Update PandasCompat.py to resolve references (#15704)

This PR allows the PandasCompat Sphinx extension to contain resolved
references. For example, you can now add intersphinx mappings to the content
of the admonition.

### Motivation

I enjoy connecting the PyData communities, and this PR allows for more
opportunities to use intersphinx mapping to link back to the pandas docs.

### History

I first tried this in a previous PR
(https://github.com/rapidsai/cudf/pull/15383#discussion_r1537888240) and
commented here
(https://github.com/rapidsai/cudf/pull/15383#issuecomment-2028451487) that I
may get around to investigating this further. I finally had time to work on
this and made a bit of progress.

### Testing

I created a separate repo for this at
https://github.com/raybellwaves/compatsphinxext, which deploys straight to
https://raybellwaves.github.io/compatsphinxext. You can see it working as
expected at https://raybellwaves.github.io/compatsphinxext/compat.html. You
should be able to fork that and tinker pretty quickly.

### Further work

This could be cleaned up (for example, I couldn't get the [source] link to
display in the admonition, as I worked from the latest Sphinx todo extension
(https://github.com/sphinx-doc/sphinx/blob/master/sphinx/ext/todo.py)). The
existing pandas-compat admonitions could be switched to this if agreed. In
addition, the documentation around how to write pandas-compat entries going
forward
(https://github.com/rapidsai/cudf/blob/branch-24.06/docs/cudf/source/developer_guide/documentation.md#comparing-to-pandas)
will also have to be updated.

Longer term, the extension could be published and used across RAPIDS
libraries wherever there are differences in compatibility with PyData
libraries (e.g. pandas, networkx, scikit-learn) to simplify linking to those
docs. I'm not sure if I'll have time to work on this though.
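For context, a hypothetical docstring using the `pandas-compat` directive this
extension registers; the `:meth:` reference inside the admonition is the kind
of intersphinx content this PR makes resolvable (the compat text itself is
illustrative only):

```python
def mean(self, axis=0):
    """Return the mean of the values.

    .. pandas-compat::

        Unlike :meth:`pandas.DataFrame.mean`, nulls are skipped here;
        this admonition body is an illustrative example only.
    """
```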
Authors: - Ray Bell (https://github.com/raybellwaves) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/15704 --- docs/cudf/source/_ext/PandasCompat.py | 143 +++++++++++++++++--------- docs/cudf/source/conf.py | 2 + 2 files changed, 94 insertions(+), 51 deletions(-) diff --git a/docs/cudf/source/_ext/PandasCompat.py b/docs/cudf/source/_ext/PandasCompat.py index af2b16035c3..331495c981e 100644 --- a/docs/cudf/source/_ext/PandasCompat.py +++ b/docs/cudf/source/_ext/PandasCompat.py @@ -1,14 +1,20 @@ -# Copyright (c) 2021-2022, NVIDIA CORPORATION +# Copyright (c) 2021-2024, NVIDIA CORPORATION # This file is adapted from official sphinx tutorial for `todo` extension: # https://www.sphinx-doc.org/en/master/development/tutorials/todo.html +from __future__ import annotations + +from typing import cast from docutils import nodes +from docutils.nodes import Element from docutils.parsers.rst import Directive -from sphinx.locale import get_translation -from sphinx.util.docutils import SphinxDirective - -translator = get_translation("sphinx") +from docutils.parsers.rst.directives.admonitions import BaseAdmonition +from sphinx import addnodes +from sphinx.domains import Domain +from sphinx.errors import NoUri +from sphinx.locale import _ as get_translation_sphinx +from sphinx.util.docutils import SphinxDirective, new_document class PandasCompat(nodes.Admonition, nodes.Element): @@ -32,7 +38,7 @@ def run(self): return [PandasCompatList("")] -class PandasCompatDirective(SphinxDirective): +class PandasCompatDirective(BaseAdmonition, SphinxDirective): # this enables content in the directive has_content = True @@ -43,9 +49,11 @@ def run(self): PandasCompat_node = PandasCompat("\n".join(self.content)) PandasCompat_node += nodes.title( - translator("Pandas Compatibility Note"), - translator("Pandas Compatibility Note"), + get_translation_sphinx("Pandas Compatibility Note"), + get_translation_sphinx("Pandas Compatibility Note"), ) + PandasCompat_node["docname"] = self.env.docname + PandasCompat_node["target"] = targetnode self.state.nested_parse( self.content, self.content_offset, PandasCompat_node ) @@ -84,71 +92,104 @@ def merge_PandasCompats(app, env, docnames, other): ) -def process_PandasCompat_nodes(app, doctree, fromdocname): - if not app.config.include_pandas_compat: - for node in doctree.traverse(PandasCompat): - node.parent.remove(node) +class PandasCompatDomain(Domain): + name = "pandascompat" + label = "pandascompat" - # Replace all PandasCompatList nodes with a list of the collected - # PandasCompats. Augment each PandasCompat with a backlink to the - # original location. 
- env = app.builder.env + @property + def pandascompats(self): + return self.data.setdefault("pandascompats", {}) - if not hasattr(env, "PandasCompat_all_pandas_compat"): - env.PandasCompat_all_pandas_compat = [] + def clear_doc(self, docname): + self.pandascompats.pop(docname, None) + + def merge_domaindata(self, docnames, otherdata): + for docname in docnames: + self.pandascompats[docname] = otherdata["pandascompats"][docname] + + def process_doc(self, env, docname, document): + pandascompats = self.pandascompats.setdefault(docname, []) + for pandascompat in document.findall(PandasCompat): + env.app.emit("pandascompat-defined", pandascompat) + pandascompats.append(pandascompat) - for node in doctree.traverse(PandasCompatList): - if not app.config.include_pandas_compat: - node.replace_self([]) - continue - content = [] +class PandasCompatListProcessor: + def __init__(self, app, doctree, docname): + self.builder = app.builder + self.config = app.config + self.env = app.env + self.domain = cast(PandasCompatDomain, app.env.get_domain("pandascompat")) + self.document = new_document("") + self.process(doctree, docname) - for PandasCompat_info in env.PandasCompat_all_pandas_compat: - para = nodes.paragraph() + def process(self, doctree: nodes.document, docname: str) -> None: + pandascompats = [v for vals in self.domain.pandascompats.values() for v in vals] + for node in doctree.findall(PandasCompatList): + if not self.config.include_pandas_compat: + node.parent.remove(node) + continue - # Create a reference back to the original docstring - newnode = nodes.reference("", "") - innernode = nodes.emphasis( - translator("[source]"), translator("[source]") - ) - newnode["refdocname"] = PandasCompat_info["docname"] - newnode["refuri"] = app.builder.get_relative_uri( - fromdocname, PandasCompat_info["docname"] - ) - newnode["refuri"] += "#" + PandasCompat_info["target"]["refid"] - newnode.append(innernode) - para += newnode + content: list[Element | None] = [nodes.target()] if node.get("ids") else [] - # Insert the reference node into PandasCompat node - # Note that this node is a deepcopy from the original copy - # in the docstring, so changing this does not affect that in the - # doc. - PandasCompat_info["PandasCompat"].append(para) + for pandascompat in pandascompats: + # Create a copy of the pandascompat node + new_pandascompat = pandascompat.deepcopy() + new_pandascompat["ids"].clear() - # Insert the PandasCompand node into the PandasCompatList Node - content.append(PandasCompat_info["PandasCompat"]) + self.resolve_reference(new_pandascompat, docname) + content.append(new_pandascompat) - node.replace_self(content) + ref = self.create_reference(pandascompat, docname) + content.append(ref) + + node.replace_self(content) + + def create_reference(self, pandascompat, docname): + para = nodes.paragraph() + newnode = nodes.reference("", "") + innernode = nodes.emphasis( + get_translation_sphinx("[source]"), get_translation_sphinx("[source]") + ) + newnode["refdocname"] = pandascompat["docname"] + try: + newnode["refuri"] = self.builder.get_relative_uri( + docname, pandascompat["docname"] + ) + "#" + pandascompat["target"]["refid"] + except NoUri: + # ignore if no URI can be determined, e.g. 
for LaTeX output + pass + newnode.append(innernode) + para += newnode + return para + + def resolve_reference(self, todo, docname: str) -> None: + """Resolve references in the todo content.""" + for node in todo.findall(addnodes.pending_xref): + if "refdoc" in node: + node["refdoc"] = docname + + # Note: To resolve references, it is needed to wrap it with document node + self.document += todo + self.env.resolve_references(self.document, docname, self.builder) + self.document.remove(todo) def setup(app): app.add_config_value("include_pandas_compat", False, "html") - app.add_node(PandasCompatList) app.add_node( PandasCompat, html=(visit_PandasCompat_node, depart_PandasCompat_node), latex=(visit_PandasCompat_node, depart_PandasCompat_node), text=(visit_PandasCompat_node, depart_PandasCompat_node), + man=(visit_PandasCompat_node, depart_PandasCompat_node), + texinfo=(visit_PandasCompat_node, depart_PandasCompat_node), ) - - # Sphinx directives are lower-cased app.add_directive("pandas-compat", PandasCompatDirective) app.add_directive("pandas-compat-list", PandasCompatListDirective) - app.connect("doctree-resolved", process_PandasCompat_nodes) - app.connect("env-purge-doc", purge_PandasCompats) - app.connect("env-merge-info", merge_PandasCompats) + app.add_domain(PandasCompatDomain) + app.connect("doctree-resolved", PandasCompatListProcessor) return { "version": "0.1", diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py index 73d8b4445d3..e9c760e288e 100644 --- a/docs/cudf/source/conf.py +++ b/docs/cudf/source/conf.py @@ -617,6 +617,8 @@ def linkcode_resolve(domain, info) -> str | None: f"branch-{version}/python/cudf/cudf/{fn}{linespec}" ) +# Needed for avoid build warning for PandasCompat extension +suppress_warnings = ["myst.domains"] def setup(app): app.add_css_file("https://docs.rapids.ai/assets/css/custom.css") From 570df6c5fbb0a2120b539aba0a65702c2190527f Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 10 Jun 2024 15:24:40 -1000 Subject: [PATCH 074/340] Add typing to single_column_frame (#15965) Also removes an extra copy from `.flatten()` when calling `.values` or `.values_host` Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Thomas Li (https://github.com/lithomas1) URL: https://github.com/rapidsai/cudf/pull/15965 --- python/cudf/cudf/api/types.py | 7 ++- python/cudf/cudf/core/column/column.py | 4 +- python/cudf/cudf/core/single_column_frame.py | 58 ++++++++------------ 3 files changed, 29 insertions(+), 40 deletions(-) diff --git a/python/cudf/cudf/api/types.py b/python/cudf/cudf/api/types.py index 417d8b0922a..42b1524bd76 100644 --- a/python/cudf/cudf/api/types.py +++ b/python/cudf/cudf/api/types.py @@ -8,7 +8,7 @@ from collections import abc from functools import wraps from inspect import isclass -from typing import List, Union +from typing import List, Union, cast import cupy as cp import numpy as np @@ -238,7 +238,10 @@ def _union_categoricals( raise TypeError("ignore_order is not yet implemented") result_col = cudf.core.column.CategoricalColumn._concat( - [obj._column for obj in to_union] + [ + cast(cudf.core.column.CategoricalColumn, obj._column) + for obj in to_union + ] ) if sort_categories: sorted_categories = result_col.categories.sort_values(ascending=True) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index f87797a1fa3..7abdbc85720 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -212,7 
+212,7 @@ def to_pandas( return pd.Index(pa_array.to_pandas()) @property - def values_host(self) -> "np.ndarray": + def values_host(self) -> np.ndarray: """ Return a numpy representation of the Column. """ @@ -226,7 +226,7 @@ def values_host(self) -> "np.ndarray": return self.data_array_view(mode="read").copy_to_host() @property - def values(self) -> "cupy.ndarray": + def values(self) -> cupy.ndarray: """ Return a CuPy representation of the Column. """ diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py index d864b563208..acc74129a29 100644 --- a/python/cudf/cudf/core/single_column_frame.py +++ b/python/cudf/cudf/core/single_column_frame.py @@ -7,9 +7,11 @@ import cupy import numpy +import pyarrow as pa +from typing_extensions import Self import cudf -from cudf._typing import Dtype, NotImplementedType, ScalarLike +from cudf._typing import NotImplementedType, ScalarLike from cudf.api.extensions import no_default from cudf.api.types import ( _is_scalar_or_zero_d_array, @@ -27,8 +29,8 @@ class SingleColumnFrame(Frame, NotIterable): """A one-dimensional frame. - Frames with only a single column share certain logic that is encoded in - this class. + Frames with only a single column (Index or Series) + share certain logic that is encoded in this class. """ _SUPPORT_AXIS_LOOKUP = { @@ -47,7 +49,7 @@ def _reduce( if axis not in (None, 0, no_default): raise NotImplementedError("axis parameter is not implemented yet") - if numeric_only and not is_numeric_dtype(self._column): + if numeric_only and not is_numeric_dtype(self.dtype): raise TypeError( f"Series.{op} does not allow numeric_only={numeric_only} " "with non-numeric dtypes." @@ -68,7 +70,7 @@ def _scan(self, op, axis=None, *args, **kwargs): @_cudf_nvtx_annotate def name(self): """Get the name of this object.""" - return next(iter(self._data.names)) + return next(iter(self._column_names)) @name.setter # type: ignore @_cudf_nvtx_annotate @@ -83,7 +85,7 @@ def ndim(self) -> int: # noqa: D401 @property # type: ignore @_cudf_nvtx_annotate - def shape(self): + def shape(self) -> tuple[int]: """Get a tuple representing the dimensionality of the Index.""" return (len(self),) @@ -95,45 +97,27 @@ def __bool__(self): @property # type: ignore @_cudf_nvtx_annotate - def _num_columns(self): + def _num_columns(self) -> int: return 1 @property # type: ignore @_cudf_nvtx_annotate - def _column(self): - return self._data[self.name] + def _column(self) -> ColumnBase: + return next(iter(self._columns)) @property # type: ignore @_cudf_nvtx_annotate - def values(self): # noqa: D102 + def values(self) -> cupy.ndarray: # noqa: D102 return self._column.values @property # type: ignore @_cudf_nvtx_annotate - def values_host(self): # noqa: D102 + def values_host(self) -> numpy.ndarray: # noqa: D102 return self._column.values_host - @_cudf_nvtx_annotate - def to_cupy( - self, - dtype: Union[Dtype, None] = None, - copy: bool = True, - na_value=None, - ) -> cupy.ndarray: # noqa: D102 - return super().to_cupy(dtype, copy, na_value).flatten() - - @_cudf_nvtx_annotate - def to_numpy( - self, - dtype: Union[Dtype, None] = None, - copy: bool = True, - na_value=None, - ) -> numpy.ndarray: # noqa: D102 - return super().to_numpy(dtype, copy, na_value).flatten() - @classmethod @_cudf_nvtx_annotate - def from_arrow(cls, array): + def from_arrow(cls, array) -> Self: """Create from PyArrow Array/ChunkedArray. 
Parameters
@@ -164,7 +148,7 @@ def from_arrow(cls, array):
         return cls(ColumnBase.from_arrow(array))
 
     @_cudf_nvtx_annotate
-    def to_arrow(self):
+    def to_arrow(self) -> pa.Array:
         """
         Convert to a PyArrow Array.
 
@@ -196,7 +180,7 @@ def to_arrow(self):
 
     @property  # type: ignore
     @_cudf_nvtx_annotate
-    def is_unique(self):
+    def is_unique(self) -> bool:
         """Return boolean if values in the object are unique.
 
         Returns
@@ -207,7 +191,7 @@ def is_unique(self):
 
     @property  # type: ignore
     @_cudf_nvtx_annotate
-    def is_monotonic_increasing(self):
+    def is_monotonic_increasing(self) -> bool:
         """Return boolean if values in the object are monotonically increasing.
 
         Returns
@@ -218,7 +202,7 @@ def is_monotonic_increasing(self):
 
     @property  # type: ignore
     @_cudf_nvtx_annotate
-    def is_monotonic_decreasing(self):
+    def is_monotonic_decreasing(self) -> bool:
         """Return boolean if values in the object are monotonically decreasing.
 
         Returns
@@ -243,7 +227,9 @@ def __cuda_array_interface__(self):
         )
 
     @_cudf_nvtx_annotate
-    def factorize(self, sort=False, use_na_sentinel=True):
+    def factorize(
+        self, sort: bool = False, use_na_sentinel: bool = True
+    ) -> tuple[cupy.ndarray, cudf.Index]:
         """Encode the input values as integer labels.
 
         Parameters
@@ -335,7 +321,7 @@ def _make_operands_for_binop(
         return {result_name: (self._column, other, reflect, fill_value)}
 
     @_cudf_nvtx_annotate
-    def nunique(self, dropna: bool = True):
+    def nunique(self, dropna: bool = True) -> int:
         """
         Return count of unique values for the column.
 

From 1bd210d76ab05c669aea230b9287b76a03328efa Mon Sep 17 00:00:00 2001
From: Ben Jarmak <104460670+jarmak-nv@users.noreply.github.com>
Date: Mon, 10 Jun 2024 21:35:46 -0400
Subject: [PATCH 075/340] Add external issue label and project automation
 (#15945)

This PR creates two new GitHub Actions around issue and PR tracking.

### `external_issue_labeler.yml`

This action automatically adds a label, currently `External`, to any issue
or PR that is opened by someone who is not an owner, member, or
collaborator of the cuDF repo.

### `pr_issue_status_automation.yml`

This action uses the [shared
workflows](https://github.com/rapidsai/shared-workflows/tree/branch-24.08/.github/workflows)
in rapidsai/shared-workflows to, on open/edit/synchronize of an open PR:

1. Set the PR to `in progress`
2. Set all linked issues to `in progress`
3. Set the PR's sprint to the current iteration
4. Set all linked issues to the current iteration

Edit triggers on edit of the PR description (so new linked issues get
synchronized to `in progress`). Synchronize triggers on push and rebase
events; this covers the "what are we working on right now" case, because
anything we touch goes into the current sprint in the project.
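For intuition, the gate both labeler jobs apply boils down to a single
author-association check. Below is a minimal Python sketch of that same
check expressed against the GitHub REST API (illustrative only: the
workflow itself shells out to `gh`, and the `requests` usage and helper
name here are not part of this PR):

```python
# Sketch of the labeler's gating logic via the GitHub REST API.
# Assumes a token with permission to edit issues; not part of this PR.
import os

import requests

RAPIDS_ASSOCIATIONS = {"OWNER", "MEMBER", "COLLABORATOR"}


def label_if_external(repo: str, number: int, token: str) -> None:
    headers = {
        "Authorization": f"Bearer {token}",
        "Accept": "application/vnd.github+json",
    }
    # PRs are issues in the REST API, so one code path covers both.
    issue = requests.get(
        f"https://api.github.com/repos/{repo}/issues/{number}",
        headers=headers,
    ).json()
    if issue["author_association"] not in RAPIDS_ASSOCIATIONS:
        requests.post(
            f"https://api.github.com/repos/{repo}/issues/{number}/labels",
            headers=headers,
            json={"labels": ["External"]},
        )


if __name__ == "__main__":
    label_if_external("rapidsai/cudf", 15945, os.environ["GITHUB_TOKEN"])
```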
Authors:
  - Ben Jarmak (https://github.com/jarmak-nv)
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/15945
---
 .github/workflows/external_issue_labeler.yml  | 55 ++++++++++++++++
 .../workflows/pr_issue_status_automation.yml  | 64 +++++++++++++++++++
 2 files changed, 119 insertions(+)
 create mode 100644 .github/workflows/external_issue_labeler.yml
 create mode 100644 .github/workflows/pr_issue_status_automation.yml

diff --git a/.github/workflows/external_issue_labeler.yml b/.github/workflows/external_issue_labeler.yml
new file mode 100644
index 00000000000..e6d987e9f34
--- /dev/null
+++ b/.github/workflows/external_issue_labeler.yml
@@ -0,0 +1,55 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: Label external issues and PRs
+
+on:
+  issues:
+    types:
+      - opened
+
+  pull_request:
+    types:
+      - opened
+
+env:
+  GITHUB_TOKEN: ${{ github.token }}
+
+permissions:
+  issues: write
+  pull-requests: write
+
+jobs:
+  Label-Issue:
+    runs-on: ubuntu-latest
+    # Only run if the issue author is not part of RAPIDS
+    if: ${{ ! contains(fromJSON('["OWNER", "MEMBER", "COLLABORATOR"]'), github.event.issue.author_association)}}
+    steps:
+      - name: add-external-labels
+        run: |
+          issue_url=${{ github.event.issue.html_url }}
+          gh issue edit ${issue_url} --add-label "External"
+        continue-on-error: true
+
+  Label-PR:
+    runs-on: ubuntu-latest
+    # Only run if the PR author is not part of RAPIDS
+    if: ${{ ! contains(fromJSON('["OWNER", "MEMBER", "COLLABORATOR"]'), github.event.pull_request.author_association)}}
+    steps:
+      - name: add-external-labels
+        run: |
+          pr_url=${{ github.event.pull_request.html_url }}
+          gh pr edit ${pr_url} --add-label "External"
+        continue-on-error: true
diff --git a/.github/workflows/pr_issue_status_automation.yml b/.github/workflows/pr_issue_status_automation.yml
new file mode 100644
index 00000000000..aaece1bfa3e
--- /dev/null
+++ b/.github/workflows/pr_issue_status_automation.yml
@@ -0,0 +1,64 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +name: Set PR and Issue Project Fields + +on: + pull_request_target: + # This job runs when a PR is first opened, or it is updated + # Only runs if the PR is open (we don't want to update the status of a closed PR) + types: [opened, edited, synchronize] + +jobs: + get-project-id: + uses: rapidsai/shared-workflows/.github/workflows/project-get-item-id.yaml@branch-24.08 + if: github.event.pull_request.state == 'open' + secrets: inherit + permissions: + contents: read + with: + PROJECT_ID: "PVT_kwDOAp2shc4AiNzl" + ITEM_NODE_ID: "${{ github.event.pull_request.node_id }}" + + update-status: + # This job sets the PR and its linked issues to "In Progress" status + uses: rapidsai/shared-workflows/.github/workflows/project-get-set-single-select-field.yaml@branch-24.08 + if: github.event.pull_request.state == 'open' + needs: get-project-id + with: + PROJECT_ID: "PVT_kwDOAp2shc4AiNzl" + SINGLE_SELECT_FIELD_ID: "PVTSSF_lADOAp2shc4AiNzlzgaxNac" + SINGLE_SELECT_FIELD_NAME: "Status" + SINGLE_SELECT_OPTION_VALUE: "In Progress" + ITEM_PROJECT_ID: "${{ needs.get-project-id.outputs.ITEM_PROJECT_ID }}" + ITEM_NODE_ID: "${{ github.event.pull_request.node_id }}" + UPDATE_ITEM: true + UPDATE_LINKED_ISSUES: true + secrets: inherit + + update-sprint: + # This job sets the PR and its linked issues to the current "Weekly Sprint" + uses: jarmak-nv/shared-workflows/.github/workflows/project-get-set-iteration-field.yaml@branch-24.08 + if: github.event.pull_request.state == 'open' + needs: get-project-id + with: + PROJECT_ID: "PVT_kwDOAp2shc4AiNzl" + ITERATION_FIELD_ID: "PVTIF_lADOAp2shc4AiNzlzgbU_po" + ITERATION_FIELD_NAME: "Weekly Sprint" + ITEM_PROJECT_ID: "${{ needs.get-project-id.outputs.ITEM_PROJECT_ID }}" + ITEM_NODE_ID: "${{ github.event.pull_request.node_id }}" + UPDATE_ITEM: true + UPDATE_LINKED_ISSUES: true + secrets: inherit From ff1e4bb82ce4ab8ac54bc8715bf761a3700024bc Mon Sep 17 00:00:00 2001 From: Srinivas Yadav <43375352+srinivasyadav18@users.noreply.github.com> Date: Mon, 10 Jun 2024 19:34:00 -0700 Subject: [PATCH 076/340] Migrate left join and conditional join benchmarks to use nvbench (#15931) The current [left join](https://github.com/rapidsai/cudf/blob/580ee40bf5fe1a66eaba914cdddb718a09193bab/cpp/benchmarks/join/left_join.cu) and [conditional join](https://github.com/rapidsai/cudf/blob/580ee40bf5fe1a66eaba914cdddb718a09193bab/cpp/benchmarks/join/conditional_join.cu) benchmarks are still using gbench. This PR migrates the **left join** and **conditional join** benchmarks to use **nvbench**. Closes #15699. 
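As a rough illustration of what the port buys: each benchmark's case list
becomes the Cartesian product of its nvbench axes minus the skipped cases,
instead of a hand-maintained list of `->Args(...)` calls. A back-of-the-envelope
sketch in plain Python (the axis values here are assumptions for illustration,
not the actual `JOIN_*_RANGE` definitions):

```python
# Enumerate the benchmark cases an axis product generates. The concrete
# values below are assumptions for illustration, not the real ranges.
from itertools import product

key_types = ["int32_t", "int64_t"]  # stands in for JOIN_KEY_TYPE_RANGE
nullable = [False, True]  # stands in for JOIN_NULLABLE_RANGE
sizes = [100_000, 10_000_000, 100_000_000]  # stands in for JOIN_SIZE_RANGE

cases = [
    (key, null, left, right)
    for key, null, left, right in product(key_types, nullable, sizes, sizes)
    # mirrors the "Skip large right table" guard in join_common.hpp
    if right <= left
]
print(len(cases))  # 2 * 2 * 6 = 24 cases per benchmark
```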
- [x] Migrate from gbench to nvbench - [x] Similar to #15644, use `JOIN_KEY_TYPE_RANGE`, `JOIN_NULLABLE_RANGE` and `JOIN_SIZE_RANGE` to reduce the number of test cases and simplify the implementation - [x] Get rid of the dispatching between gbench and nvbench in [join_common.hpp](https://github.com/rapidsai/cudf/blob/580ee40bf5fe1a66eaba914cdddb718a09193bab/cpp/benchmarks/join/join_common.hpp) Authors: - Srinivas Yadav (https://github.com/srinivasyadav18) - Yunsong Wang (https://github.com/PointKernel) Approvers: - Yunsong Wang (https://github.com/PointKernel) - Shruti Shivakumar (https://github.com/shrshi) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/15931 --- cpp/benchmarks/CMakeLists.txt | 6 +- cpp/benchmarks/join/conditional_join.cu | 288 ++++-------------------- cpp/benchmarks/join/join_common.hpp | 99 +++----- cpp/benchmarks/join/left_join.cu | 152 ++++--------- 4 files changed, 116 insertions(+), 429 deletions(-) diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 10f645dfec0..49504e53424 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -163,8 +163,10 @@ ConfigureNVBench( # ################################################################################################## # * join benchmark -------------------------------------------------------------------------------- -ConfigureBench(JOIN_BENCH join/left_join.cu join/conditional_join.cu) -ConfigureNVBench(JOIN_NVBENCH join/join.cu join/mixed_join.cu join/distinct_join.cu) +ConfigureNVBench( + JOIN_NVBENCH join/left_join.cu join/conditional_join.cu join/join.cu join/mixed_join.cu + join/distinct_join.cu +) # ################################################################################################## # * iterator benchmark ---------------------------------------------------------------------------- diff --git a/cpp/benchmarks/join/conditional_join.cu b/cpp/benchmarks/join/conditional_join.cu index d95fc0a5b59..e332d09d31b 100644 --- a/cpp/benchmarks/join/conditional_join.cu +++ b/cpp/benchmarks/join/conditional_join.cu @@ -14,250 +14,44 @@ * limitations under the License. */ -#include - -template -class ConditionalJoin : public cudf::benchmark {}; - -// For compatibility with the shared logic for equality (hash) joins, all of -// the join lambdas defined by these macros accept a null_equality parameter -// but ignore it (don't forward it to the underlying join implementation) -// because conditional joins do not use this parameter. 
-#define CONDITIONAL_INNER_JOIN_BENCHMARK_DEFINE(name, Key, Nullable) \ - BENCHMARK_TEMPLATE_DEFINE_F(ConditionalJoin, name, Key) \ - (::benchmark::State & st) \ - { \ - auto join = [](cudf::table_view const& left, \ - cudf::table_view const& right, \ - cudf::ast::operation binary_pred, \ - cudf::null_equality compare_nulls) { \ - return cudf::conditional_inner_join(left, right, binary_pred); \ - }; \ - BM_join(st, join); \ - } - -CONDITIONAL_INNER_JOIN_BENCHMARK_DEFINE(conditional_inner_join_32bit, int32_t, false); -CONDITIONAL_INNER_JOIN_BENCHMARK_DEFINE(conditional_inner_join_64bit, int64_t, false); -CONDITIONAL_INNER_JOIN_BENCHMARK_DEFINE(conditional_inner_join_32bit_nulls, int32_t, true); -CONDITIONAL_INNER_JOIN_BENCHMARK_DEFINE(conditional_inner_join_64bit_nulls, int64_t, true); - -#define CONDITIONAL_LEFT_JOIN_BENCHMARK_DEFINE(name, Key, Nullable) \ - BENCHMARK_TEMPLATE_DEFINE_F(ConditionalJoin, name, Key) \ - (::benchmark::State & st) \ - { \ - auto join = [](cudf::table_view const& left, \ - cudf::table_view const& right, \ - cudf::ast::operation binary_pred, \ - cudf::null_equality compare_nulls) { \ - return cudf::conditional_left_join(left, right, binary_pred); \ - }; \ - BM_join(st, join); \ - } - -CONDITIONAL_LEFT_JOIN_BENCHMARK_DEFINE(conditional_left_join_32bit, int32_t, false); -CONDITIONAL_LEFT_JOIN_BENCHMARK_DEFINE(conditional_left_join_64bit, int64_t, false); -CONDITIONAL_LEFT_JOIN_BENCHMARK_DEFINE(conditional_left_join_32bit_nulls, int32_t, true); -CONDITIONAL_LEFT_JOIN_BENCHMARK_DEFINE(conditional_left_join_64bit_nulls, int64_t, true); - -#define CONDITIONAL_FULL_JOIN_BENCHMARK_DEFINE(name, Key, Nullable) \ - BENCHMARK_TEMPLATE_DEFINE_F(ConditionalJoin, name, Key) \ - (::benchmark::State & st) \ - { \ - auto join = [](cudf::table_view const& left, \ - cudf::table_view const& right, \ - cudf::ast::operation binary_pred, \ - cudf::null_equality compare_nulls) { \ - return cudf::conditional_full_join(left, right, binary_pred); \ - }; \ - BM_join(st, join); \ - } - -CONDITIONAL_FULL_JOIN_BENCHMARK_DEFINE(conditional_full_join_32bit, int32_t, false); -CONDITIONAL_FULL_JOIN_BENCHMARK_DEFINE(conditional_full_join_64bit, int64_t, false); -CONDITIONAL_FULL_JOIN_BENCHMARK_DEFINE(conditional_full_join_32bit_nulls, int32_t, true); -CONDITIONAL_FULL_JOIN_BENCHMARK_DEFINE(conditional_full_join_64bit_nulls, int64_t, true); - -#define CONDITIONAL_LEFT_ANTI_JOIN_BENCHMARK_DEFINE(name, Key, Nullable) \ - BENCHMARK_TEMPLATE_DEFINE_F(ConditionalJoin, name, Key) \ - (::benchmark::State & st) \ - { \ - auto join = [](cudf::table_view const& left, \ - cudf::table_view const& right, \ - cudf::ast::operation binary_pred, \ - cudf::null_equality compare_nulls) { \ - return cudf::conditional_left_anti_join(left, right, binary_pred); \ - }; \ - BM_join(st, join); \ - } - -CONDITIONAL_LEFT_ANTI_JOIN_BENCHMARK_DEFINE(conditional_left_anti_join_32bit, int32_t, false); -CONDITIONAL_LEFT_ANTI_JOIN_BENCHMARK_DEFINE(conditional_left_anti_join_64bit, int64_t, false); -CONDITIONAL_LEFT_ANTI_JOIN_BENCHMARK_DEFINE(conditional_left_anti_join_32bit_nulls, int32_t, true); -CONDITIONAL_LEFT_ANTI_JOIN_BENCHMARK_DEFINE(conditional_left_anti_join_64bit_nulls, int64_t, true); - -#define CONDITIONAL_LEFT_SEMI_JOIN_BENCHMARK_DEFINE(name, Key, Nullable) \ - BENCHMARK_TEMPLATE_DEFINE_F(ConditionalJoin, name, Key) \ - (::benchmark::State & st) \ - { \ - auto join = [](cudf::table_view const& left, \ - cudf::table_view const& right, \ - cudf::ast::operation binary_pred, \ - cudf::null_equality compare_nulls) { \ - 
return cudf::conditional_left_semi_join(left, right, binary_pred); \ - }; \ - BM_join(st, join); \ - } - -CONDITIONAL_LEFT_SEMI_JOIN_BENCHMARK_DEFINE(conditional_left_semi_join_32bit, int32_t, false); -CONDITIONAL_LEFT_SEMI_JOIN_BENCHMARK_DEFINE(conditional_left_semi_join_64bit, int64_t, false); -CONDITIONAL_LEFT_SEMI_JOIN_BENCHMARK_DEFINE(conditional_left_semi_join_32bit_nulls, int32_t, true); -CONDITIONAL_LEFT_SEMI_JOIN_BENCHMARK_DEFINE(conditional_left_semi_join_64bit_nulls, int64_t, true); - -// inner join ----------------------------------------------------------------------- -BENCHMARK_REGISTER_F(ConditionalJoin, conditional_inner_join_32bit) - ->Unit(benchmark::kMillisecond) - ->Args({100'000, 100'000}) - ->Args({100'000, 400'000}) - ->Args({400'000, 100'000}) - ->Args({100'000, 1'000'000}) - ->UseManualTime(); - -BENCHMARK_REGISTER_F(ConditionalJoin, conditional_inner_join_64bit) - ->Unit(benchmark::kMillisecond) - ->Args({100'000, 100'000}) - ->Args({100'000, 400'000}) - ->Args({400'000, 100'000}) - ->Args({100'000, 1'000'000}) - ->UseManualTime(); - -BENCHMARK_REGISTER_F(ConditionalJoin, conditional_inner_join_32bit_nulls) - ->Unit(benchmark::kMillisecond) - ->Args({100'000, 100'000}) - ->Args({100'000, 400'000}) - ->Args({400'000, 100'000}) - ->Args({100'000, 1'000'000}) - ->UseManualTime(); - -BENCHMARK_REGISTER_F(ConditionalJoin, conditional_inner_join_64bit_nulls) - ->Unit(benchmark::kMillisecond) - ->Args({100'000, 100'000}) - ->Args({100'000, 400'000}) - ->Args({400'000, 100'000}) - ->Args({100'000, 1'000'000}) - ->UseManualTime(); - -// left join ----------------------------------------------------------------------- -BENCHMARK_REGISTER_F(ConditionalJoin, conditional_left_join_32bit) - ->Unit(benchmark::kMillisecond) - ->Args({100'000, 100'000}) - ->Args({100'000, 400'000}) - ->Args({100'000, 1'000'000}) - ->UseManualTime(); - -BENCHMARK_REGISTER_F(ConditionalJoin, conditional_left_join_64bit) - ->Unit(benchmark::kMillisecond) - ->Args({100'000, 100'000}) - ->Args({100'000, 400'000}) - ->Args({100'000, 1'000'000}) - ->UseManualTime(); - -BENCHMARK_REGISTER_F(ConditionalJoin, conditional_left_join_32bit_nulls) - ->Unit(benchmark::kMillisecond) - ->Args({100'000, 100'000}) - ->Args({100'000, 400'000}) - ->Args({100'000, 1'000'000}) - ->UseManualTime(); - -BENCHMARK_REGISTER_F(ConditionalJoin, conditional_left_join_64bit_nulls) - ->Unit(benchmark::kMillisecond) - ->Args({100'000, 100'000}) - ->Args({100'000, 400'000}) - ->Args({100'000, 1'000'000}) - ->UseManualTime(); - -// full join ----------------------------------------------------------------------- -BENCHMARK_REGISTER_F(ConditionalJoin, conditional_full_join_32bit) - ->Unit(benchmark::kMillisecond) - ->Args({100'000, 100'000}) - ->Args({100'000, 400'000}) - ->Args({100'000, 1'000'000}) - ->UseManualTime(); - -BENCHMARK_REGISTER_F(ConditionalJoin, conditional_full_join_64bit) - ->Unit(benchmark::kMillisecond) - ->Args({100'000, 100'000}) - ->Args({100'000, 400'000}) - ->Args({100'000, 1'000'000}) - ->UseManualTime(); - -BENCHMARK_REGISTER_F(ConditionalJoin, conditional_full_join_32bit_nulls) - ->Unit(benchmark::kMillisecond) - ->Args({100'000, 100'000}) - ->Args({100'000, 400'000}) - ->Args({100'000, 1'000'000}) - ->UseManualTime(); - -BENCHMARK_REGISTER_F(ConditionalJoin, conditional_full_join_64bit_nulls) - ->Unit(benchmark::kMillisecond) - ->Args({100'000, 100'000}) - ->Args({100'000, 400'000}) - ->Args({100'000, 1'000'000}) - ->UseManualTime(); - -// left anti-join 
------------------------------------------------------------- -BENCHMARK_REGISTER_F(ConditionalJoin, conditional_left_anti_join_32bit) - ->Unit(benchmark::kMillisecond) - ->Args({100'000, 100'000}) - ->Args({100'000, 400'000}) - ->Args({100'000, 1'000'000}) - ->UseManualTime(); - -BENCHMARK_REGISTER_F(ConditionalJoin, conditional_left_anti_join_64bit) - ->Unit(benchmark::kMillisecond) - ->Args({100'000, 100'000}) - ->Args({100'000, 400'000}) - ->Args({100'000, 1'000'000}) - ->UseManualTime(); - -BENCHMARK_REGISTER_F(ConditionalJoin, conditional_left_anti_join_32bit_nulls) - ->Unit(benchmark::kMillisecond) - ->Args({100'000, 100'000}) - ->Args({100'000, 400'000}) - ->Args({100'000, 1'000'000}) - ->UseManualTime(); - -BENCHMARK_REGISTER_F(ConditionalJoin, conditional_left_anti_join_64bit_nulls) - ->Unit(benchmark::kMillisecond) - ->Args({100'000, 100'000}) - ->Args({100'000, 400'000}) - ->Args({100'000, 1'000'000}) - ->UseManualTime(); - -// left semi-join ------------------------------------------------------------- -BENCHMARK_REGISTER_F(ConditionalJoin, conditional_left_semi_join_32bit) - ->Unit(benchmark::kMillisecond) - ->Args({100'000, 100'000}) - ->Args({100'000, 400'000}) - ->Args({100'000, 1'000'000}) - ->UseManualTime(); - -BENCHMARK_REGISTER_F(ConditionalJoin, conditional_left_semi_join_64bit) - ->Unit(benchmark::kMillisecond) - ->Args({100'000, 100'000}) - ->Args({100'000, 400'000}) - ->Args({100'000, 1'000'000}) - ->UseManualTime(); - -BENCHMARK_REGISTER_F(ConditionalJoin, conditional_left_semi_join_32bit_nulls) - ->Unit(benchmark::kMillisecond) - ->Args({100'000, 100'000}) - ->Args({100'000, 400'000}) - ->Args({100'000, 1'000'000}) - ->UseManualTime(); - -BENCHMARK_REGISTER_F(ConditionalJoin, conditional_left_semi_join_64bit_nulls) - ->Unit(benchmark::kMillisecond) - ->Args({100'000, 100'000}) - ->Args({100'000, 400'000}) - ->Args({100'000, 1'000'000}) - ->UseManualTime(); +#include "join_common.hpp" + +template +void nvbench_conditional_inner_join(nvbench::state& state, + nvbench::type_list>) +{ + auto join = [](cudf::table_view const& left, + cudf::table_view const& right, + cudf::ast::operation binary_pred, + cudf::null_equality compare_nulls) { + return cudf::conditional_inner_join(left, right, binary_pred); + }; + BM_join(state, join); +} + +template +void nvbench_conditional_left_join(nvbench::state& state, + nvbench::type_list>) +{ + auto join = [](cudf::table_view const& left, + cudf::table_view const& right, + cudf::ast::operation binary_pred, + cudf::null_equality compare_nulls) { + return cudf::conditional_left_join(left, right, binary_pred); + }; + BM_join(state, join); +} + +NVBENCH_BENCH_TYPES(nvbench_conditional_inner_join, + NVBENCH_TYPE_AXES(JOIN_KEY_TYPE_RANGE, JOIN_NULLABLE_RANGE)) + .set_name("conditional_inner_join") + .set_type_axes_names({"Key", "Nullable"}) + .add_int64_axis("left_size", JOIN_SIZE_RANGE) + .add_int64_axis("right_size", JOIN_SIZE_RANGE); + +NVBENCH_BENCH_TYPES(nvbench_conditional_left_join, + NVBENCH_TYPE_AXES(JOIN_KEY_TYPE_RANGE, JOIN_NULLABLE_RANGE)) + .set_name("conditional_left_join") + .set_type_axes_names({"Key", "Nullable"}) + .add_int64_axis("left_size", JOIN_SIZE_RANGE) + .add_int64_axis("right_size", JOIN_SIZE_RANGE); diff --git a/cpp/benchmarks/join/join_common.hpp b/cpp/benchmarks/join/join_common.hpp index e6792b9dbfb..3d9d9c57548 100644 --- a/cpp/benchmarks/join/join_common.hpp +++ b/cpp/benchmarks/join/join_common.hpp @@ -19,7 +19,6 @@ #include "generate_input_tables.cuh" #include -#include #include #include @@ -67,28 +66,12 
@@ template void BM_join(state_type& state, Join JoinFunc) { - auto const right_size = [&]() { - if constexpr (std::is_same_v) { - return static_cast(state.range(0)); - } - if constexpr (std::is_same_v) { - return static_cast(state.get_int64("right_size")); - } - }(); - auto const left_size = [&]() { - if constexpr (std::is_same_v) { - return static_cast(state.range(1)); - } - if constexpr (std::is_same_v) { - return static_cast(state.get_int64("left_size")); - } - }(); + auto const right_size = static_cast(state.get_int64("right_size")); + auto const left_size = static_cast(state.get_int64("left_size")); - if constexpr (std::is_same_v) { - if (right_size > left_size) { - state.skip("Skip large right table"); - return; - } + if (right_size > left_size) { + state.skip("Skip large right table"); + return; } double const selectivity = 0.3; @@ -165,57 +148,37 @@ void BM_join(state_type& state, Join JoinFunc) // Setup join parameters and result table [[maybe_unused]] std::vector columns_to_join = {0}; - - // Benchmark the inner join operation - if constexpr (std::is_same_v and - (join_type != join_t::CONDITIONAL)) { - for (auto _ : state) { - cuda_event_timer raii(state, true, cudf::get_default_stream()); - - auto result = JoinFunc(left_table.select(columns_to_join), - right_table.select(columns_to_join), - cudf::null_equality::UNEQUAL); - } - } - if constexpr (std::is_same_v and (join_type != join_t::CONDITIONAL)) { - state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); - if constexpr (join_type == join_t::MIXED) { - auto const col_ref_left_0 = cudf::ast::column_reference(0); - auto const col_ref_right_0 = - cudf::ast::column_reference(0, cudf::ast::table_reference::RIGHT); - auto left_zero_eq_right_zero = - cudf::ast::operation(cudf::ast::ast_operator::EQUAL, col_ref_left_0, col_ref_right_0); - state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - auto result = JoinFunc(left_table.select(columns_to_join), - right_table.select(columns_to_join), - left_table.select({1}), - right_table.select({1}), - left_zero_eq_right_zero, - cudf::null_equality::UNEQUAL); - }); - } - if constexpr (join_type == join_t::HASH) { - state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - auto result = JoinFunc(left_table.select(columns_to_join), - right_table.select(columns_to_join), - cudf::null_equality::UNEQUAL); - }); - } - } - - // Benchmark conditional join - if constexpr (std::is_same_v and join_type == join_t::CONDITIONAL) { - // Common column references. 
+ state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); + if constexpr (join_type == join_t::CONDITIONAL) { auto const col_ref_left_0 = cudf::ast::column_reference(0); auto const col_ref_right_0 = cudf::ast::column_reference(0, cudf::ast::table_reference::RIGHT); auto left_zero_eq_right_zero = cudf::ast::operation(cudf::ast::ast_operator::EQUAL, col_ref_left_0, col_ref_right_0); - - for (auto _ : state) { - cuda_event_timer raii(state, true, cudf::get_default_stream()); - + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { auto result = JoinFunc(left_table, right_table, left_zero_eq_right_zero, cudf::null_equality::UNEQUAL); - } + ; + }); + } + if constexpr (join_type == join_t::MIXED) { + auto const col_ref_left_0 = cudf::ast::column_reference(0); + auto const col_ref_right_0 = cudf::ast::column_reference(0, cudf::ast::table_reference::RIGHT); + auto left_zero_eq_right_zero = + cudf::ast::operation(cudf::ast::ast_operator::EQUAL, col_ref_left_0, col_ref_right_0); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + auto result = JoinFunc(left_table.select(columns_to_join), + right_table.select(columns_to_join), + left_table.select({1}), + right_table.select({1}), + left_zero_eq_right_zero, + cudf::null_equality::UNEQUAL); + }); + } + if constexpr (join_type == join_t::HASH) { + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + auto result = JoinFunc(left_table.select(columns_to_join), + right_table.select(columns_to_join), + cudf::null_equality::UNEQUAL); + }); } } diff --git a/cpp/benchmarks/join/left_join.cu b/cpp/benchmarks/join/left_join.cu index 3e398e721fa..92123ce1621 100644 --- a/cpp/benchmarks/join/left_join.cu +++ b/cpp/benchmarks/join/left_join.cu @@ -14,115 +14,43 @@ * limitations under the License. 
*/ -#include - -template -class Join : public cudf::benchmark {}; - -#define LEFT_ANTI_JOIN_BENCHMARK_DEFINE(name, Key, Nullable) \ - BENCHMARK_TEMPLATE_DEFINE_F(Join, name, Key) \ - (::benchmark::State & st) \ - { \ - auto join = [](cudf::table_view const& left, \ - cudf::table_view const& right, \ - cudf::null_equality compare_nulls) { \ - return cudf::left_anti_join(left, right, compare_nulls); \ - }; \ - BM_join(st, join); \ - } - -LEFT_ANTI_JOIN_BENCHMARK_DEFINE(left_anti_join_32bit, int32_t, false); -LEFT_ANTI_JOIN_BENCHMARK_DEFINE(left_anti_join_64bit, int64_t, false); -LEFT_ANTI_JOIN_BENCHMARK_DEFINE(left_anti_join_32bit_nulls, int32_t, true); -LEFT_ANTI_JOIN_BENCHMARK_DEFINE(left_anti_join_64bit_nulls, int64_t, true); - -#define LEFT_SEMI_JOIN_BENCHMARK_DEFINE(name, Key, Nullable) \ - BENCHMARK_TEMPLATE_DEFINE_F(Join, name, Key) \ - (::benchmark::State & st) \ - { \ - auto join = [](cudf::table_view const& left, \ - cudf::table_view const& right, \ - cudf::null_equality compare_nulls) { \ - return cudf::left_semi_join(left, right, compare_nulls); \ - }; \ - BM_join(st, join); \ - } - -LEFT_SEMI_JOIN_BENCHMARK_DEFINE(left_semi_join_32bit, int32_t, false); -LEFT_SEMI_JOIN_BENCHMARK_DEFINE(left_semi_join_64bit, int64_t, false); -LEFT_SEMI_JOIN_BENCHMARK_DEFINE(left_semi_join_32bit_nulls, int32_t, true); -LEFT_SEMI_JOIN_BENCHMARK_DEFINE(left_semi_join_64bit_nulls, int64_t, true); - -// left anti-join ------------------------------------------------------------- -BENCHMARK_REGISTER_F(Join, left_anti_join_32bit) - ->Unit(benchmark::kMillisecond) - ->Args({100'000, 100'000}) - ->Args({100'000, 400'000}) - ->Args({100'000, 1'000'000}) - ->Args({10'000'000, 10'000'000}) - ->Args({10'000'000, 40'000'000}) - ->Args({10'000'000, 100'000'000}) - ->Args({100'000'000, 100'000'000}) - ->Args({80'000'000, 240'000'000}) - ->UseManualTime(); - -BENCHMARK_REGISTER_F(Join, left_anti_join_64bit) - ->Unit(benchmark::kMillisecond) - ->Args({50'000'000, 50'000'000}) - ->Args({40'000'000, 120'000'000}) - ->UseManualTime(); - -BENCHMARK_REGISTER_F(Join, left_anti_join_32bit_nulls) - ->Unit(benchmark::kMillisecond) - ->Args({100'000, 100'000}) - ->Args({100'000, 400'000}) - ->Args({100'000, 1'000'000}) - ->Args({10'000'000, 10'000'000}) - ->Args({10'000'000, 40'000'000}) - ->Args({10'000'000, 100'000'000}) - ->Args({100'000'000, 100'000'000}) - ->Args({80'000'000, 240'000'000}) - ->UseManualTime(); - -BENCHMARK_REGISTER_F(Join, left_anti_join_64bit_nulls) - ->Unit(benchmark::kMillisecond) - ->Args({50'000'000, 50'000'000}) - ->Args({40'000'000, 120'000'000}) - ->UseManualTime(); - -// left semi-join ------------------------------------------------------------- -BENCHMARK_REGISTER_F(Join, left_semi_join_32bit) - ->Unit(benchmark::kMillisecond) - ->Args({100'000, 100'000}) - ->Args({100'000, 400'000}) - ->Args({100'000, 1'000'000}) - ->Args({10'000'000, 10'000'000}) - ->Args({10'000'000, 40'000'000}) - ->Args({10'000'000, 100'000'000}) - ->Args({100'000'000, 100'000'000}) - ->Args({80'000'000, 240'000'000}) - ->UseManualTime(); - -BENCHMARK_REGISTER_F(Join, left_semi_join_64bit) - ->Unit(benchmark::kMillisecond) - ->Args({50'000'000, 50'000'000}) - ->Args({40'000'000, 120'000'000}) - ->UseManualTime(); - -BENCHMARK_REGISTER_F(Join, left_semi_join_32bit_nulls) - ->Unit(benchmark::kMillisecond) - ->Args({100'000, 100'000}) - ->Args({100'000, 400'000}) - ->Args({100'000, 1'000'000}) - ->Args({10'000'000, 10'000'000}) - ->Args({10'000'000, 40'000'000}) - ->Args({10'000'000, 100'000'000}) - ->Args({100'000'000, 
100'000'000}) - ->Args({80'000'000, 240'000'000}) - ->UseManualTime(); - -BENCHMARK_REGISTER_F(Join, left_semi_join_64bit_nulls) - ->Unit(benchmark::kMillisecond) - ->Args({50'000'000, 50'000'000}) - ->Args({40'000'000, 120'000'000}) - ->UseManualTime(); +#include "join_common.hpp" + +template +void nvbench_left_anti_join(nvbench::state& state, + nvbench::type_list>) +{ + auto join = [](cudf::table_view const& left, + cudf::table_view const& right, + cudf::null_equality compare_nulls) { + return cudf::left_anti_join(left, right, compare_nulls); + }; + + BM_join(state, join); +} + +template +void nvbench_left_semi_join(nvbench::state& state, + nvbench::type_list>) +{ + auto join = [](cudf::table_view const& left, + cudf::table_view const& right, + cudf::null_equality compare_nulls) { + return cudf::left_semi_join(left, right, compare_nulls); + }; + BM_join(state, join); +} + +NVBENCH_BENCH_TYPES(nvbench_left_anti_join, + NVBENCH_TYPE_AXES(JOIN_KEY_TYPE_RANGE, JOIN_NULLABLE_RANGE)) + .set_name("left_anti_join") + .set_type_axes_names({"Key", "Nullable"}) + .add_int64_axis("left_size", JOIN_SIZE_RANGE) + .add_int64_axis("right_size", JOIN_SIZE_RANGE); + +NVBENCH_BENCH_TYPES(nvbench_left_semi_join, + NVBENCH_TYPE_AXES(JOIN_KEY_TYPE_RANGE, JOIN_NULLABLE_RANGE)) + .set_name("left_semi_join") + .set_type_axes_names({"Key", "Nullable"}) + .add_int64_axis("left_size", JOIN_SIZE_RANGE) + .add_int64_axis("right_size", JOIN_SIZE_RANGE); From 66c2f4fded3aa5d83745fada3e4c4d5eee7895b2 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Tue, 11 Jun 2024 07:24:19 -0700 Subject: [PATCH 077/340] Condense pylibcudf data fixtures (#15958) Condense all pa_foo/plc_foo data fixtures into just foo, as recommended by https://github.com/rapidsai/cudf/pull/15839#discussion_r1626769872. 
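The pattern, in miniature (a distilled sketch of the condensed fixture shape;
the pylibcudf import path is an assumption for illustration and may not match
the test suite exactly):

```python
# One fixture now returns the (pyarrow, pylibcudf) pair; tests unpack it.
import pyarrow as pa
import pytest

import cudf._lib.pylibcudf as plc  # import path assumed for illustration


@pytest.fixture(scope="module")
def input_column():
    pa_array = pa.array([1, 2, 3])
    return pa_array, plc.interop.from_arrow(pa_array)


def test_roundtrip(input_column):
    pa_input_column, plc_input_column = input_column
    # exercise plc_input_column, compare results against pa_input_column
    assert plc_input_column.size() == len(pa_input_column)
```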
Authors: - Thomas Li (https://github.com/lithomas1) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/15958 --- .../cudf/cudf/pylibcudf_tests/test_copying.py | 499 ++++++++++-------- .../cudf/pylibcudf_tests/test_quantiles.py | 16 +- .../cudf/cudf/pylibcudf_tests/test_reshape.py | 20 +- .../pylibcudf_tests/test_string_capitalize.py | 54 +- .../pylibcudf_tests/test_string_contains.py | 15 +- .../cudf/pylibcudf_tests/test_string_find.py | 78 ++- 6 files changed, 358 insertions(+), 324 deletions(-) diff --git a/python/cudf/cudf/pylibcudf_tests/test_copying.py b/python/cudf/cudf/pylibcudf_tests/test_copying.py index cd70ce4abf5..da3ca3a6d1e 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_copying.py +++ b/python/cudf/cudf/pylibcudf_tests/test_copying.py @@ -20,121 +20,104 @@ # TODO: Test nullable data @pytest.fixture(scope="module") -def pa_input_column(pa_type): +def input_column(pa_type): if pa.types.is_integer(pa_type) or pa.types.is_floating(pa_type): - return pa.array([1, 2, 3], type=pa_type) + pa_array = pa.array([1, 2, 3], type=pa_type) elif pa.types.is_string(pa_type): - return pa.array(["a", "b", "c"], type=pa_type) + pa_array = pa.array(["a", "b", "c"], type=pa_type) elif pa.types.is_boolean(pa_type): - return pa.array([True, True, False], type=pa_type) + pa_array = pa.array([True, True, False], type=pa_type) elif pa.types.is_list(pa_type): # TODO: Add heterogenous sizes - return pa.array([[1], [2], [3]], type=pa_type) + pa_array = pa.array([[1], [2], [3]], type=pa_type) elif pa.types.is_struct(pa_type): - return pa.array([{"v": 1}, {"v": 2}, {"v": 3}], type=pa_type) - raise ValueError("Unsupported type") - - -@pytest.fixture(scope="module") -def input_column(pa_input_column): - return plc.interop.from_arrow(pa_input_column) + pa_array = pa.array([{"v": 1}, {"v": 2}, {"v": 3}], type=pa_type) + else: + raise ValueError("Unsupported type") + return pa_array, plc.interop.from_arrow(pa_array) @pytest.fixture(scope="module") -def pa_index_column(): +def index_column(): # Index column for testing gather/scatter, always integral. 
- return pa.array([1, 2, 3]) - - -@pytest.fixture(scope="module") -def index_column(pa_index_column): - return plc.interop.from_arrow(pa_index_column) + pa_array = pa.array([1, 2, 3]) + return pa_array, plc.interop.from_arrow(pa_array) @pytest.fixture(scope="module") -def pa_target_column(pa_type): +def target_column(pa_type): if pa.types.is_integer(pa_type) or pa.types.is_floating(pa_type): - return pa.array([4, 5, 6, 7, 8, 9], type=pa_type) + pa_array = pa.array([4, 5, 6, 7, 8, 9], type=pa_type) elif pa.types.is_string(pa_type): - return pa.array(["d", "e", "f", "g", "h", "i"], type=pa_type) + pa_array = pa.array(["d", "e", "f", "g", "h", "i"], type=pa_type) elif pa.types.is_boolean(pa_type): - return pa.array([False, True, True, False, True, False], type=pa_type) + pa_array = pa.array( + [False, True, True, False, True, False], type=pa_type + ) elif pa.types.is_list(pa_type): # TODO: Add heterogenous sizes - return pa.array([[4], [5], [6], [7], [8], [9]], type=pa_type) + pa_array = pa.array([[4], [5], [6], [7], [8], [9]], type=pa_type) elif pa.types.is_struct(pa_type): - return pa.array( + pa_array = pa.array( [{"v": 4}, {"v": 5}, {"v": 6}, {"v": 7}, {"v": 8}, {"v": 9}], type=pa_type, ) - raise ValueError("Unsupported type") - - -@pytest.fixture(scope="module") -def target_column(pa_target_column): - return plc.interop.from_arrow(pa_target_column) + else: + raise ValueError("Unsupported type") + return pa_array, plc.interop.from_arrow(pa_array) @pytest.fixture def mutable_target_column(target_column): - return target_column.copy() + _, plc_target_column = target_column + return plc_target_column.copy() @pytest.fixture(scope="module") -def pa_source_table(pa_input_column): - return pa.table([pa_input_column] * 3, [""] * 3) +def source_table(input_column): + pa_input_column, _ = input_column + pa_table = pa.table([pa_input_column] * 3, [""] * 3) + return pa_table, plc.interop.from_arrow(pa_table) @pytest.fixture(scope="module") -def source_table(pa_source_table): - return plc.interop.from_arrow(pa_source_table) +def target_table(target_column): + pa_target_column, _ = target_column + pa_table = pa.table([pa_target_column] * 3, [""] * 3) + return pa_table, plc.interop.from_arrow(pa_table) @pytest.fixture(scope="module") -def pa_target_table(pa_target_column): - return pa.table([pa_target_column] * 3, [""] * 3) - - -@pytest.fixture(scope="module") -def target_table(pa_target_table): - return plc.interop.from_arrow(pa_target_table) - - -@pytest.fixture(scope="module") -def pa_source_scalar(pa_type): +def source_scalar(pa_type): if pa.types.is_integer(pa_type) or pa.types.is_floating(pa_type): - return pa.scalar(1, type=pa_type) + pa_scalar = pa.scalar(1, type=pa_type) elif pa.types.is_string(pa_type): - return pa.scalar("a", type=pa_type) + pa_scalar = pa.scalar("a", type=pa_type) elif pa.types.is_boolean(pa_type): - return pa.scalar(False, type=pa_type) + pa_scalar = pa.scalar(False, type=pa_type) elif pa.types.is_list(pa_type): # TODO: Longer list? 
- return pa.scalar([1], type=pa_type) + pa_scalar = pa.scalar([1], type=pa_type) elif pa.types.is_struct(pa_type): - return pa.scalar({"v": 1}, type=pa_type) - raise ValueError("Unsupported type") - - -@pytest.fixture(scope="module") -def source_scalar(pa_source_scalar): - return plc.interop.from_arrow(pa_source_scalar) - - -@pytest.fixture(scope="module") -def pa_mask(pa_target_column): - return pa.array([True, False] * (len(pa_target_column) // 2)) + pa_scalar = pa.scalar({"v": 1}, type=pa_type) + else: + raise ValueError("Unsupported type") + return pa_scalar, plc.interop.from_arrow(pa_scalar) @pytest.fixture(scope="module") -def mask(pa_mask): - return plc.interop.from_arrow(pa_mask) +def mask(target_column): + pa_target_column, _ = target_column + pa_mask = pa.array([True, False] * (len(pa_target_column) // 2)) + return pa_mask, plc.interop.from_arrow(pa_mask) -def test_gather(target_table, pa_target_table, index_column, pa_index_column): +def test_gather(target_table, index_column): + pa_target_table, plc_target_table = target_table + pa_index_column, plc_index_column = index_column result = plc.copying.gather( - target_table, - index_column, + plc_target_table, + plc_index_column, plc.copying.OutOfBoundsPolicy.DONT_CHECK, ) expected = pa_target_table.take(pa_index_column) @@ -142,10 +125,11 @@ def test_gather(target_table, pa_target_table, index_column, pa_index_column): def test_gather_map_has_nulls(target_table): + _, plc_target_table = target_table gather_map = plc.interop.from_arrow(pa.array([0, 1, None])) with cudf_raises(ValueError): plc.copying.gather( - target_table, + plc_target_table, gather_map, plc.copying.OutOfBoundsPolicy.DONT_CHECK, ) @@ -185,16 +169,16 @@ def _pyarrow_boolean_mask_scatter_table(source, mask, target_table): def test_scatter_table( source_table, - pa_source_table, index_column, - pa_index_column, target_table, - pa_target_table, ): + pa_source_table, plc_source_table = source_table + pa_index_column, plc_index_column = index_column + pa_target_table, plc_target_table = target_table result = plc.copying.scatter( - source_table, - index_column, - target_table, + plc_source_table, + plc_index_column, + plc_target_table, ) if pa.types.is_list( @@ -247,68 +231,80 @@ def test_scatter_table_num_col_mismatch( source_table, index_column, target_table ): # Number of columns in source and target must match. + _, plc_source_table = source_table + _, plc_index_column = index_column + _, plc_target_table = target_table with cudf_raises(ValueError): plc.copying.scatter( - plc.Table(source_table.columns()[:2]), - index_column, - target_table, + plc.Table(plc_source_table.columns()[:2]), + plc_index_column, + plc_target_table, ) def test_scatter_table_num_row_mismatch(source_table, target_table): # Number of rows in source and scatter map must match. 
+ _, plc_source_table = source_table + _, plc_target_table = target_table with cudf_raises(ValueError): plc.copying.scatter( - source_table, + plc_source_table, plc.interop.from_arrow( - pa.array(range(source_table.num_rows() * 2)) + pa.array(range(plc_source_table.num_rows() * 2)) ), - target_table, + plc_target_table, ) def test_scatter_table_map_has_nulls(source_table, target_table): + _, plc_source_table = source_table + _, plc_target_table = target_table with cudf_raises(ValueError): plc.copying.scatter( - source_table, - plc.interop.from_arrow(pa.array([None] * source_table.num_rows())), - target_table, + plc_source_table, + plc.interop.from_arrow( + pa.array([None] * plc_source_table.num_rows()) + ), + plc_target_table, ) def test_scatter_table_type_mismatch(source_table, index_column, target_table): + _, plc_source_table = source_table + _, plc_index_column = index_column + _, plc_target_table = target_table with cudf_raises(TypeError): if is_integer( - dtype := target_table.columns()[0].type() + dtype := plc_target_table.columns()[0].type() ) or is_floating(dtype): - pa_array = pa.array([True] * source_table.num_rows()) + pa_array = pa.array([True] * plc_source_table.num_rows()) else: - pa_array = pa.array([1] * source_table.num_rows()) - ncol = source_table.num_columns() + pa_array = pa.array([1] * plc_source_table.num_rows()) + ncol = plc_source_table.num_columns() pa_table = pa.table([pa_array] * ncol, [""] * ncol) plc.copying.scatter( plc.interop.from_arrow(pa_table), - index_column, - target_table, + plc_index_column, + plc_target_table, ) def test_scatter_scalars( source_scalar, - pa_source_scalar, index_column, - pa_index_column, target_table, - pa_target_table, ): + pa_source_scalar, plc_source_scalar = source_scalar + pa_index_column, plc_index_column = index_column + pa_target_table, plc_target_table = target_table result = plc.copying.scatter( - [source_scalar] * target_table.num_columns(), - index_column, - target_table, + [plc_source_scalar] * plc_target_table.num_columns(), + plc_index_column, + plc_target_table, ) expected = _pyarrow_boolean_mask_scatter_table( - [pa_source_scalar] * target_table.num_columns(), + [pa_source_scalar] * plc_target_table.num_columns(), pc.invert( _pyarrow_index_to_mask(pa_index_column, pa_target_table.num_rows) ), @@ -321,85 +317,103 @@ def test_scatter_scalars( def test_scatter_scalars_num_scalars_mismatch( source_scalar, index_column, target_table ): + _, plc_source_scalar = source_scalar + _, plc_index_column = index_column + _, plc_target_table = target_table with cudf_raises(ValueError): plc.copying.scatter( - [source_scalar] * (target_table.num_columns() - 1), - index_column, - target_table, + [plc_source_scalar] * (plc_target_table.num_columns() - 1), + plc_index_column, + plc_target_table, ) def test_scatter_scalars_map_has_nulls(source_scalar, target_table): + _, plc_source_scalar = source_scalar + _, plc_target_table = target_table with cudf_raises(ValueError): plc.copying.scatter( - [source_scalar] * target_table.num_columns(), + [plc_source_scalar] * plc_target_table.num_columns(), plc.interop.from_arrow(pa.array([None, None])), - target_table, + plc_target_table, ) def test_scatter_scalars_type_mismatch(index_column, target_table): + _, plc_index_column = index_column + _, plc_target_table = target_table with cudf_raises(TypeError): if is_integer( - dtype := target_table.columns()[0].type() + dtype := plc_target_table.columns()[0].type() ) or is_floating(dtype): - source_scalar = [plc.interop.from_arrow(pa.scalar(True))] + 
plc_source_scalar = [plc.interop.from_arrow(pa.scalar(True))] else: - source_scalar = [plc.interop.from_arrow(pa.scalar(1))] + plc_source_scalar = [plc.interop.from_arrow(pa.scalar(1))] plc.copying.scatter( - source_scalar * target_table.num_columns(), - index_column, - target_table, + plc_source_scalar * plc_target_table.num_columns(), + plc_index_column, + plc_target_table, ) def test_empty_like_column(input_column): - result = plc.copying.empty_like(input_column) - assert result.type() == input_column.type() + _, plc_input_column = input_column + result = plc.copying.empty_like(plc_input_column) + assert result.type() == plc_input_column.type() def test_empty_like_table(source_table): - result = plc.copying.empty_like(source_table) - assert result.num_columns() == source_table.num_columns() - for icol, rcol in zip(source_table.columns(), result.columns()): + _, plc_source_table = source_table + result = plc.copying.empty_like(plc_source_table) + assert result.num_columns() == plc_source_table.num_columns() + for icol, rcol in zip(plc_source_table.columns(), result.columns()): assert rcol.type() == icol.type() @pytest.mark.parametrize("size", [None, 10]) def test_allocate_like(input_column, size): - if is_fixed_width(input_column.type()): + _, plc_input_column = input_column + if is_fixed_width(plc_input_column.type()): result = plc.copying.allocate_like( - input_column, plc.copying.MaskAllocationPolicy.RETAIN, size=size + plc_input_column, + plc.copying.MaskAllocationPolicy.RETAIN, + size=size, + ) + assert result.type() == plc_input_column.type() + assert result.size() == ( + plc_input_column.size() if size is None else size ) - assert result.type() == input_column.type() - assert result.size() == (input_column.size() if size is None else size) else: with pytest.raises(TypeError): plc.copying.allocate_like( - input_column, + plc_input_column, plc.copying.MaskAllocationPolicy.RETAIN, size=size, ) def test_copy_range_in_place( - input_column, pa_input_column, mutable_target_column, pa_target_column + input_column, mutable_target_column, target_column ): + pa_input_column, plc_input_column = input_column + + pa_target_column, _ = target_column + if not is_fixed_width(mutable_target_column.type()): with pytest.raises(TypeError): plc.copying.copy_range_in_place( - input_column, + plc_input_column, mutable_target_column, 0, - input_column.size(), + plc_input_column.size(), 0, ) else: plc.copying.copy_range_in_place( - input_column, + plc_input_column, mutable_target_column, 0, - input_column.size(), + plc_input_column.size(), 0, ) expected = _pyarrow_boolean_mask_scatter_column( @@ -415,36 +429,40 @@ def test_copy_range_in_place( def test_copy_range_in_place_out_of_bounds( input_column, mutable_target_column ): + _, plc_input_column = input_column + if is_fixed_width(mutable_target_column.type()): with cudf_raises(IndexError): plc.copying.copy_range_in_place( - input_column, + plc_input_column, mutable_target_column, 5, - 5 + input_column.size(), + 5 + plc_input_column.size(), 0, ) def test_copy_range_in_place_different_types(mutable_target_column): if is_integer(dtype := mutable_target_column.type()) or is_floating(dtype): - input_column = plc.interop.from_arrow(pa.array(["a", "b", "c"])) + plc_input_column = plc.interop.from_arrow(pa.array(["a", "b", "c"])) else: - input_column = plc.interop.from_arrow(pa.array([1, 2, 3])) + plc_input_column = plc.interop.from_arrow(pa.array([1, 2, 3])) with cudf_raises(TypeError): plc.copying.copy_range_in_place( - input_column, + plc_input_column, 
mutable_target_column, 0, - input_column.size(), + plc_input_column.size(), 0, ) def test_copy_range_in_place_null_mismatch( - pa_input_column, mutable_target_column + input_column, mutable_target_column ): + pa_input_column, _ = input_column + if is_fixed_width(mutable_target_column.type()): pa_input_column = pc.if_else( _pyarrow_index_to_mask([0], len(pa_input_column)), @@ -462,15 +480,15 @@ def test_copy_range_in_place_null_mismatch( ) -def test_copy_range( - input_column, pa_input_column, target_column, pa_target_column -): - if is_fixed_width(dtype := target_column.type()) or is_string(dtype): +def test_copy_range(input_column, target_column): + pa_input_column, plc_input_column = input_column + pa_target_column, plc_target_column = target_column + if is_fixed_width(dtype := plc_target_column.type()) or is_string(dtype): result = plc.copying.copy_range( - input_column, - target_column, + plc_input_column, + plc_target_column, 0, - input_column.size(), + plc_input_column.size(), 0, ) expected = _pyarrow_boolean_mask_scatter_column( @@ -484,137 +502,152 @@ def test_copy_range( else: with pytest.raises(TypeError): plc.copying.copy_range( - input_column, - target_column, + plc_input_column, + plc_target_column, 0, - input_column.size(), + plc_input_column.size(), 0, ) def test_copy_range_out_of_bounds(input_column, target_column): + _, plc_input_column = input_column + _, plc_target_column = target_column with cudf_raises(IndexError): plc.copying.copy_range( - input_column, - target_column, + plc_input_column, + plc_target_column, 5, - 5 + input_column.size(), + 5 + plc_input_column.size(), 0, ) def test_copy_range_different_types(target_column): - if is_integer(dtype := target_column.type()) or is_floating(dtype): - input_column = plc.interop.from_arrow(pa.array(["a", "b", "c"])) + _, plc_target_column = target_column + if is_integer(dtype := plc_target_column.type()) or is_floating(dtype): + plc_input_column = plc.interop.from_arrow(pa.array(["a", "b", "c"])) else: - input_column = plc.interop.from_arrow(pa.array([1, 2, 3])) + plc_input_column = plc.interop.from_arrow(pa.array([1, 2, 3])) with cudf_raises(TypeError): plc.copying.copy_range( - input_column, - target_column, + plc_input_column, + plc_target_column, 0, - input_column.size(), + plc_input_column.size(), 0, ) -def test_shift( - target_column, pa_target_column, source_scalar, pa_source_scalar -): +def test_shift(target_column, source_scalar): + pa_source_scalar, plc_source_scalar = source_scalar + pa_target_column, plc_target_column = target_column shift = 2 - if is_fixed_width(dtype := target_column.type()) or is_string(dtype): - result = plc.copying.shift(target_column, shift, source_scalar) + if is_fixed_width(dtype := plc_target_column.type()) or is_string(dtype): + result = plc.copying.shift(plc_target_column, shift, plc_source_scalar) expected = pa.concat_arrays( [pa.array([pa_source_scalar] * shift), pa_target_column[:-shift]] ) assert_column_eq(expected, result) else: with pytest.raises(TypeError): - plc.copying.shift(target_column, shift, source_scalar) + plc.copying.shift(plc_target_column, shift, source_scalar) def test_shift_type_mismatch(target_column): - if is_integer(dtype := target_column.type()) or is_floating(dtype): + _, plc_target_column = target_column + if is_integer(dtype := plc_target_column.type()) or is_floating(dtype): fill_value = plc.interop.from_arrow(pa.scalar("a")) else: fill_value = plc.interop.from_arrow(pa.scalar(1)) with cudf_raises(TypeError): - plc.copying.shift(target_column, 2, 
fill_value) + plc.copying.shift(plc_target_column, 2, fill_value) -def test_slice_column(target_column, pa_target_column): +def test_slice_column(target_column): + pa_target_column, plc_target_column = target_column bounds = list(range(6)) upper_bounds = bounds[1::2] lower_bounds = bounds[::2] - result = plc.copying.slice(target_column, bounds) + result = plc.copying.slice(plc_target_column, bounds) for lb, ub, slice_ in zip(lower_bounds, upper_bounds, result): assert_column_eq(pa_target_column[lb:ub], slice_) def test_slice_column_wrong_length(target_column): + _, plc_target_column = target_column with cudf_raises(ValueError): - plc.copying.slice(target_column, list(range(5))) + plc.copying.slice(plc_target_column, list(range(5))) def test_slice_column_decreasing(target_column): + _, plc_target_column = target_column with cudf_raises(ValueError): - plc.copying.slice(target_column, list(range(5, -1, -1))) + plc.copying.slice(plc_target_column, list(range(5, -1, -1))) def test_slice_column_out_of_bounds(target_column): + _, plc_target_column = target_column with cudf_raises(IndexError): - plc.copying.slice(target_column, list(range(2, 8))) + plc.copying.slice(plc_target_column, list(range(2, 8))) -def test_slice_table(target_table, pa_target_table): +def test_slice_table(target_table): + pa_target_table, plc_target_table = target_table bounds = list(range(6)) upper_bounds = bounds[1::2] lower_bounds = bounds[::2] - result = plc.copying.slice(target_table, bounds) + result = plc.copying.slice(plc_target_table, bounds) for lb, ub, slice_ in zip(lower_bounds, upper_bounds, result): assert_table_eq(pa_target_table[lb:ub], slice_) -def test_split_column(target_column, pa_target_column): +def test_split_column(target_column): upper_bounds = [1, 3, 5] lower_bounds = [0] + upper_bounds[:-1] - result = plc.copying.split(target_column, upper_bounds) + pa_target_column, plc_target_column = target_column + result = plc.copying.split(plc_target_column, upper_bounds) for lb, ub, split in zip(lower_bounds, upper_bounds, result): assert_column_eq(pa_target_column[lb:ub], split) def test_split_column_decreasing(target_column): + _, plc_target_column = target_column with cudf_raises(ValueError): - plc.copying.split(target_column, list(range(5, -1, -1))) + plc.copying.split(plc_target_column, list(range(5, -1, -1))) def test_split_column_out_of_bounds(target_column): + _, plc_target_column = target_column with cudf_raises(IndexError): - plc.copying.split(target_column, list(range(5, 8))) + plc.copying.split(plc_target_column, list(range(5, 8))) -def test_split_table(target_table, pa_target_table): +def test_split_table(target_table): + pa_target_table, plc_target_table = target_table upper_bounds = [1, 3, 5] lower_bounds = [0] + upper_bounds[:-1] - result = plc.copying.split(target_table, upper_bounds) + result = plc.copying.split(plc_target_table, upper_bounds) for lb, ub, split in zip(lower_bounds, upper_bounds, result): assert_table_eq(pa_target_table[lb:ub], split) -def test_copy_if_else_column_column( - target_column, pa_target_column, pa_source_scalar, mask, pa_mask -): +def test_copy_if_else_column_column(target_column, mask, source_scalar): + pa_target_column, plc_target_column = target_column + pa_source_scalar, _ = source_scalar + pa_mask, plc_mask = mask + pa_other_column = pa.concat_arrays( [pa.array([pa_source_scalar] * 2), pa_target_column[:-2]] ) - other_column = plc.interop.from_arrow(pa_other_column) + plc_other_column = plc.interop.from_arrow(pa_other_column) result = 
plc.copying.copy_if_else( - target_column, - other_column, - mask, + plc_target_column, + plc_other_column, + plc_mask, ) expected = pc.if_else( @@ -626,46 +659,51 @@ def test_copy_if_else_column_column( def test_copy_if_else_wrong_type(target_column, mask): - if is_integer(dtype := target_column.type()) or is_floating(dtype): - input_column = plc.interop.from_arrow( - pa.array(["a"] * target_column.size()) + _, plc_target_column = target_column + _, plc_mask = mask + if is_integer(dtype := plc_target_column.type()) or is_floating(dtype): + plc_input_column = plc.interop.from_arrow( + pa.array(["a"] * plc_target_column.size()) ) else: - input_column = plc.interop.from_arrow( - pa.array([1] * target_column.size()) + plc_input_column = plc.interop.from_arrow( + pa.array([1] * plc_target_column.size()) ) with cudf_raises(TypeError): - plc.copying.copy_if_else(input_column, target_column, mask) + plc.copying.copy_if_else(plc_input_column, plc_target_column, plc_mask) def test_copy_if_else_wrong_type_mask(target_column): + _, plc_target_column = target_column with cudf_raises(TypeError): plc.copying.copy_if_else( - target_column, - target_column, + plc_target_column, + plc_target_column, plc.interop.from_arrow( - pa.array([1.0, 2.0] * (target_column.size() // 2)) + pa.array([1.0, 2.0] * (plc_target_column.size() // 2)) ), ) def test_copy_if_else_wrong_size(target_column): + _, plc_target_column = target_column with cudf_raises(ValueError): plc.copying.copy_if_else( plc.interop.from_arrow(pa.array([1])), - target_column, + plc_target_column, plc.interop.from_arrow( - pa.array([True, False] * (target_column.size() // 2)) + pa.array([True, False] * (plc_target_column.size() // 2)) ), ) def test_copy_if_else_wrong_size_mask(target_column): + _, plc_target_column = target_column with cudf_raises(ValueError): plc.copying.copy_if_else( - target_column, - target_column, + plc_target_column, + plc_target_column, plc.interop.from_arrow(pa.array([True])), ) @@ -673,21 +711,21 @@ def test_copy_if_else_wrong_size_mask(target_column): @pytest.mark.parametrize("array_left", [True, False]) def test_copy_if_else_column_scalar( target_column, - pa_target_column, source_scalar, - pa_source_scalar, array_left, mask, - pa_mask, ): + pa_target_column, plc_target_column = target_column + pa_source_scalar, plc_source_scalar = source_scalar + pa_mask, plc_mask = mask args = ( - (target_column, source_scalar) + (plc_target_column, plc_source_scalar) if array_left - else (source_scalar, target_column) + else (plc_source_scalar, plc_target_column) ) result = plc.copying.copy_if_else( *args, - mask, + plc_mask, ) pa_args = ( @@ -704,16 +742,17 @@ def test_copy_if_else_column_scalar( def test_boolean_mask_scatter_from_table( source_table, - pa_source_table, target_table, - pa_target_table, mask, - pa_mask, ): + pa_source_table, plc_source_table = source_table + pa_target_table, plc_target_table = target_table + pa_mask, plc_mask = mask + result = plc.copying.boolean_mask_scatter( - source_table, - target_table, - mask, + plc_source_table, + plc_target_table, + plc_mask, ) if pa.types.is_list( @@ -757,28 +796,34 @@ def test_boolean_mask_scatter_from_table( def test_boolean_mask_scatter_from_wrong_num_cols(source_table, target_table): + _, plc_source_table = source_table + _, plc_target_table = target_table with cudf_raises(ValueError): plc.copying.boolean_mask_scatter( - plc.Table(source_table.columns()[:2]), - target_table, + plc.Table(plc_source_table.columns()[:2]), + plc_target_table, 
plc.interop.from_arrow(pa.array([True, False] * 3)), ) def test_boolean_mask_scatter_from_wrong_mask_size(source_table, target_table): + _, plc_source_table = source_table + _, plc_target_table = target_table with cudf_raises(ValueError): plc.copying.boolean_mask_scatter( - source_table, - target_table, + plc_source_table, + plc_target_table, plc.interop.from_arrow(pa.array([True, False] * 2)), ) def test_boolean_mask_scatter_from_wrong_num_true(source_table, target_table): + _, plc_source_table = source_table + _, plc_target_table = target_table with cudf_raises(ValueError): plc.copying.boolean_mask_scatter( - plc.Table(source_table.columns()[:2]), - target_table, + plc.Table(plc_source_table.columns()[:2]), + plc_target_table, plc.interop.from_arrow( pa.array([True, False] * 2 + [False, False]) ), @@ -786,44 +831,48 @@ def test_boolean_mask_scatter_from_wrong_num_true(source_table, target_table): def test_boolean_mask_scatter_from_wrong_col_type(target_table, mask): - if is_integer(dtype := target_table.columns()[0].type()) or is_floating( - dtype - ): + _, plc_target_table = target_table + _, plc_mask = mask + if is_integer( + dtype := plc_target_table.columns()[0].type() + ) or is_floating(dtype): input_column = plc.interop.from_arrow(pa.array(["a", "b", "c"])) else: input_column = plc.interop.from_arrow(pa.array([1, 2, 3])) with cudf_raises(TypeError): plc.copying.boolean_mask_scatter( - plc.Table([input_column] * 3), target_table, mask + plc.Table([input_column] * 3), plc_target_table, plc_mask ) def test_boolean_mask_scatter_from_wrong_mask_type(source_table, target_table): + _, plc_source_table = source_table + _, plc_target_table = target_table with cudf_raises(TypeError): plc.copying.boolean_mask_scatter( - source_table, - target_table, + plc_source_table, + plc_target_table, plc.interop.from_arrow(pa.array([1.0, 2.0] * 3)), ) def test_boolean_mask_scatter_from_scalars( source_scalar, - pa_source_scalar, target_table, - pa_target_table, mask, - pa_mask, ): + pa_source_scalar, plc_source_scalar = source_scalar + pa_target_table, plc_target_table = target_table + pa_mask, plc_mask = mask result = plc.copying.boolean_mask_scatter( - [source_scalar] * 3, - target_table, - mask, + [plc_source_scalar] * 3, + plc_target_table, + plc_mask, ) expected = _pyarrow_boolean_mask_scatter_table( - [pa_source_scalar] * target_table.num_columns(), + [pa_source_scalar] * plc_target_table.num_columns(), pc.invert(pa_mask), pa_target_table, ) @@ -831,9 +880,10 @@ def test_boolean_mask_scatter_from_scalars( assert_table_eq(expected, result) -def test_get_element(input_column, pa_input_column): +def test_get_element(input_column): index = 1 - result = plc.copying.get_element(input_column, index) + pa_input_column, plc_input_column = input_column + result = plc.copying.get_element(plc_input_column, index) assert ( plc.interop.to_arrow( @@ -844,5 +894,6 @@ def test_get_element(input_column, pa_input_column): def test_get_element_out_of_bounds(input_column): + _, plc_input_column = input_column with cudf_raises(IndexError): - plc.copying.get_element(input_column, 100) + plc.copying.get_element(plc_input_column, 100) diff --git a/python/cudf/cudf/pylibcudf_tests/test_quantiles.py b/python/cudf/cudf/pylibcudf_tests/test_quantiles.py index a5d332a7795..13f3b037606 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_quantiles.py +++ b/python/cudf/cudf/pylibcudf_tests/test_quantiles.py @@ -19,13 +19,9 @@ @pytest.fixture(scope="module", params=[[1, 2, 3, 4, 5], [5, 4, 3, 2, 1]]) -def pa_col_data(request, 
numeric_pa_type): - return pa.array(request.param, type=numeric_pa_type) - - -@pytest.fixture(scope="module") -def plc_col_data(pa_col_data): - return plc.interop.from_arrow(pa_col_data) +def col_data(request, numeric_pa_type): + pa_array = pa.array(request.param, type=numeric_pa_type) + return pa_array, plc.interop.from_arrow(pa_array) @pytest.fixture( @@ -60,7 +56,8 @@ def plc_tbl_data(request): @pytest.mark.parametrize("q", [[], [0], [0.5], [0.1, 0.5, 0.7, 0.9]]) @pytest.mark.parametrize("exact", [True, False]) -def test_quantile(pa_col_data, plc_col_data, interp_opt, q, exact): +def test_quantile(col_data, interp_opt, q, exact): + pa_col_data, plc_col_data = col_data ordered_indices = plc.interop.from_arrow( pc.cast(pc.sort_indices(pa_col_data), pa.int32()) ) @@ -210,7 +207,8 @@ def test_quantiles_invalid_interp(plc_tbl_data, invalid_interp): "q", [[0.1], (0.1,), np.array([0.1])], ) -def test_quantile_q_array_like(pa_col_data, plc_col_data, q): +def test_quantile_q_array_like(col_data, q): + pa_col_data, plc_col_data = col_data ordered_indices = plc.interop.from_arrow( pc.cast(pc.sort_indices(pa_col_data), pa.int32()) ) diff --git a/python/cudf/cudf/pylibcudf_tests/test_reshape.py b/python/cudf/cudf/pylibcudf_tests/test_reshape.py index 32d79257f4f..da1157e5832 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_reshape.py +++ b/python/cudf/cudf/pylibcudf_tests/test_reshape.py @@ -10,20 +10,15 @@ @pytest.fixture(scope="module") def reshape_data(): data = [[1, 2, 3], [4, 5, 6]] - return data + arrow_tbl = pa.Table.from_arrays(data, names=["a", "b"]) + return data, plc.interop.from_arrow(arrow_tbl) -@pytest.fixture(scope="module") -def reshape_plc_tbl(reshape_data): - arrow_tbl = pa.Table.from_arrays(reshape_data, names=["a", "b"]) - plc_tbl = plc.interop.from_arrow(arrow_tbl) - return plc_tbl - - -def test_interleave_columns(reshape_data, reshape_plc_tbl): +def test_interleave_columns(reshape_data): + raw_data, reshape_plc_tbl = reshape_data res = plc.reshape.interleave_columns(reshape_plc_tbl) - interleaved_data = [pa.array(pair) for pair in zip(*reshape_data)] + interleaved_data = [pa.array(pair) for pair in zip(*raw_data)] expect = pa.concat_arrays(interleaved_data) @@ -31,10 +26,11 @@ def test_interleave_columns(reshape_data, reshape_plc_tbl): @pytest.mark.parametrize("cnt", [0, 1, 3]) -def test_tile(reshape_data, reshape_plc_tbl, cnt): +def test_tile(reshape_data, cnt): + raw_data, reshape_plc_tbl = reshape_data res = plc.reshape.tile(reshape_plc_tbl, cnt) - tiled_data = [pa.array(col * cnt) for col in reshape_data] + tiled_data = [pa.array(col * cnt) for col in raw_data] expect = pa.Table.from_arrays( tiled_data, schema=plc.interop.to_arrow(reshape_plc_tbl).schema diff --git a/python/cudf/cudf/pylibcudf_tests/test_string_capitalize.py b/python/cudf/cudf/pylibcudf_tests/test_string_capitalize.py index 818d6e6e72a..c4e437fe5d9 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_string_capitalize.py +++ b/python/cudf/cudf/pylibcudf_tests/test_string_capitalize.py @@ -8,39 +8,38 @@ @pytest.fixture(scope="module") -def pa_data(): - data = [ - "leopard", - "Golden Eagle", - "SNAKE", - "", - "!A", - "hello World", - "A B C", - "#", - "AƻB", - "Ⓑⓖ", - "Art of War", - "The quick bRoWn fox juMps over the laze DOG", - '123nr98nv9rev!$#INF4390v03n1243<>?}{:-"', - "accénted", - None, - ] - return pa.array(data) - - -@pytest.fixture(scope="module") -def plc_data(pa_data): - return plc.interop.from_arrow(pa_data) +def str_data(): + pa_data = pa.array( + [ + "leopard", + "Golden Eagle", + "SNAKE", 
+ "", + "!A", + "hello World", + "A B C", + "#", + "AƻB", + "Ⓑⓖ", + "Art of War", + "The quick bRoWn fox juMps over the laze DOG", + '123nr98nv9rev!$#INF4390v03n1243<>?}{:-"', + "accénted", + None, + ] + ) + return pa_data, plc.interop.from_arrow(pa_data) -def test_capitalize(plc_data, pa_data): +def test_capitalize(str_data): + pa_data, plc_data = str_data got = plc.strings.capitalize.capitalize(plc_data) expected = pa.compute.utf8_capitalize(pa_data) assert_column_eq(expected, got) -def test_title(plc_data, pa_data): +def test_title(str_data): + pa_data, plc_data = str_data got = plc.strings.capitalize.title( plc_data, plc.strings.char_types.StringCharacterTypes.CASE_TYPES ) @@ -48,7 +47,8 @@ def test_title(plc_data, pa_data): assert_column_eq(expected, got) -def test_is_title(plc_data, pa_data): +def test_is_title(str_data): + pa_data, plc_data = str_data got = plc.strings.capitalize.is_title(plc_data) expected = pa.compute.utf8_is_title(pa_data) assert_column_eq(expected, got) diff --git a/python/cudf/cudf/pylibcudf_tests/test_string_contains.py b/python/cudf/cudf/pylibcudf_tests/test_string_contains.py index 8cdb6f7c521..fc8c6656b5d 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_string_contains.py +++ b/python/cudf/cudf/pylibcudf_tests/test_string_contains.py @@ -8,15 +8,11 @@ @pytest.fixture(scope="module") -def pa_target_col(): - return pa.array( +def target_col(): + pa_array = pa.array( ["AbC", "de", "FGHI", "j", "kLm", "nOPq", None, "RsT", None, "uVw"] ) - - -@pytest.fixture(scope="module") -def plc_target_col(pa_target_col): - return plc.interop.from_arrow(pa_target_col) + return pa_array, plc.interop.from_arrow(pa_array) @pytest.fixture( @@ -45,9 +41,8 @@ def plc_target_pat(pa_target_scalar): return prog -def test_contains_re( - pa_target_col, plc_target_col, pa_target_scalar, plc_target_pat -): +def test_contains_re(target_col, pa_target_scalar, plc_target_pat): + pa_target_col, plc_target_col = target_col got = plc.strings.contains.contains_re(plc_target_col, plc_target_pat) expected = pa.compute.match_substring_regex( pa_target_col, pa_target_scalar.as_py() diff --git a/python/cudf/cudf/pylibcudf_tests/test_string_find.py b/python/cudf/cudf/pylibcudf_tests/test_string_find.py index 44900044184..95a1a3cf731 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_string_find.py +++ b/python/cudf/cudf/pylibcudf_tests/test_string_find.py @@ -8,8 +8,8 @@ @pytest.fixture(scope="module") -def pa_data_col(): - return pa.array( +def data_col(): + pa_array = pa.array( [ "abc123", "ABC123", @@ -53,16 +53,12 @@ def pa_data_col(): None, ] ) + return pa_array, plc.interop.from_arrow(pa_array) @pytest.fixture(scope="module") -def plc_data_col(pa_data_col): - return plc.interop.from_arrow(pa_data_col) - - -@pytest.fixture(scope="module") -def pa_target_col(): - return pa.array( +def target_col(): + pa_array = pa.array( [ "a", "B", @@ -106,24 +102,18 @@ def pa_target_col(): None, # ends_with ] ) - - -@pytest.fixture(scope="module") -def plc_target_col(pa_target_col): - return plc.interop.from_arrow(pa_target_col) + return pa_array, plc.interop.from_arrow(pa_array) @pytest.fixture(params=["a", " ", "A", "Ab", "23"], scope="module") -def pa_target_scalar(request): - return pa.scalar(request.param, type=pa.string()) - - -@pytest.fixture(scope="module") -def plc_target_scalar(pa_target_scalar): - return plc.interop.from_arrow(pa_target_scalar) +def target_scalar(request): + pa_scalar = pa.scalar(request.param, type=pa.string()) + return pa_scalar, plc.interop.from_arrow(pa_scalar) -def 
test_find(pa_data_col, plc_data_col, pa_target_scalar, plc_target_scalar): +def test_find(data_col, target_scalar): + pa_data_col, plc_data_col = data_col + pa_target_scalar, plc_target_scalar = target_scalar got = plc.strings.find.find(plc_data_col, plc_target_scalar, 0, -1) expected = pa.array( @@ -161,7 +151,9 @@ def handle_none(st, target): return expected -def test_find_column(pa_data_col, pa_target_col, plc_data_col, plc_target_col): +def test_find_column(data_col, target_col): + pa_data_col, plc_data_col = data_col + pa_target_col, plc_target_col = target_col expected = pa.array( [ elem.find(target) if not (elem is None or target is None) else None @@ -177,7 +169,9 @@ def test_find_column(pa_data_col, pa_target_col, plc_data_col, plc_target_col): assert_column_eq(expected, got) -def test_rfind(pa_data_col, plc_data_col, pa_target_scalar, plc_target_scalar): +def test_rfind(data_col, target_scalar): + pa_data_col, plc_data_col = data_col + pa_target_scalar, plc_target_scalar = target_scalar py_target = pa_target_scalar.as_py() got = plc.strings.find.rfind(plc_data_col, plc_target_scalar, 0, -1) @@ -195,9 +189,9 @@ def test_rfind(pa_data_col, plc_data_col, pa_target_scalar, plc_target_scalar): assert_column_eq(expected, got) -def test_contains( - pa_data_col, plc_data_col, pa_target_scalar, plc_target_scalar -): +def test_contains(data_col, target_scalar): + pa_data_col, plc_data_col = data_col + pa_target_scalar, plc_target_scalar = target_scalar py_target = pa_target_scalar.as_py() got = plc.strings.find.contains(plc_data_col, plc_target_scalar) @@ -214,9 +208,9 @@ def test_contains( assert_column_eq(expected, got) -def test_contains_column( - pa_data_col, pa_target_col, plc_data_col, plc_target_col -): +def test_contains_column(data_col, target_col): + pa_data_col, plc_data_col = data_col + pa_target_col, plc_target_col = target_col expected = colwise_apply( pa_data_col, pa_target_col, lambda st, target: target in st ) @@ -224,18 +218,18 @@ def test_contains_column( assert_column_eq(expected, got) -def test_starts_with( - pa_data_col, plc_data_col, pa_target_scalar, plc_target_scalar -): +def test_starts_with(data_col, target_scalar): + pa_data_col, plc_data_col = data_col + pa_target_scalar, plc_target_scalar = target_scalar py_target = pa_target_scalar.as_py() got = plc.strings.find.starts_with(plc_data_col, plc_target_scalar) expected = pa.compute.starts_with(pa_data_col, py_target) assert_column_eq(expected, got) -def test_starts_with_column( - pa_data_col, pa_target_col, plc_data_col, plc_target_col -): +def test_starts_with_column(data_col, target_col): + pa_data_col, plc_data_col = data_col + pa_target_col, plc_target_col = target_col expected = colwise_apply( pa_data_col, pa_target_col, lambda st, target: st.startswith(target) ) @@ -243,18 +237,18 @@ def test_starts_with_column( assert_column_eq(expected, got) -def test_ends_with( - pa_data_col, plc_data_col, pa_target_scalar, plc_target_scalar -): +def test_ends_with(data_col, target_scalar): + pa_data_col, plc_data_col = data_col + pa_target_scalar, plc_target_scalar = target_scalar py_target = pa_target_scalar.as_py() got = plc.strings.find.ends_with(plc_data_col, plc_target_scalar) expected = pa.compute.ends_with(pa_data_col, py_target) assert_column_eq(expected, got) -def test_ends_with_column( - pa_data_col, pa_target_col, plc_data_col, plc_target_col -): +def test_ends_with_column(data_col, target_col): + pa_data_col, plc_data_col = data_col + pa_target_col, plc_target_col = target_col expected = colwise_apply( 
pa_data_col, pa_target_col, lambda st, target: st.endswith(target) ) From 22ac996dea6f297736c9fd8cda735c0e7a5dbe43 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Tue, 11 Jun 2024 16:30:09 +0100 Subject: [PATCH 078/340] Remove `Scalar` container type from polars interpreter (#15953) Now we always return columns and, where usage of a scalar might be correct (for example broadcasting in binops), we check if the column is "actually" a scalar and extract it. This is slightly annoying because we have to introspect things in various places. But without changing libcudf to treat length-1 columns as always broadcastable like scalars this is, I think, the best we can do. Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - https://github.com/brandon-b-miller - James Lamb (https://github.com/jameslamb) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/15953 --- python/cudf_polars/cudf_polars/__init__.py | 8 +- .../cudf_polars/containers/__init__.py | 3 +- .../cudf_polars/containers/column.py | 28 ++++- .../cudf_polars/containers/dataframe.py | 6 +- .../cudf_polars/containers/scalar.py | 23 ---- python/cudf_polars/cudf_polars/dsl/expr.py | 114 +++++++++++------- python/cudf_polars/cudf_polars/dsl/ir.py | 75 +++++++++--- .../cudf_polars/cudf_polars/dsl/translate.py | 4 +- .../cudf_polars/cudf_polars/utils/sorting.py | 2 +- python/cudf_polars/pyproject.toml | 3 - python/cudf_polars/tests/utils/__init__.py | 6 + .../cudf_polars/tests/utils/test_broadcast.py | 74 ++++++++++++ 12 files changed, 249 insertions(+), 97 deletions(-) delete mode 100644 python/cudf_polars/cudf_polars/containers/scalar.py create mode 100644 python/cudf_polars/tests/utils/__init__.py create mode 100644 python/cudf_polars/tests/utils/test_broadcast.py diff --git a/python/cudf_polars/cudf_polars/__init__.py b/python/cudf_polars/cudf_polars/__init__.py index b19a282129a..41d06f8631b 100644 --- a/python/cudf_polars/cudf_polars/__init__.py +++ b/python/cudf_polars/cudf_polars/__init__.py @@ -10,7 +10,13 @@ from __future__ import annotations +from cudf_polars._version import __git_commit__, __version__ from cudf_polars.callback import execute_with_cudf from cudf_polars.dsl.translate import translate_ir -__all__: list[str] = ["execute_with_cudf", "translate_ir"] +__all__: list[str] = [ + "execute_with_cudf", + "translate_ir", + "__git_commit__", + "__version__", +] diff --git a/python/cudf_polars/cudf_polars/containers/__init__.py b/python/cudf_polars/cudf_polars/containers/__init__.py index ee69e748eb5..06bb08953f1 100644 --- a/python/cudf_polars/cudf_polars/containers/__init__.py +++ b/python/cudf_polars/cudf_polars/containers/__init__.py @@ -5,8 +5,7 @@ from __future__ import annotations -__all__: list[str] = ["DataFrame", "Column", "NamedColumn", "Scalar"] +__all__: list[str] = ["DataFrame", "Column", "NamedColumn"] from cudf_polars.containers.column import Column, NamedColumn from cudf_polars.containers.dataframe import DataFrame -from cudf_polars.containers.scalar import Scalar diff --git a/python/cudf_polars/cudf_polars/containers/column.py b/python/cudf_polars/cudf_polars/containers/column.py index 575d15d3ece..156dd395d64 100644 --- a/python/cudf_polars/cudf_polars/containers/column.py +++ b/python/cudf_polars/cudf_polars/containers/column.py @@ -17,12 +17,13 @@ class Column: - """A column with sortedness metadata.""" + """An immutable column with sortedness metadata.""" obj: plc.Column is_sorted: plc.types.Sorted order: plc.types.Order null_order: 
plc.types.NullOrder + is_scalar: bool def __init__( self, @@ -33,10 +34,33 @@ def __init__( null_order: plc.types.NullOrder = plc.types.NullOrder.BEFORE, ): self.obj = column + self.is_scalar = self.obj.size() == 1 + if self.obj.size() <= 1: + is_sorted = plc.types.Sorted.YES self.is_sorted = is_sorted self.order = order self.null_order = null_order + @functools.cached_property + def obj_scalar(self) -> plc.Scalar: + """ + A copy of the column object as a pylibcudf Scalar. + + Returns + ------- + pylibcudf Scalar object. + + Raises + ------ + ValueError + If the column is not length-1. + """ + if not self.is_scalar: + raise ValueError( + f"Cannot convert a column of length {self.obj.size()} to scalar" + ) + return plc.copying.get_element(self.obj, 0) + def sorted_like(self, like: Column, /) -> Self: """ Copy sortedness properties from a column onto self. @@ -81,6 +105,8 @@ def set_sorted( ------- Self with metadata set. """ + if self.obj.size() <= 1: + is_sorted = plc.types.Sorted.YES self.is_sorted = is_sorted self.order = order self.null_order = null_order diff --git a/python/cudf_polars/cudf_polars/containers/dataframe.py b/python/cudf_polars/cudf_polars/containers/dataframe.py index ac7e748095e..7039fcaf077 100644 --- a/python/cudf_polars/cudf_polars/containers/dataframe.py +++ b/python/cudf_polars/cudf_polars/containers/dataframe.py @@ -32,7 +32,7 @@ class DataFrame: """A representation of a dataframe.""" columns: list[NamedColumn] - table: plc.Table | None + table: plc.Table def __init__(self, columns: Sequence[NamedColumn]) -> None: self.columns = list(columns) @@ -41,7 +41,7 @@ def __init__(self, columns: Sequence[NamedColumn]) -> None: def copy(self) -> Self: """Return a shallow copy of self.""" - return type(self)(self.columns) + return type(self)([c.copy() for c in self.columns]) def to_polars(self) -> pl.DataFrame: """Convert to a polars DataFrame.""" @@ -70,8 +70,6 @@ def num_columns(self) -> int: @cached_property def num_rows(self) -> int: """Number of rows.""" - if self.table is None: - raise ValueError("Number of rows of frame with scalars makes no sense") return self.table.num_rows() @classmethod diff --git a/python/cudf_polars/cudf_polars/containers/scalar.py b/python/cudf_polars/cudf_polars/containers/scalar.py deleted file mode 100644 index fc97d0fd9c2..00000000000 --- a/python/cudf_polars/cudf_polars/containers/scalar.py +++ /dev/null @@ -1,23 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. -# SPDX-License-Identifier: Apache-2.0 - -"""A scalar, with some properties.""" - -from __future__ import annotations - -from typing import TYPE_CHECKING - -if TYPE_CHECKING: - import cudf._lib.pylibcudf as plc - -__all__: list[str] = ["Scalar"] - - -class Scalar: - """A scalar, and a name.""" - - __slots__ = ("obj", "name") - obj: plc.Scalar - - def __init__(self, scalar: plc.Scalar): - self.obj = scalar diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index 6d9435ce373..a81cdcbf0c3 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -5,7 +5,7 @@ """ DSL nodes for the polars expression language. -An expression node is a function, `DataFrame -> Column` or `DataFrame -> Scalar`. +An expression node is a function, `DataFrame -> Column`. The evaluation context is provided by a LogicalPlan node, and can affect the evaluation rule as well as providing the dataframe input. 
@@ -26,7 +26,7 @@ import cudf._lib.pylibcudf as plc -from cudf_polars.containers import Column, NamedColumn, Scalar +from cudf_polars.containers import Column, NamedColumn from cudf_polars.utils import sorting if TYPE_CHECKING: @@ -165,7 +165,7 @@ def do_evaluate( *, context: ExecutionContext = ExecutionContext.FRAME, mapping: Mapping[Expr, Column] | None = None, - ) -> Column: # TODO: return type is a lie for Literal + ) -> Column: """ Evaluate this expression given a dataframe for context. @@ -187,8 +187,7 @@ def do_evaluate( Returns ------- - Column representing the evaluation of the expression (or maybe - a scalar). + Column representing the evaluation of the expression. Raises ------ @@ -205,7 +204,7 @@ def evaluate( *, context: ExecutionContext = ExecutionContext.FRAME, mapping: Mapping[Expr, Column] | None = None, - ) -> Column: # TODO: return type is a lie for Literal + ) -> Column: """ Evaluate this expression given a dataframe for context. @@ -222,23 +221,13 @@ def evaluate( Notes ----- - Individual subclasses should implement :meth:`do_allocate`, + Individual subclasses should implement :meth:`do_evaluate`, this method provides logic to handle lookups in the substitution mapping. - The typed return value of :class:`Column` is not true when - evaluating :class:`Literal` nodes (which instead produce - :class:`Scalar` objects). However, these duck-type to having a - pylibcudf container object inside them, and usually they end - up appearing in binary expressions which pylibcudf handles - appropriately since there are overloads for (column, scalar) - pairs. We don't have to handle (scalar, scalar) in binops - since the polars optimizer has a constant-folding pass. - Returns ------- - Column representing the evaluation of the expression (or maybe - a scalar). + Column representing the evaluation of the expression. Raises ------ @@ -319,24 +308,35 @@ def evaluate( context: ExecutionContext = ExecutionContext.FRAME, mapping: Mapping[Expr, Column] | None = None, ) -> NamedColumn: - """Evaluate this expression given a dataframe for context.""" + """ + Evaluate this expression given a dataframe for context. + + Parameters + ---------- + df + DataFrame providing context + context + Execution context + mapping + Substitution mapping + + Returns + ------- + NamedColumn attaching a name to an evaluated Column + + See Also + -------- + :meth:`Expr.evaluate` for details, this function just adds the + name to a column produced from an expression. + """ obj = self.value.evaluate(df, context=context, mapping=mapping) - if isinstance(obj, Scalar): - return NamedColumn( - plc.Column.from_scalar(obj.obj, 1), - self.name, - is_sorted=plc.types.Sorted.YES, - order=plc.types.Order.ASCENDING, - null_order=plc.types.NullOrder.BEFORE, - ) - else: - return NamedColumn( - obj.obj, - self.name, - is_sorted=obj.is_sorted, - order=obj.order, - null_order=obj.null_order, - ) + return NamedColumn( + obj.obj, + self.name, + is_sorted=obj.is_sorted, + order=obj.order, + null_order=obj.null_order, + ) def collect_agg(self, *, depth: int) -> AggInfo: """Collect information about aggregations in groupbys.""" @@ -363,7 +363,7 @@ def do_evaluate( ) -> Column: """Evaluate this expression given a dataframe for context.""" # datatype of pyarrow scalar is correct by construction. 
- return Scalar(plc.interop.from_arrow(self.value)) # type: ignore + return Column(plc.Column.from_scalar(plc.interop.from_arrow(self.value), 1)) class Col(Expr): @@ -402,8 +402,14 @@ def do_evaluate( mapping: Mapping[Expr, Column] | None = None, ) -> Column: """Evaluate this expression given a dataframe for context.""" - # TODO: type is wrong, and dtype - return df.num_rows # type: ignore + return Column( + plc.Column.from_scalar( + plc.interop.from_arrow( + pa.scalar(df.num_rows, type=plc.interop.to_arrow(self.dtype)) + ), + 1, + ) + ) def collect_agg(self, *, depth: int) -> AggInfo: """Collect information about aggregations in groupbys.""" @@ -664,10 +670,24 @@ def do_evaluate( return Column(plc.strings.case.to_upper(column.obj)) elif self.name == pl_expr.StringFunction.EndsWith: column, suffix = columns - return Column(plc.strings.find.ends_with(column.obj, suffix.obj)) + return Column( + plc.strings.find.ends_with( + column.obj, + suffix.obj_scalar + if column.obj.size() != suffix.obj.size() and suffix.is_scalar + else suffix.obj, + ) + ) elif self.name == pl_expr.StringFunction.StartsWith: - column, suffix = columns - return Column(plc.strings.find.starts_with(column.obj, suffix.obj)) + column, prefix = columns + return Column( + plc.strings.find.starts_with( + column.obj, + prefix.obj_scalar + if column.obj.size() != prefix.obj.size() and prefix.is_scalar + else prefix.obj, + ) + ) else: raise NotImplementedError(f"StringFunction {self.name}") @@ -875,9 +895,6 @@ def __init__( self, dtype: plc.DataType, name: str, options: Any, value: Expr ) -> None: super().__init__(dtype) - # TODO: fix polars name - if name == "nunique": - name = "n_unique" self.name = name self.options = options self.children = (value,) @@ -1092,8 +1109,15 @@ def do_evaluate( child.evaluate(df, context=context, mapping=mapping) for child in self.children ) + lop = left.obj + rop = right.obj + if left.obj.size() != right.obj.size(): + if left.is_scalar: + lop = left.obj_scalar + elif right.is_scalar: + rop = right.obj_scalar return Column( - plc.binaryop.binary_operation(left.obj, right.obj, self.op, self.dtype), + plc.binaryop.binary_operation(lop, rop, self.op, self.dtype), ) def collect_agg(self, *, depth: int) -> AggInfo: diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index 665bbe5be41..0a6deb5698c 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -63,26 +63,58 @@ def broadcast( *columns: NamedColumn, target_length: int | None = None ) -> list[NamedColumn]: - lengths = {column.obj.size() for column in columns} - if len(lengths - {1}) > 1: - raise RuntimeError("Mismatching column lengths") + """ + Broadcast a sequence of columns to a common length. + + Parameters + ---------- + columns + Columns to broadcast. + target_length + Optional length to broadcast to. If not provided, uses the + non-unit length of existing columns. + + Returns + ------- + List of broadcasted columns all of the same length. + + Raises + ------ + RuntimeError + If broadcasting is not possible. + + Notes + ----- + In evaluation of a set of expressions, polars type-puns length-1 + columns with scalars. When we insert these into a DataFrame + object, we need to ensure they are of equal length. This function + takes some columns, some of which may be length-1 and ensures that + all length-1 columns are broadcast to the length of the others. 
+
+    Broadcasting is only possible if the set of lengths of the input
+    columns is a subset of ``{1, n}`` for some (fixed) ``n``. If
+    ``target_length`` is provided and not all columns are length-1
+    (i.e. ``n != 1``), then ``target_length`` must be equal to ``n``.
+    """
+    lengths: set[int] = {column.obj.size() for column in columns}
     if lengths == {1}:
         if target_length is None:
             return list(columns)
         nrows = target_length
-    elif len(lengths) == 1:
-        if target_length is not None:
-            assert target_length in lengths
-        return list(columns)
     else:
-        (nrows,) = lengths - {1}
-        if target_length is not None:
-            assert target_length == nrows
+        try:
+            (nrows,) = lengths.difference([1])
+        except ValueError as e:
+            raise RuntimeError("Mismatching column lengths") from e
+        if target_length is not None and nrows != target_length:
+            raise RuntimeError(
+                f"Cannot broadcast columns of length {nrows=} to {target_length=}"
+            )
     return [
         column
         if column.obj.size() != 1
         else NamedColumn(
-            plc.Column.from_scalar(plc.copying.get_element(column.obj, 0), nrows),
+            plc.Column.from_scalar(column.obj_scalar, nrows),
             column.name,
             is_sorted=plc.types.Sorted.YES,
             order=plc.types.Order.ASCENDING,
@@ -279,12 +311,16 @@ class Select(IR):
     """Input dataframe."""
     expr: list[expr.NamedExpr]
     """List of expressions to evaluate to form the new dataframe."""
+    should_broadcast: bool
+    """Should columns be broadcast?"""

     def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         """Evaluate and return a dataframe."""
         df = self.df.evaluate(cache=cache)
         # Handle any broadcasting
-        columns = broadcast(*(e.evaluate(df) for e in self.expr))
+        columns = [e.evaluate(df) for e in self.expr]
+        if self.should_broadcast:
+            columns = broadcast(*columns)
         return DataFrame(columns)


@@ -587,15 +623,24 @@ class HStack(IR):
     """Input dataframe."""
     columns: list[expr.NamedExpr]
     """List of expressions to produce new columns."""
+    should_broadcast: bool
+    """Should columns be broadcast?"""

     def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         """Evaluate and return a dataframe."""
         df = self.df.evaluate(cache=cache)
         columns = [c.evaluate(df) for c in self.columns]
-        # TODO: a bit of a hack, should inherit the should_broadcast
-        # property of polars' ProjectionOptions on the hstack node.
-        if not any(e.name.startswith("__POLARS_CSER_0x") for e in self.columns):
+        if self.should_broadcast:
             columns = broadcast(*columns, target_length=df.num_rows)
+        else:
+            # Polars ensures this is true, but let's make sure nothing
+            # went wrong. In this case, the parent node is
+            # guaranteed to be a Select which will take care of making
+            # sure that everything is the same length. The result
+            # table that might have mismatching column lengths will
+            # never be turned into a pylibcudf Table with all columns
+            # by the Select, which is why this is safe.
+ assert all(e.name.startswith("__POLARS_CSER_0x") for e in self.columns) return df.with_columns(columns) diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py index 38107023365..adde3b1a9dc 100644 --- a/python/cudf_polars/cudf_polars/dsl/translate.py +++ b/python/cudf_polars/cudf_polars/dsl/translate.py @@ -122,7 +122,7 @@ def _( with set_node(visitor, node.input): inp = translate_ir(visitor, n=None) exprs = [translate_named_expr(visitor, n=e) for e in node.expr] - return ir.Select(schema, inp, exprs) + return ir.Select(schema, inp, exprs, node.should_broadcast) @_translate_ir.register @@ -166,7 +166,7 @@ def _( with set_node(visitor, node.input): inp = translate_ir(visitor, n=None) exprs = [translate_named_expr(visitor, n=e) for e in node.exprs] - return ir.HStack(schema, inp, exprs) + return ir.HStack(schema, inp, exprs, node.should_broadcast) @_translate_ir.register diff --git a/python/cudf_polars/cudf_polars/utils/sorting.py b/python/cudf_polars/cudf_polars/utils/sorting.py index d35459db20d..24fd449dd88 100644 --- a/python/cudf_polars/cudf_polars/utils/sorting.py +++ b/python/cudf_polars/cudf_polars/utils/sorting.py @@ -30,7 +30,7 @@ def sort_order( Returns ------- - tuple of column_order and null_precendence + tuple of column_order and null_precedence suitable for passing to sort routines """ # Mimicking polars broadcast handling of descending diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml index 2faf8c3193f..11178a3be74 100644 --- a/python/cudf_polars/pyproject.toml +++ b/python/cudf_polars/pyproject.toml @@ -49,9 +49,6 @@ license-files = ["LICENSE"] [tool.setuptools.dynamic] version = {file = "cudf_polars/VERSION"} -[tool.setuptools.packages.find] -exclude = ["*tests*"] - [tool.pytest.ini_options] xfail_strict = true diff --git a/python/cudf_polars/tests/utils/__init__.py b/python/cudf_polars/tests/utils/__init__.py new file mode 100644 index 00000000000..4611d642f14 --- /dev/null +++ b/python/cudf_polars/tests/utils/__init__.py @@ -0,0 +1,6 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +__all__: list[str] = [] diff --git a/python/cudf_polars/tests/utils/test_broadcast.py b/python/cudf_polars/tests/utils/test_broadcast.py new file mode 100644 index 00000000000..69ad1e519e2 --- /dev/null +++ b/python/cudf_polars/tests/utils/test_broadcast.py @@ -0,0 +1,74 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +import pytest + +import cudf._lib.pylibcudf as plc + +from cudf_polars.containers import NamedColumn +from cudf_polars.dsl.ir import broadcast + + +@pytest.mark.parametrize("target", [4, None]) +def test_broadcast_all_scalar(target): + columns = [ + NamedColumn( + plc.column_factories.make_numeric_column( + plc.DataType(plc.TypeId.INT8), 1, plc.MaskState.ALL_VALID + ), + f"col{i}", + ) + for i in range(3) + ] + result = broadcast(*columns, target_length=target) + expected = 1 if target is None else target + + assert all(column.obj.size() == expected for column in result) + + +def test_invalid_target_length(): + columns = [ + NamedColumn( + plc.column_factories.make_numeric_column( + plc.DataType(plc.TypeId.INT8), 4, plc.MaskState.ALL_VALID + ), + f"col{i}", + ) + for i in range(3) + ] + with pytest.raises(RuntimeError): + _ = broadcast(*columns, target_length=8) + + +def test_broadcast_mismatching_column_lengths(): + columns = [ + NamedColumn( + plc.column_factories.make_numeric_column( + plc.DataType(plc.TypeId.INT8), i + 1, plc.MaskState.ALL_VALID + ), + f"col{i}", + ) + for i in range(3) + ] + with pytest.raises(RuntimeError): + _ = broadcast(*columns) + + +@pytest.mark.parametrize("nrows", [0, 5]) +def test_broadcast_with_scalars(nrows): + columns = [ + NamedColumn( + plc.column_factories.make_numeric_column( + plc.DataType(plc.TypeId.INT8), + nrows if i == 0 else 1, + plc.MaskState.ALL_VALID, + ), + f"col{i}", + ) + for i in range(3) + ] + + result = broadcast(*columns) + assert all(column.obj.size() == nrows for column in result) From 8efa64ea61905969423bbfcc11353817c7cc1bca Mon Sep 17 00:00:00 2001 From: "Richard (Rick) Zamora" Date: Tue, 11 Jun 2024 11:31:20 -0500 Subject: [PATCH 079/340] Fix `dask_cudf.read_parquet` regression for legacy timestamp data (#15929) cudf does not currently support timezone-aware datetime columns. For example: ```python pdf = pd.DataFrame( { "time": pd.to_datetime( ["1996-01-02", "1996-12-01"], utc=True, ), "x": [1, 2], } ) cudf.DataFrame.from_pandas(pdf) ``` ``` NotImplementedError: cuDF does not yet support timezone-aware datetimes ``` However, `cudf.read_parquet` **does** allow you to read this same data from a Parquet file. This PR adds a simple fix to allow the same data to be read with `dask_cudf`. The dask_cudf version was previously "broken" because it relies on upstream pyarrow logic to construct `meta` as a pandas DataFrame (and then we just convert `meta` from pandas to cudf). As illustrated in the example above, this direct conversion is not allowed when one or more columns contain timezone information. **Important Context** The actual motivation for this PR is to fix a **regression** in 24.06+ for older parquet files containing "legacy" timestamp types (e.g. `TIMESTAMP_MILLIS` and `TIMESTAMP_MICROS`). In `pyarrow 14.0.2` (used by cudf-24.04), these legacy types were not automatically translated to timezone-aware dtypes by pyarrow. In `pyarrow 16.1.0` (used by cudf-24.06+), the legacy types **ARE** automatically translated. Therefore, in moving from cudf-24.04 to cudf-24.06+, some `dask_cudf` users will find that they can no longer read the same parquet file containing legacy timestamp data. I'm not entirely sure if cudf should always allow users to read Parquet data with timezone-aware dtypes (e.g. if the timezone is **not** utc), but it definitely makes sense for cudf to ignore automatic/unnecessary timezone translations. 
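For reference, a minimal sketch of the round trip this change repairs (the
parquet path here is hypothetical, and the data mirrors the updated test
below):

```python
import pandas as pd
import dask_cudf

pdf = pd.DataFrame(
    {
        "time": pd.to_datetime(["1996-01-02", "1996-12-01"], utc=True),
        "x": [1, 2],
    }
)
pdf.to_parquet("tz.parquet")

# Previously this raised NotImplementedError while converting the pandas
# `meta` to cudf; with this fix the unsupported timezone information is
# dropped from `meta` first.
got = dask_cudf.read_parquet("tz.parquet")
```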
Authors:
  - Richard (Rick) Zamora (https://github.com/rjzamora)

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)
  - Lawrence Mitchell (https://github.com/wence-)

URL: https://github.com/rapidsai/cudf/pull/15929
---
 python/dask_cudf/dask_cudf/io/parquet.py            | 5 +++++
 python/dask_cudf/dask_cudf/io/tests/test_parquet.py | 9 ++++-----
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/python/dask_cudf/dask_cudf/io/parquet.py b/python/dask_cudf/dask_cudf/io/parquet.py
index fc962670c47..ba8b1e89721 100644
--- a/python/dask_cudf/dask_cudf/io/parquet.py
+++ b/python/dask_cudf/dask_cudf/io/parquet.py
@@ -6,6 +6,7 @@
 from io import BufferedWriter, BytesIO, IOBase

 import numpy as np
+import pandas as pd
 from pyarrow import dataset as pa_ds, parquet as pq

 from dask import dataframe as dd
@@ -41,6 +42,10 @@ def _create_dd_meta(cls, dataset_info, **kwargs):
         meta_pd = super()._create_dd_meta(dataset_info, **kwargs)

         # Convert to cudf
+        # (drop unsupported timezone information)
+        for k, v in meta_pd.dtypes.items():
+            if isinstance(v, pd.DatetimeTZDtype) and v.tz is not None:
+                meta_pd[k] = meta_pd[k].dt.tz_localize(None)
         meta_cudf = cudf.from_pandas(meta_pd)

         # Re-set "object" dtypes to align with pa schema
diff --git a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py
index f3e3911e6c7..620a917109e 100644
--- a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py
+++ b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py
@@ -610,9 +610,8 @@ def test_timezone_column(tmpdir):
         }
     )
     pdf.to_parquet(path)
+
+    # Check that `cudf` and `dask_cudf` results match
     got = dask_cudf.read_parquet(path)
-    # cudf.read_parquet does not support reading timezone aware types yet
-    assert got["time"].dtype == pd.DatetimeTZDtype("ns", "UTC")
-    got["time"] = got["time"].astype("datetime64[ns]")
-    expected = cudf.read_parquet(path)
-    dd.assert_eq(got, expected)
+    expect = cudf.read_parquet(path)
+    dd.assert_eq(got, expect)

From d844d670dfbfcbaeb673253f762bed7fbebf6c86 Mon Sep 17 00:00:00 2001
From: Ben Jarmak <104460670+jarmak-nv@users.noreply.github.com>
Date: Tue, 11 Jun 2024 13:05:01 -0400
Subject: [PATCH 080/340] Project automation bug fixes (#15971)

## Description

This PR resolves two bugs in the recent PR #15945.

## External issue labeling

Recent runs show that it is labeling [issues
created](https://github.com/rapidsai/cudf/issues/15967) by team members as
`External`.

Using GraphQL to explore the authorAssociation shows
`"authorAssociation": "MEMBER"`. I've updated the permissions to be
specific to the job in an attempt to ensure that we have the permissions we
need. Testing this action in personal repos shows it works as expected, so
I'm not 100% sure what's going on.

A PR was also unable to run due to the token only having read permissions,
so hopefully this is a two-birds-one-stone fix.

It may be beneficial to re-run
https://github.com/rapidsai/cudf/actions/runs/9462546964/job/26065765728
with debug mode on to see if `author_association` appears different to the
action (which would be concerning).

## Project automation

This fixes the workflow incorrectly calling my personal workflows for
testing.

## Checklist
- [x] I am familiar with the [Contributing Guidelines](https://github.com/rapidsai/cudf/blob/HEAD/CONTRIBUTING.md).
- [ ] ~New or existing tests cover these changes.~ - [ ] ~The documentation is up to date with these changes.~ --- .github/workflows/external_issue_labeler.yml | 25 +++++++++++-------- .../workflows/pr_issue_status_automation.yml | 2 +- 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/.github/workflows/external_issue_labeler.yml b/.github/workflows/external_issue_labeler.yml index e6d987e9f34..81bc9b18296 100644 --- a/.github/workflows/external_issue_labeler.yml +++ b/.github/workflows/external_issue_labeler.yml @@ -20,36 +20,41 @@ on: types: - opened - pull_request: + pull_request_target: types: - opened env: GITHUB_TOKEN: ${{ github.token }} -permissions: - issues: write - pull-requests: write - jobs: Label-Issue: runs-on: ubuntu-latest - # Only run if the issue author is not part of RAPIDS - if: ${{ ! contains(fromJSON('["OWNER", "MEMBER", "COLLABORATOR"]'), github.event.issue.author_association)}} + permissions: + issues: write + if: github.event_name == 'issues' steps: - name: add-external-labels + # Only run if the issue author is not part of RAPIDS + if: ${{ ! contains(fromJSON('["OWNER", "MEMBER", "COLLABORATOR"]'), github.event.issue.author_association)}} run: | + echo ${{ github.event.issue.author_association }} issue_url=${{ github.event.issue.html_url }} gh issue edit ${issue_url} --add-label "External" continue-on-error: true Label-PR: runs-on: ubuntu-latest - # Only run if the issue author is not part of RAPIDS - if: ${{ ! contains(fromJSON('["OWNER", "MEMBER", "COLLABORATOR"]'), github.event.pull_request.author_association)}} + permissions: + pull-requests: write + issues: write + if: github.event_name == 'pull_request_target' steps: - name: add-external-labels + # Only run if the issue author is not part of RAPIDS + if: ${{ ! 
contains(fromJSON('["OWNER", "MEMBER", "COLLABORATOR"]'), github.event.pull_request.author_association)}} run: | + echo ${{ github.event.pull_request.author_association }} pr_url=${{ github.event.pull_request.html_url }} gh issue edit ${pr_url} --add-label "External" - continue-on-error: true + continue-on-error: true diff --git a/.github/workflows/pr_issue_status_automation.yml b/.github/workflows/pr_issue_status_automation.yml index aaece1bfa3e..837963c3286 100644 --- a/.github/workflows/pr_issue_status_automation.yml +++ b/.github/workflows/pr_issue_status_automation.yml @@ -50,7 +50,7 @@ jobs: update-sprint: # This job sets the PR and its linked issues to the current "Weekly Sprint" - uses: jarmak-nv/shared-workflows/.github/workflows/project-get-set-iteration-field.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/project-get-set-iteration-field.yaml@branch-24.08 if: github.event.pull_request.state == 'open' needs: get-project-id with: From dfa79d457138dcb9a70410e06c77c45a63ae0b25 Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Tue, 11 Jun 2024 14:58:06 -0400 Subject: [PATCH 081/340] Add a developer check for proxy objects (#15956) Closes #15864 Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/15956 --- docs/cudf/source/developer_guide/cudf_pandas.md | 9 +++++++++ python/cudf/cudf/pandas/__init__.py | 5 +++-- python/cudf/cudf/pandas/fast_slow_proxy.py | 14 ++++++++++++++ .../cudf/cudf_pandas_tests/test_cudf_pandas.py | 16 +++++++++++++++- 4 files changed, 41 insertions(+), 3 deletions(-) diff --git a/docs/cudf/source/developer_guide/cudf_pandas.md b/docs/cudf/source/developer_guide/cudf_pandas.md index 827ba18a4a4..a8a6d81d6fb 100644 --- a/docs/cudf/source/developer_guide/cudf_pandas.md +++ b/docs/cudf/source/developer_guide/cudf_pandas.md @@ -20,6 +20,7 @@ The "wrapped" types/classes are the Pandas and cuDF specific types that have bee Wrapped objects and proxy objects are instances of wrapped types and proxy types, respectively. In the snippet below `s1` and `s2` are wrapped objects and `s3` is a fast-slow proxy object. Also note that the module `xpd` is a wrapped module and contains cuDF and Pandas modules as attributes. +To check if an object is a proxy type, we can use `cudf.pandas.is_proxy_object`. ```python import cudf.pandas cudf.pandas.install() @@ -31,6 +32,14 @@ Also note that the module `xpd` is a wrapped module and contains cuDF and Pandas s1 = cudf.Series([1,2]) s2 = pd.Series([1,2]) s3 = xpd.Series([1,2]) + + from cudf.pandas import is_proxy_object + + is_proxy_object(s1) # returns False + + is_proxy_object(s2) # returns False + + is_proxy_object(s3) # returns True ``` ```{note} diff --git a/python/cudf/cudf/pandas/__init__.py b/python/cudf/cudf/pandas/__init__.py index f2e855ae55c..5b3785531d3 100644 --- a/python/cudf/cudf/pandas/__init__.py +++ b/python/cudf/cudf/pandas/__init__.py @@ -1,11 +1,12 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. # All rights reserved. 
 # SPDX-License-Identifier: Apache-2.0

+from .fast_slow_proxy import is_proxy_object
 from .magics import load_ipython_extension
 from .profiler import Profiler

-__all__ = ["Profiler", "load_ipython_extension", "install"]
+__all__ = ["Profiler", "load_ipython_extension", "install", "is_proxy_object"]

 LOADED = False
diff --git a/python/cudf/cudf/pandas/fast_slow_proxy.py b/python/cudf/cudf/pandas/fast_slow_proxy.py
index 5f4cf2e6cc6..128913e5746 100644
--- a/python/cudf/cudf/pandas/fast_slow_proxy.py
+++ b/python/cudf/cudf/pandas/fast_slow_proxy.py
@@ -1185,6 +1185,20 @@ def _replace_closurevars(
     )


+def is_proxy_object(obj: Any) -> bool:
+    """Determine if an object is a proxy object
+
+    Parameters
+    ----------
+    obj : object
+        Any Python object.
+
+    """
+    if _FastSlowProxyMeta in type(type(obj)).__mro__:
+        return True
+    return False
+
+
 NUMPY_TYPES: Set[str] = set(np.sctypeDict.values())
diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
index 72e9ad5fca3..515a4714a5a 100644
--- a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
+++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
@@ -20,7 +20,7 @@
 from pytz import utc

 from cudf.pandas import LOADED, Profiler
-from cudf.pandas.fast_slow_proxy import _Unusable
+from cudf.pandas.fast_slow_proxy import _Unusable, is_proxy_object

 if not LOADED:
     raise ImportError("These tests must be run with cudf.pandas loaded")
@@ -1488,3 +1488,17 @@ def mock_mean_none(self, *args, **kwargs):

 def test_excelwriter_pathlike():
     assert isinstance(pd.ExcelWriter("foo.xlsx"), os.PathLike)
+
+
+def test_is_proxy_object():
+    np_arr = np.array([1])
+
+    s1 = xpd.Series([1])
+    s2 = pd.Series([1])
+
+    np_arr_proxy = s1.to_numpy()
+
+    assert not is_proxy_object(np_arr)
+    assert is_proxy_object(np_arr_proxy)
+    assert is_proxy_object(s1)
+    assert not is_proxy_object(s2)

From f655602ecd8f254dfcee5eb0c790bd3336e83d7c Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani
Date: Tue, 11 Jun 2024 15:59:20 -0700
Subject: [PATCH 082/340] Fix Cython typo preventing proper inheritance (#15978)

#15831 added new inheritance patterns to the Parquet options classes, but
mirroring them perfectly in Cython proved problematic due to what appeared
to be issues with Cython parsing of CRTP and inheritance. A deeper
investigation revealed that the underlying issue was
https://github.com/cython/cython/issues/6238. This PR applies the
appropriate fix.
Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Thomas Li (https://github.com/lithomas1) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/15978 --- .../_lib/pylibcudf/libcudf/io/parquet.pxd | 24 ++++++------------- 1 file changed, 7 insertions(+), 17 deletions(-) diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/parquet.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/parquet.pxd index 36654457995..0ef6553db56 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/parquet.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/parquet.pxd @@ -123,7 +123,7 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: ) except + cdef cppclass parquet_writer_options_builder_base[BuilderT, OptionsT]: - parquet_writer_options_builder() except + + parquet_writer_options_builder_base() except + BuilderT& metadata( cudf_io_types.table_input_metadata m @@ -164,22 +164,6 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: BuilderT& dictionary_policy( cudf_io_types.dictionary_policy val ) except + - # FIXME: the following two functions actually belong in - # parquet_writer_options_builder, but placing them there yields a - # "'parquet_writer_options_builder' is not a type identifier" error. - # This is probably a bug in cython since a simpler CRTP example that - # has methods returning references to a child class seem to work. - # Calling these from the chunked options builder will fail at compile - # time, so this should be safe. - # NOTE: these two are never actually called from libcudf. Instead these - # properties are set in the options after calling build(), so perhaps - # they can be removed. - BuilderT& partitions( - vector[cudf_io_types.partition_info] partitions - ) except + - BuilderT& column_chunks_file_paths( - vector[string] column_chunks_file_paths - ) except + OptionsT build() except + cdef cppclass parquet_writer_options_builder( @@ -190,6 +174,12 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: cudf_io_types.sink_info sink_, cudf_table_view.table_view table_ ) except + + parquet_writer_options_builder& partitions( + vector[cudf_io_types.partition_info] partitions + ) except + + parquet_writer_options_builder& column_chunks_file_paths( + vector[string] column_chunks_file_paths + ) except + cdef unique_ptr[vector[uint8_t]] write_parquet( parquet_writer_options args From 49e2a565ffb85479589406f622c74116d7f891c7 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Tue, 11 Jun 2024 20:27:54 -0400 Subject: [PATCH 083/340] Support large strings in cudf::io::text::multibyte_split (#15947) Replaces int32 type used for building offsets in `cudf::io::text::multibyte_split()` to use the offsetalator instead. This allows creating large strings columns from input text files. 
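As a rough sketch of the user-visible effect (assuming the Python-level
`cudf.read_text` wrapper, which is backed by multibyte_split; the file name
is illustrative):

```python
import cudf

# Each record in the (hypothetical) input file ends with "\n". With int32
# offsets the output strings column was capped at ~2 GB of characters;
# building the offsets through the offsetalator lifts that limit.
records = cudf.read_text("records.txt", delimiter="\n")
```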
Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Mark Harris (https://github.com/harrism) - Bradley Dice (https://github.com/bdice) - Karthikeyan (https://github.com/karthikeyann) URL: https://github.com/rapidsai/cudf/pull/15947 --- cpp/src/io/text/multibyte_split.cu | 38 ++++++++++++++++-------------- 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu index 976d735e010..9c406369068 100644 --- a/cpp/src/io/text/multibyte_split.cu +++ b/cpp/src/io/text/multibyte_split.cu @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -30,6 +31,7 @@ #include #include #include +#include #include #include @@ -518,32 +520,37 @@ std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source bool const insert_end = not(last_row_offset.has_value() or (global_offsets.size() > 0 and global_offsets.back_element(stream) == chunk_offset)); - rmm::device_uvector offsets{ - global_offsets.size() + insert_begin + insert_end, stream, mr}; - if (insert_begin) { offsets.set_element_to_zero_async(0, stream); } - if (insert_end) { - offsets.set_element(offsets.size() - 1, chunk_offset - *first_row_offset, stream); - } + auto const chars_bytes = chunk_offset - *first_row_offset; + auto offsets = cudf::strings::detail::create_offsets_child_column( + chars_bytes, global_offsets.size() + insert_begin + insert_end, stream, mr); + auto offsets_itr = + cudf::detail::offsetalator_factory::make_output_iterator(offsets->mutable_view()); + auto set_offset_value = [offsets_itr, stream](size_type index, int64_t value) { + cudf::detail::device_single_thread( + [offsets_itr, index, value] __device__() mutable { offsets_itr[index] = value; }, stream); + }; + if (insert_begin) { set_offset_value(0, 0); } + if (insert_end) { set_offset_value(offsets->size() - 1, chars_bytes); } thrust::transform(rmm::exec_policy(stream), global_offsets.begin(), global_offsets.end(), - offsets.begin() + insert_begin, - cuda::proclaim_return_type( + offsets_itr + insert_begin, + cuda::proclaim_return_type( [baseline = *first_row_offset] __device__(byte_offset global_offset) { - return static_cast(global_offset - baseline); + return (global_offset - baseline); })); - auto string_count = offsets.size() - 1; + auto string_count = offsets->size() - 1; if (strip_delimiters) { auto it = cudf::detail::make_counting_transform_iterator( 0, cuda::proclaim_return_type>( - [ofs = offsets.data(), + [ofs = cudf::detail::offsetalator_factory::make_input_iterator(offsets->view()), chars = chars.data(), delim_size = static_cast(delimiter.size()), last_row = static_cast(string_count) - 1, insert_end] __device__(size_type row) { auto const begin = ofs[row]; - auto const len = ofs[row + 1] - begin; + auto const len = static_cast(ofs[row + 1] - begin); if (row == last_row && insert_end) { return thrust::make_pair(chars + begin, len); } else { @@ -552,12 +559,7 @@ std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source })); return cudf::strings::detail::make_strings_column(it, it + string_count, stream, mr); } else { - return cudf::make_strings_column( - string_count, - std::make_unique(std::move(offsets), rmm::device_buffer{}, 0), - chars.release(), - 0, - {}); + return cudf::make_strings_column(string_count, std::move(offsets), chars.release(), 0, {}); } } From d2cd1d4411e1a16f5c989efff07643ca3411f8ab Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Tue, 11 Jun 2024 20:28:40 -0400 
Subject: [PATCH 084/340] Migrate lists/combine to pylibcudf (#15928) Part of #15162. concatenate_rows, concatenate_list_elements Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - Thomas Li (https://github.com/lithomas1) URL: https://github.com/rapidsai/cudf/pull/15928 --- python/cudf/cudf/_lib/lists.pyx | 46 ++++---------- python/cudf/cudf/_lib/pylibcudf/lists.pxd | 7 +++ python/cudf/cudf/_lib/pylibcudf/lists.pyx | 61 +++++++++++++++++++ .../cudf/cudf/pylibcudf_tests/test_lists.py | 46 ++++++++++++++ 4 files changed, 127 insertions(+), 33 deletions(-) create mode 100644 python/cudf/cudf/pylibcudf_tests/test_lists.py diff --git a/python/cudf/cudf/_lib/lists.pyx b/python/cudf/cudf/_lib/lists.pyx index 656d92c1a4b..5d406f5c85f 100644 --- a/python/cudf/cudf/_lib/lists.pyx +++ b/python/cudf/cudf/_lib/lists.pyx @@ -9,11 +9,6 @@ from libcpp.utility cimport move from cudf._lib.column cimport Column from cudf._lib.pylibcudf.libcudf.column.column cimport column from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view -from cudf._lib.pylibcudf.libcudf.lists.combine cimport ( - concatenate_list_elements as cpp_concatenate_list_elements, - concatenate_null_policy, - concatenate_rows as cpp_concatenate_rows, -) from cudf._lib.pylibcudf.libcudf.lists.contains cimport ( contains, index_of as cpp_index_of, @@ -32,7 +27,6 @@ from cudf._lib.pylibcudf.libcudf.lists.stream_compaction cimport ( distinct as cpp_distinct, ) from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar -from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view from cudf._lib.pylibcudf.libcudf.types cimport ( nan_equality, null_equality, @@ -41,10 +35,7 @@ from cudf._lib.pylibcudf.libcudf.types cimport ( size_type, ) from cudf._lib.scalar cimport DeviceScalar -from cudf._lib.utils cimport ( - columns_from_pylibcudf_table, - table_view_from_columns, -) +from cudf._lib.utils cimport columns_from_pylibcudf_table from cudf._lib import pylibcudf @@ -223,31 +214,20 @@ def index_of_column(Column col, Column search_keys): @acquire_spill_lock() def concatenate_rows(list source_columns): - cdef unique_ptr[column] c_result - - cdef table_view c_table_view = table_view_from_columns(source_columns) - - with nogil: - c_result = move(cpp_concatenate_rows( - c_table_view, - )) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf( + pylibcudf.lists.concatenate_rows( + pylibcudf.Table([ + c.to_pylibcudf(mode="read") for c in source_columns + ]) + ) + ) @acquire_spill_lock() def concatenate_list_elements(Column input_column, dropna=False): - cdef concatenate_null_policy policy = ( - concatenate_null_policy.IGNORE if dropna - else concatenate_null_policy.NULLIFY_OUTPUT_ROW + return Column.from_pylibcudf( + pylibcudf.lists.concatenate_list_elements( + input_column.to_pylibcudf(mode="read"), + dropna, + ) ) - cdef column_view c_input = input_column.view() - cdef unique_ptr[column] c_result - - with nogil: - c_result = move(cpp_concatenate_list_elements( - c_input, - policy - )) - - return Column.from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/_lib/pylibcudf/lists.pxd b/python/cudf/cudf/_lib/pylibcudf/lists.pxd index b780d299977..2d2a5b2a9ea 100644 --- a/python/cudf/cudf/_lib/pylibcudf/lists.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/lists.pxd @@ -1,8 +1,15 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
+from libcpp cimport bool + from cudf._lib.pylibcudf.libcudf.types cimport size_type +from .column cimport Column from .table cimport Table cpdef Table explode_outer(Table, size_type explode_column_idx) + +cpdef Column concatenate_rows(Table) + +cpdef Column concatenate_list_elements(Column, bool dropna) diff --git a/python/cudf/cudf/_lib/pylibcudf/lists.pyx b/python/cudf/cudf/_lib/pylibcudf/lists.pyx index 654f39742b6..069c9da31c2 100644 --- a/python/cudf/cudf/_lib/pylibcudf/lists.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/lists.pyx @@ -1,12 +1,20 @@ # Copyright (c) 2024, NVIDIA CORPORATION. +from libcpp cimport bool from libcpp.memory cimport unique_ptr from libcpp.utility cimport move +from cudf._lib.pylibcudf.libcudf.column.column cimport column from cudf._lib.pylibcudf.libcudf.lists cimport explode as cpp_explode +from cudf._lib.pylibcudf.libcudf.lists.combine cimport ( + concatenate_list_elements as cpp_concatenate_list_elements, + concatenate_null_policy, + concatenate_rows as cpp_concatenate_rows, +) from cudf._lib.pylibcudf.libcudf.table.table cimport table from cudf._lib.pylibcudf.libcudf.types cimport size_type +from .column cimport Column from .table cimport Table @@ -33,3 +41,56 @@ cpdef Table explode_outer(Table input, size_type explode_column_idx): c_result = move(cpp_explode.explode_outer(input.view(), explode_column_idx)) return Table.from_libcudf(move(c_result)) + + +cpdef Column concatenate_rows(Table input): + """Concatenate multiple lists columns into a single lists column row-wise. + + Parameters + ---------- + input : Table + The input table + + Returns + ------- + Table + A new Column of concatenated rows + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move(cpp_concatenate_rows(input.view())) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column concatenate_list_elements(Column input, bool dropna): + """Concatenate multiple lists on the same row into a single list. + + Parameters + ---------- + input : Column + The input column + + Returns + ------- + Column + A new Column of concatenated list elements + dropna : bool + If true, null list elements will be ignored + from concatenation. Otherwise any input null values will result in + the corresponding output row being set to null. + """ + cdef concatenate_null_policy null_policy = ( + concatenate_null_policy.IGNORE if dropna + else concatenate_null_policy.NULLIFY_OUTPUT_ROW + ) + cdef unique_ptr[column] c_result + + with nogil: + c_result = move(cpp_concatenate_list_elements( + input.view(), + null_policy, + )) + + return Column.from_libcudf(move(c_result)) diff --git a/python/cudf/cudf/pylibcudf_tests/test_lists.py b/python/cudf/cudf/pylibcudf_tests/test_lists.py new file mode 100644 index 00000000000..b21af8ea11c --- /dev/null +++ b/python/cudf/cudf/pylibcudf_tests/test_lists.py @@ -0,0 +1,46 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+
+import pyarrow as pa
+import pytest
+from utils import assert_column_eq
+
+from cudf._lib import pylibcudf as plc
+
+
+def test_concatenate_rows():
+    test_data = [[[0, 1], [2], [5], [6, 7]], [[8], [9], [], [13, 14, 15]]]
+
+    arrow_tbl = pa.Table.from_arrays(test_data, names=["a", "b"])
+    plc_tbl = plc.interop.from_arrow(arrow_tbl)
+
+    res = plc.lists.concatenate_rows(plc_tbl)
+
+    expect = pa.array([pair[0] + pair[1] for pair in zip(*test_data)])
+
+    assert_column_eq(expect, res)
+
+
+@pytest.mark.parametrize(
+    "test_data, dropna, expected",
+    [
+        (
+            [[[1, 2], [3, 4], [5]], [[6], None, [7, 8, 9]]],
+            False,
+            [[1, 2, 3, 4, 5], None],
+        ),
+        (
+            [[[1, 2], [3, 4], [5, None]], [[6], [None], [7, 8, 9]]],
+            True,
+            [[1, 2, 3, 4, 5, None], [6, None, 7, 8, 9]],
+        ),
+    ],
+)
+def test_concatenate_list_elements(test_data, dropna, expected):
+    arr = pa.array(test_data)
+    plc_column = plc.interop.from_arrow(arr)
+
+    res = plc.lists.concatenate_list_elements(plc_column, dropna)
+
+    expect = pa.array(expected)
+
+    assert_column_eq(expect, res)

From f7ba6ab47ac994e6a1363119c01eee5dd6304181 Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic
Date: Tue, 11 Jun 2024 17:47:19 -0700
Subject: [PATCH 085/340] Pinned vector factory that uses the global pool
 (#15895)

closes https://github.com/rapidsai/cudf/issues/15612

Expanded the set of vector factories to cover pinned vectors. The
functions return `cudf::detail::host_vector`, which uses a type-erased
allocator, allowing us to utilize the runtime-configurable global
pinned (previously host) resource.
The `pinned_host_vector` type has been removed as it can only support
non-pooled pinned allocations. Its use is now replaced with
`cudf::detail::host_vector`.
Moved the global host (now pinned) resource out of cuIO and changed
the type to host_device. User-specified resources are now required to
allocate device-accessible memory. The name has been changed to pinned
to reflect the new requirement.
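A short sketch of the resulting workflow (illustration only; apart from the `set_pinned_memory_resource`/`get_pinned_memory_resource` and `make_pinned_vector_sync` declarations added in this patch, the names below are assumptions):

```cpp
// Swap in a user-provided pinned resource, allocate a pinned host_vector
// through it, then restore the previous resource.
#include <cudf/detail/utilities/vector_factories.hpp>
#include <cudf/utilities/default_stream.hpp>
#include <cudf/utilities/pinned_memory.hpp>

#include <rmm/mr/pinned_host_memory_resource.hpp>

void pinned_mr_round_trip()
{
  // The resource ref is non-owning and must now be device-accessible
  // (host_device_async_resource_ref), so the resource itself has to
  // outlive all cudf calls that may allocate pinned memory.
  static rmm::mr::pinned_host_memory_resource user_mr{};
  auto const previous = cudf::set_pinned_memory_resource(user_mr);

  // This allocation is served by user_mr via the type-erased allocator.
  auto staging =
    cudf::detail::make_pinned_vector_sync<char>(4096, cudf::get_default_stream());

  cudf::set_pinned_memory_resource(previous);  // restore the prior resource
}
```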
Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Alessandro Bellina (https://github.com/abellina) - Yunsong Wang (https://github.com/PointKernel) - Mark Harris (https://github.com/harrism) - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/15895 --- cpp/CMakeLists.txt | 1 + cpp/benchmarks/fixture/nvbench_fixture.hpp | 13 +- cpp/benchmarks/io/cuio_common.cpp | 12 + cpp/benchmarks/io/cuio_common.hpp | 4 +- .../io/parquet/parquet_reader_multithread.cpp | 2 +- cpp/benchmarks/io/text/multibyte_split.cpp | 10 +- .../{rmm_host_vector.hpp => host_vector.hpp} | 18 +- .../detail/utilities/pinned_host_vector.hpp | 216 ------------------ .../detail/utilities/vector_factories.hpp | 38 ++- cpp/include/cudf/io/memory_resource.hpp | 65 ------ cpp/include/cudf/utilities/pinned_memory.hpp | 58 +++++ cpp/src/io/csv/reader_impl.cu | 1 + cpp/src/io/orc/reader_impl_chunking.cu | 1 + cpp/src/io/orc/writer_impl.cu | 5 +- cpp/src/io/parquet/reader_impl_helpers.cpp | 2 + cpp/src/io/parquet/writer_impl.cu | 3 +- cpp/src/io/text/bgzip_data_chunk_source.cu | 16 +- .../io/text/data_chunk_source_factories.cpp | 51 ++--- cpp/src/io/utilities/config_utils.cpp | 214 +---------------- cpp/src/io/utilities/hostdevice_vector.hpp | 9 +- cpp/src/utilities/pinned_memory.cpp | 216 ++++++++++++++++++ cpp/tests/CMakeLists.txt | 5 +- cpp/tests/io/json_test.cpp | 6 +- .../utilities_tests/io_utilities_tests.cpp | 45 ---- .../utilities_tests/pinned_memory_tests.cpp | 65 ++++++ .../java/ai/rapids/cudf/PinnedMemoryPool.java | 12 +- java/src/main/java/ai/rapids/cudf/Rmm.java | 2 +- java/src/main/native/src/RmmJni.cpp | 34 +-- 28 files changed, 487 insertions(+), 637 deletions(-) rename cpp/include/cudf/detail/utilities/{rmm_host_vector.hpp => host_vector.hpp} (93%) delete mode 100644 cpp/include/cudf/detail/utilities/pinned_host_vector.hpp delete mode 100644 cpp/include/cudf/io/memory_resource.hpp create mode 100644 cpp/include/cudf/utilities/pinned_memory.hpp create mode 100644 cpp/src/utilities/pinned_memory.cpp create mode 100644 cpp/tests/utilities_tests/pinned_memory_tests.cpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index ca85996b990..aab0a9b2d49 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -664,6 +664,7 @@ add_library( src/utilities/default_stream.cpp src/utilities/linked_column.cpp src/utilities/logger.cpp + src/utilities/pinned_memory.cpp src/utilities/stacktrace.cpp src/utilities/stream_pool.cpp src/utilities/traits.cpp diff --git a/cpp/benchmarks/fixture/nvbench_fixture.hpp b/cpp/benchmarks/fixture/nvbench_fixture.hpp index ebcbcb17e98..df1492690bb 100644 --- a/cpp/benchmarks/fixture/nvbench_fixture.hpp +++ b/cpp/benchmarks/fixture/nvbench_fixture.hpp @@ -15,8 +15,8 @@ */ #pragma once -#include #include +#include #include #include @@ -81,17 +81,18 @@ struct nvbench_base_fixture { "\nExpecting: cuda, pool, async, arena, managed, or managed_pool"); } - inline rmm::host_async_resource_ref make_cuio_host_pinned() + inline rmm::host_device_async_resource_ref make_cuio_host_pinned() { static std::shared_ptr mr = std::make_shared(); return *mr; } - inline rmm::host_async_resource_ref create_cuio_host_memory_resource(std::string const& mode) + inline rmm::host_device_async_resource_ref create_cuio_host_memory_resource( + std::string const& mode) { if (mode == "pinned") return make_cuio_host_pinned(); - if (mode == "pinned_pool") return cudf::io::get_host_memory_resource(); + if (mode == "pinned_pool") return cudf::get_pinned_memory_resource(); 
CUDF_FAIL("Unknown cuio_host_mem parameter: " + mode + "\nExpecting: pinned or pinned_pool"); } @@ -112,14 +113,14 @@ struct nvbench_base_fixture { rmm::mr::set_current_device_resource(mr.get()); std::cout << "RMM memory resource = " << rmm_mode << "\n"; - cudf::io::set_host_memory_resource(create_cuio_host_memory_resource(cuio_host_mode)); + cudf::set_pinned_memory_resource(create_cuio_host_memory_resource(cuio_host_mode)); std::cout << "CUIO host memory resource = " << cuio_host_mode << "\n"; } ~nvbench_base_fixture() { // Ensure the the pool is freed before the CUDA context is destroyed: - cudf::io::set_host_memory_resource(this->make_cuio_host_pinned()); + cudf::set_pinned_memory_resource(this->make_cuio_host_pinned()); } std::shared_ptr mr; diff --git a/cpp/benchmarks/io/cuio_common.cpp b/cpp/benchmarks/io/cuio_common.cpp index 37ced8ea703..645994f3f0d 100644 --- a/cpp/benchmarks/io/cuio_common.cpp +++ b/cpp/benchmarks/io/cuio_common.cpp @@ -19,6 +19,9 @@ #include #include +#include +#include + #include #include @@ -28,6 +31,14 @@ temp_directory const cuio_source_sink_pair::tmpdir{"cudf_gbench"}; +// Don't use cudf's pinned pool for the source data +rmm::host_async_resource_ref pinned_memory_resource() +{ + static rmm::mr::pinned_host_memory_resource mr = rmm::mr::pinned_host_memory_resource{}; + + return mr; +} + std::string random_file_in_dir(std::string const& dir_path) { // `mkstemp` modifies the template in place @@ -41,6 +52,7 @@ std::string random_file_in_dir(std::string const& dir_path) cuio_source_sink_pair::cuio_source_sink_pair(io_type type) : type{type}, + pinned_buffer({pinned_memory_resource(), cudf::get_default_stream()}), d_buffer{0, cudf::get_default_stream()}, file_name{random_file_in_dir(tmpdir.path())}, void_sink{cudf::io::data_sink::create()} diff --git a/cpp/benchmarks/io/cuio_common.hpp b/cpp/benchmarks/io/cuio_common.hpp index d4f39a5f243..64d6021cf50 100644 --- a/cpp/benchmarks/io/cuio_common.hpp +++ b/cpp/benchmarks/io/cuio_common.hpp @@ -18,7 +18,7 @@ #include -#include +#include #include #include @@ -79,7 +79,7 @@ class cuio_source_sink_pair { io_type const type; std::vector h_buffer; - cudf::detail::pinned_host_vector pinned_buffer; + cudf::detail::host_vector pinned_buffer; rmm::device_uvector d_buffer; std::string const file_name; std::unique_ptr void_sink; diff --git a/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp b/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp index a67d1932951..b4c8ed78ed8 100644 --- a/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp +++ b/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp @@ -20,9 +20,9 @@ #include #include -#include #include #include +#include #include #include diff --git a/cpp/benchmarks/io/text/multibyte_split.cpp b/cpp/benchmarks/io/text/multibyte_split.cpp index b5d855d8881..67705863d41 100644 --- a/cpp/benchmarks/io/text/multibyte_split.cpp +++ b/cpp/benchmarks/io/text/multibyte_split.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -22,7 +22,6 @@ #include #include -#include #include #include #include @@ -132,9 +131,10 @@ static void bench_multibyte_split(nvbench::state& state, auto const delim_factor = static_cast(delim_percent) / 100; std::unique_ptr datasource; - auto device_input = create_random_input(file_size_approx, delim_factor, 0.05, delim); - auto host_input = std::vector{}; - auto host_pinned_input = cudf::detail::pinned_host_vector{}; + auto device_input = create_random_input(file_size_approx, delim_factor, 0.05, delim); + auto host_input = std::vector{}; + auto host_pinned_input = + cudf::detail::make_pinned_vector_async(0, cudf::get_default_stream()); if (source_type != data_chunk_source_type::device && source_type != data_chunk_source_type::host_pinned) { diff --git a/cpp/include/cudf/detail/utilities/rmm_host_vector.hpp b/cpp/include/cudf/detail/utilities/host_vector.hpp similarity index 93% rename from cpp/include/cudf/detail/utilities/rmm_host_vector.hpp rename to cpp/include/cudf/detail/utilities/host_vector.hpp index 6901a19473e..6a115177ab5 100644 --- a/cpp/include/cudf/detail/utilities/rmm_host_vector.hpp +++ b/cpp/include/cudf/detail/utilities/host_vector.hpp @@ -19,6 +19,7 @@ #include #include +#include #include #include @@ -32,8 +33,6 @@ namespace cudf::detail { /*! \p rmm_host_allocator is a CUDA-specific host memory allocator * that employs \c a `rmm::host_async_resource_ref` for allocation. * - * This implementation is ported from pinned_host_vector in cudf. - * * \see https://en.cppreference.com/w/cpp/memory/allocator */ template @@ -42,8 +41,6 @@ class rmm_host_allocator; /*! \p rmm_host_allocator is a CUDA-specific host memory allocator * that employs \c an `cudf::host_async_resource_ref` for allocation. * - * This implementation is ported from pinned_host_vector in cudf. - * * \see https://en.cppreference.com/w/cpp/memory/allocator */ template <> @@ -70,8 +67,7 @@ class rmm_host_allocator { * The \p rmm_host_allocator provides an interface for host memory allocation through the user * provided \c `rmm::host_async_resource_ref`. The \p rmm_host_allocator does not take ownership of * this reference and therefore it is the user's responsibility to ensure its lifetime for the - * duration of the lifetime of the \p rmm_host_allocator. This implementation is ported from - * pinned_host_vector in cudf. + * duration of the lifetime of the \p rmm_host_allocator. 
* * \see https://en.cppreference.com/w/cpp/memory/allocator */ @@ -121,8 +117,12 @@ class rmm_host_allocator { inline pointer allocate(size_type cnt) { if (cnt > this->max_size()) { throw std::bad_alloc(); } // end if - return static_cast( - mr.allocate_async(cnt * sizeof(value_type), rmm::RMM_DEFAULT_HOST_ALIGNMENT, stream)); + auto const result = + mr.allocate_async(cnt * sizeof(value_type), rmm::RMM_DEFAULT_HOST_ALIGNMENT, stream); + // Synchronize to ensure the memory is allocated before thrust::host_vector initialization + // TODO: replace thrust::host_vector with a type that does not require synchronization + stream.synchronize(); + return static_cast(result); } /** @@ -182,6 +182,6 @@ class rmm_host_allocator { * @brief A vector class with rmm host memory allocator */ template -using rmm_host_vector = thrust::host_vector>; +using host_vector = thrust::host_vector>; } // namespace cudf::detail diff --git a/cpp/include/cudf/detail/utilities/pinned_host_vector.hpp b/cpp/include/cudf/detail/utilities/pinned_host_vector.hpp deleted file mode 100644 index c22b6a6ba15..00000000000 --- a/cpp/include/cudf/detail/utilities/pinned_host_vector.hpp +++ /dev/null @@ -1,216 +0,0 @@ -/* - * Copyright (c) 2008-2024, NVIDIA CORPORATION - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -#include - -#include -#include -#include // for bad_alloc - -namespace cudf::detail { - -/*! \p pinned_allocator is a CUDA-specific host memory allocator - * that employs \c cudaMallocHost for allocation. - * - * This implementation is ported from the experimental/pinned_allocator - * that Thrust used to provide. - * - * \see https://en.cppreference.com/w/cpp/memory/allocator - */ -template -class pinned_allocator; - -/*! \p pinned_allocator is a CUDA-specific host memory allocator - * that employs \c cudaMallocHost for allocation. - * - * This implementation is ported from the experimental/pinned_allocator - * that Thrust used to provide. - * - * \see https://en.cppreference.com/w/cpp/memory/allocator - */ -template <> -class pinned_allocator { - public: - using value_type = void; ///< The type of the elements in the allocator - using pointer = void*; ///< The type returned by address() / allocate() - using const_pointer = void const*; ///< The type returned by address() - using size_type = std::size_t; ///< The type used for the size of the allocation - using difference_type = std::ptrdiff_t; ///< The type of the distance between two pointers - - /** - * @brief converts a `pinned_allocator` to `pinned_allocator` - */ - template - struct rebind { - using other = pinned_allocator; ///< The rebound type - }; -}; - -/*! \p pinned_allocator is a CUDA-specific host memory allocator - * that employs \c cudaMallocHost for allocation. - * - * This implementation is ported from the experimental/pinned_allocator - * that Thrust used to provide. 
- * - * \see https://en.cppreference.com/w/cpp/memory/allocator - */ -template -class pinned_allocator { - public: - using value_type = T; ///< The type of the elements in the allocator - using pointer = T*; ///< The type returned by address() / allocate() - using const_pointer = T const*; ///< The type returned by address() - using reference = T&; ///< The parameter type for address() - using const_reference = T const&; ///< The parameter type for address() - using size_type = std::size_t; ///< The type used for the size of the allocation - using difference_type = std::ptrdiff_t; ///< The type of the distance between two pointers - - /** - * @brief converts a `pinned_allocator` to `pinned_allocator` - */ - template - struct rebind { - using other = pinned_allocator; ///< The rebound type - }; - - /** - * @brief pinned_allocator's null constructor does nothing. - */ - __host__ __device__ inline pinned_allocator() {} - - /** - * @brief pinned_allocator's null destructor does nothing. - */ - __host__ __device__ inline ~pinned_allocator() {} - - /** - * @brief pinned_allocator's copy constructor does nothing. - */ - __host__ __device__ inline pinned_allocator(pinned_allocator const&) {} - - /** - * @brief pinned_allocator's copy constructor does nothing. - * - * This version of pinned_allocator's copy constructor - * is templated on the \c value_type of the pinned_allocator - * to copy from. It is provided merely for convenience; it - * does nothing. - */ - template - __host__ __device__ inline pinned_allocator(pinned_allocator const&) - { - } - - /** - * @brief This method returns the address of a \c reference of - * interest. - * - * @param r The \c reference of interest. - * @return \c r's address. - */ - __host__ __device__ inline pointer address(reference r) { return &r; } - - /** - * @brief This method returns the address of a \c const_reference - * of interest. - * - * @param r The \c const_reference of interest. - * @return \c r's address. - */ - __host__ __device__ inline const_pointer address(const_reference r) { return &r; } - - /** - * @brief This method allocates storage for objects in pinned host - * memory. - * - * @param cnt The number of objects to allocate. - * @return a \c pointer to the newly allocated objects. - * @note The second parameter to this function is meant as a - * hint pointer to a nearby memory location, but is - * not used by this allocator. - * @note This method does not invoke \p value_type's constructor. - * It is the responsibility of the caller to initialize the - * objects at the returned \c pointer. - */ - __host__ inline pointer allocate(size_type cnt, const_pointer /*hint*/ = 0) - { - if (cnt > this->max_size()) { throw std::bad_alloc(); } // end if - - pointer result(0); - CUDF_CUDA_TRY(cudaMallocHost(reinterpret_cast(&result), cnt * sizeof(value_type))); - return result; - } - - /** - * @brief This method deallocates pinned host memory previously allocated - * with this \c pinned_allocator. - * - * @param p A \c pointer to the previously allocated memory. - * @note The second parameter is the number of objects previously allocated - * but is ignored by this allocator. - * @note This method does not invoke \p value_type's destructor. - * It is the responsibility of the caller to destroy - * the objects stored at \p p. 
- */ - __host__ inline void deallocate(pointer p, size_type /*cnt*/) - { - auto dealloc_worked = cudaFreeHost(p); - (void)dealloc_worked; - assert(dealloc_worked == cudaSuccess); - } - - /** - * @brief This method returns the maximum size of the \c cnt parameter - * accepted by the \p allocate() method. - * - * @return The maximum number of objects that may be allocated - * by a single call to \p allocate(). - */ - inline size_type max_size() const { return (std::numeric_limits::max)() / sizeof(T); } - - /** - * @brief This method tests this \p pinned_allocator for equality to - * another. - * - * @param x The other \p pinned_allocator of interest. - * @return This method always returns \c true. - */ - __host__ __device__ inline bool operator==(pinned_allocator const& x) const { return true; } - - /** - * @brief This method tests this \p pinned_allocator for inequality - * to another. - * - * @param x The other \p pinned_allocator of interest. - * @return This method always returns \c false. - */ - __host__ __device__ inline bool operator!=(pinned_allocator const& x) const - { - return !operator==(x); - } -}; - -/** - * @brief A vector class with pinned host memory allocator - */ -template -using pinned_host_vector = thrust::host_vector>; - -} // namespace cudf::detail diff --git a/cpp/include/cudf/detail/utilities/vector_factories.hpp b/cpp/include/cudf/detail/utilities/vector_factories.hpp index 293a4096c57..20cb55bb1c7 100644 --- a/cpp/include/cudf/detail/utilities/vector_factories.hpp +++ b/cpp/include/cudf/detail/utilities/vector_factories.hpp @@ -21,8 +21,10 @@ * @file vector_factories.hpp */ +#include #include #include +#include #include #include @@ -380,7 +382,7 @@ thrust::host_vector make_host_vector_async(device_span v, rmm::cuda_ * @brief Asynchronously construct a `std::vector` containing a copy of data from a device * container * - * @note This function synchronizes `stream`. + * @note This function does not synchronize `stream`. * * @tparam Container The type of the container to copy from * @tparam T The type of the data to copy @@ -439,6 +441,40 @@ thrust::host_vector make_host_vector_sync( return make_host_vector_sync(device_span{c}, stream); } +/** + * @brief Asynchronously construct a pinned `cudf::detail::host_vector` of the given size + * + * @note This function may not synchronize `stream`. + * + * @tparam T The type of the vector data + * @param size The number of elements in the created vector + * @param stream The stream on which to allocate memory + * @return A host_vector of the given size + */ +template +host_vector make_pinned_vector_async(size_t size, rmm::cuda_stream_view stream) +{ + return host_vector(size, {cudf::get_pinned_memory_resource(), stream}); +} + +/** + * @brief Synchronously construct a pinned `cudf::detail::host_vector` of the given size + * + * @note This function synchronizes `stream`. 
+ * + * @tparam T The type of the vector data + * @param size The number of elements in the created vector + * @param stream The stream on which to allocate memory + * @return A host_vector of the given size + */ +template +host_vector make_pinned_vector_sync(size_t size, rmm::cuda_stream_view stream) +{ + auto result = make_pinned_vector_async(size, stream); + stream.synchronize(); + return result; +} + } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/io/memory_resource.hpp b/cpp/include/cudf/io/memory_resource.hpp deleted file mode 100644 index a36e220ae7b..00000000000 --- a/cpp/include/cudf/io/memory_resource.hpp +++ /dev/null @@ -1,65 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -#include - -namespace cudf::io { - -/** - * @brief Set the rmm resource to be used for host memory allocations by - * cudf::detail::hostdevice_vector - * - * hostdevice_vector is a utility class that uses a pair of host and device-side buffers for - * bouncing state between the cpu and the gpu. The resource set with this function (typically a - * pinned memory allocator) is what it uses to allocate space for it's host-side buffer. - * - * @param mr The rmm resource to be used for host-side allocations - * @return The previous resource that was in use - */ -rmm::host_async_resource_ref set_host_memory_resource(rmm::host_async_resource_ref mr); - -/** - * @brief Get the rmm resource being used for host memory allocations by - * cudf::detail::hostdevice_vector - * - * @return The rmm resource used for host-side allocations - */ -rmm::host_async_resource_ref get_host_memory_resource(); - -/** - * @brief Options to configure the default host memory resource - */ -struct host_mr_options { - std::optional pool_size; ///< The size of the pool to use for the default host memory - ///< resource. If not set, the default pool size is used. -}; - -/** - * @brief Configure the size of the default host memory resource. - * - * @throws cudf::logic_error if called after the default host memory resource has been created - * - * @param opts Options to configure the default host memory resource - * @return True if this call successfully configured the host memory resource, false if a - * a resource was already configured. - */ -bool config_default_host_memory_resource(host_mr_options const& opts); - -} // namespace cudf::io diff --git a/cpp/include/cudf/utilities/pinned_memory.hpp b/cpp/include/cudf/utilities/pinned_memory.hpp new file mode 100644 index 00000000000..b423eab6d38 --- /dev/null +++ b/cpp/include/cudf/utilities/pinned_memory.hpp @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include + +namespace cudf { + +/** + * @brief Set the rmm resource to be used for pinned memory allocations. + * + * @param mr The rmm resource to be used for pinned allocations + * @return The previous resource that was in use + */ +rmm::host_device_async_resource_ref set_pinned_memory_resource( + rmm::host_device_async_resource_ref mr); + +/** + * @brief Get the rmm resource being used for pinned memory allocations. + * + * @return The rmm resource used for pinned allocations + */ +rmm::host_device_async_resource_ref get_pinned_memory_resource(); + +/** + * @brief Options to configure the default pinned memory resource + */ +struct pinned_mr_options { + std::optional pool_size; ///< The size of the pool to use for the default pinned memory + ///< resource. If not set, the default pool size is used. +}; + +/** + * @brief Configure the size of the default pinned memory resource. + * + * @param opts Options to configure the default pinned memory resource + * @return True if this call successfully configured the pinned memory resource, false if a + * a resource was already configured. + */ +bool config_default_pinned_memory_resource(pinned_mr_options const& opts); + +} // namespace cudf diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu index 5dee0c17a33..05faded651d 100644 --- a/cpp/src/io/csv/reader_impl.cu +++ b/cpp/src/io/csv/reader_impl.cu @@ -27,6 +27,7 @@ #include "io/utilities/parsing_utils.cuh" #include +#include #include #include #include diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index 5034aa14a95..43301826003 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -22,6 +22,7 @@ #include #include +#include #include #include diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index 344e216cdc8..e9e031a407a 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -27,7 +27,6 @@ #include #include #include -#include #include #include #include @@ -2339,7 +2338,7 @@ auto convert_table_to_orc_data(table_view const& input, std::move(streams), std::move(stripes), std::move(stripe_dicts.views), - cudf::detail::pinned_host_vector()}; + cudf::detail::make_pinned_vector_async(0, stream)}; } // Allocate intermediate output stream buffer @@ -2407,7 +2406,7 @@ auto convert_table_to_orc_data(table_view const& input, return max_stream_size; }(); - cudf::detail::pinned_host_vector bounce_buffer(max_out_stream_size); + auto bounce_buffer = cudf::detail::make_pinned_vector_async(max_out_stream_size, stream); auto intermediate_stats = gather_statistic_blobs(stats_freq, orc_table, segmentation, stream); diff --git a/cpp/src/io/parquet/reader_impl_helpers.cpp b/cpp/src/io/parquet/reader_impl_helpers.cpp index eb653c6b9ac..9de8a9e2719 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.cpp +++ b/cpp/src/io/parquet/reader_impl_helpers.cpp @@ -23,6 +23,8 @@ #include "ipc/Message_generated.h" #include "ipc/Schema_generated.h" +#include + #include #include diff --git 
a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 1dfced94f5b..6d466748c17 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -36,7 +36,6 @@ #include #include #include -#include #include #include #include @@ -2278,7 +2277,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, } auto bounce_buffer = - cudf::detail::pinned_host_vector(all_device_write ? 0 : max_write_size); + cudf::detail::make_pinned_vector_async(all_device_write ? 0 : max_write_size, stream); return std::tuple{std::move(agg_meta), std::move(pages), diff --git a/cpp/src/io/text/bgzip_data_chunk_source.cu b/cpp/src/io/text/bgzip_data_chunk_source.cu index faa09e586ab..0e3ce779089 100644 --- a/cpp/src/io/text/bgzip_data_chunk_source.cu +++ b/cpp/src/io/text/bgzip_data_chunk_source.cu @@ -19,8 +19,9 @@ #include "io/utilities/config_utils.hpp" #include +#include #include -#include +#include #include #include #include @@ -66,7 +67,7 @@ struct bgzip_nvcomp_transform_functor { class bgzip_data_chunk_reader : public data_chunk_reader { private: template - static void copy_to_device(cudf::detail::pinned_host_vector const& host, + static void copy_to_device(cudf::detail::host_vector const& host, rmm::device_uvector& device, rmm::cuda_stream_view stream) { @@ -84,9 +85,9 @@ class bgzip_data_chunk_reader : public data_chunk_reader { 1 << 16; // 64k offset allocation, resized on demand cudaEvent_t event; - cudf::detail::pinned_host_vector h_compressed_blocks; - cudf::detail::pinned_host_vector h_compressed_offsets; - cudf::detail::pinned_host_vector h_decompressed_offsets; + cudf::detail::host_vector h_compressed_blocks; + cudf::detail::host_vector h_compressed_offsets; + cudf::detail::host_vector h_decompressed_offsets; rmm::device_uvector d_compressed_blocks; rmm::device_uvector d_decompressed_blocks; rmm::device_uvector d_compressed_offsets; @@ -103,7 +104,10 @@ class bgzip_data_chunk_reader : public data_chunk_reader { bool is_decompressed{}; decompression_blocks(rmm::cuda_stream_view init_stream) - : d_compressed_blocks(0, init_stream), + : h_compressed_blocks{cudf::detail::make_pinned_vector_async(0, init_stream)}, + h_compressed_offsets{cudf::detail::make_pinned_vector_async(0, init_stream)}, + h_decompressed_offsets{cudf::detail::make_pinned_vector_async(0, init_stream)}, + d_compressed_blocks(0, init_stream), d_decompressed_blocks(0, init_stream), d_compressed_offsets(0, init_stream), d_decompressed_offsets(0, init_stream), diff --git a/cpp/src/io/text/data_chunk_source_factories.cpp b/cpp/src/io/text/data_chunk_source_factories.cpp index 9d1d0498ace..596ca3458c8 100644 --- a/cpp/src/io/text/data_chunk_source_factories.cpp +++ b/cpp/src/io/text/data_chunk_source_factories.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,10 +14,12 @@ * limitations under the License. */ +#include "cudf/utilities/default_stream.hpp" #include "io/text/device_data_chunks.hpp" #include -#include +#include +#include #include #include @@ -31,8 +33,15 @@ namespace cudf::io::text { namespace { struct host_ticket { - cudaEvent_t event; - cudf::detail::pinned_host_vector buffer; + cudaEvent_t event{}; // tracks the completion of the last device-to-host copy. 
+ cudf::detail::host_vector buffer; + + host_ticket() : buffer{cudf::detail::make_pinned_vector_sync(0, cudf::get_default_stream())} + { + cudaEventCreate(&event); + } + + ~host_ticket() { cudaEventDestroy(event); } }; /** @@ -43,20 +52,7 @@ class datasource_chunk_reader : public data_chunk_reader { constexpr static int num_tickets = 2; public: - datasource_chunk_reader(datasource* source) : _source(source) - { - // create an event to track the completion of the last device-to-host copy. - for (auto& ticket : _tickets) { - CUDF_CUDA_TRY(cudaEventCreate(&(ticket.event))); - } - } - - ~datasource_chunk_reader() override - { - for (auto& ticket : _tickets) { - CUDF_CUDA_TRY(cudaEventDestroy(ticket.event)); - } - } + datasource_chunk_reader(datasource* source) : _source(source) {} void skip_bytes(std::size_t size) override { @@ -84,7 +80,9 @@ class datasource_chunk_reader : public data_chunk_reader { CUDF_CUDA_TRY(cudaEventSynchronize(h_ticket.event)); // resize the host buffer as necessary to contain the requested number of bytes - if (h_ticket.buffer.size() < read_size) { h_ticket.buffer.resize(read_size); } + if (h_ticket.buffer.size() < read_size) { + h_ticket.buffer = cudf::detail::make_pinned_vector_sync(read_size, stream); + } _source->host_read(_offset, read_size, reinterpret_cast(h_ticket.buffer.data())); @@ -120,17 +118,6 @@ class istream_data_chunk_reader : public data_chunk_reader { istream_data_chunk_reader(std::unique_ptr datastream) : _datastream(std::move(datastream)) { - // create an event to track the completion of the last device-to-host copy. - for (auto& ticket : _tickets) { - CUDF_CUDA_TRY(cudaEventCreate(&(ticket.event))); - } - } - - ~istream_data_chunk_reader() override - { - for (auto& ticket : _tickets) { - CUDF_CUDA_TRY(cudaEventDestroy(ticket.event)); - } } void skip_bytes(std::size_t size) override { _datastream->ignore(size); }; @@ -148,7 +135,9 @@ class istream_data_chunk_reader : public data_chunk_reader { CUDF_CUDA_TRY(cudaEventSynchronize(h_ticket.event)); // resize the host buffer as necessary to contain the requested number of bytes - if (h_ticket.buffer.size() < read_size) { h_ticket.buffer.resize(read_size); } + if (h_ticket.buffer.size() < read_size) { + h_ticket.buffer = cudf::detail::make_pinned_vector_sync(read_size, stream); + } // read data from the host istream in to the pinned host memory buffer _datastream->read(h_ticket.buffer.data(), read_size); diff --git a/cpp/src/io/utilities/config_utils.cpp b/cpp/src/io/utilities/config_utils.cpp index dad1135e766..20ac89b4d53 100644 --- a/cpp/src/io/utilities/config_utils.cpp +++ b/cpp/src/io/utilities/config_utils.cpp @@ -16,22 +16,12 @@ #include "config_utils.hpp" -#include -#include #include -#include - -#include -#include -#include -#include #include #include -namespace cudf::io { - -namespace detail { +namespace cudf::io::detail { namespace cufile_integration { @@ -90,204 +80,4 @@ bool is_stable_enabled() { return is_all_enabled() or get_env_policy() == usage_ } // namespace nvcomp_integration -} // namespace detail - -namespace { -class fixed_pinned_pool_memory_resource { - using upstream_mr = rmm::mr::pinned_host_memory_resource; - using host_pooled_mr = rmm::mr::pool_memory_resource; - - private: - upstream_mr upstream_mr_{}; - size_t pool_size_{0}; - // Raw pointer to avoid a segfault when the pool is destroyed on exit - host_pooled_mr* pool_{nullptr}; - void* pool_begin_{nullptr}; - void* pool_end_{nullptr}; - cuda::stream_ref 
stream_{cudf::detail::global_cuda_stream_pool().get_stream().value()}; - - public: - fixed_pinned_pool_memory_resource(size_t size) - : pool_size_{size}, pool_{new host_pooled_mr(upstream_mr_, size, size)} - { - if (pool_size_ == 0) { return; } - - // Allocate full size from the pinned pool to figure out the beginning and end address - pool_begin_ = pool_->allocate_async(pool_size_, stream_); - pool_end_ = static_cast(static_cast(pool_begin_) + pool_size_); - pool_->deallocate_async(pool_begin_, pool_size_, stream_); - } - - void* do_allocate_async(std::size_t bytes, std::size_t alignment, cuda::stream_ref stream) - { - if (bytes <= pool_size_) { - try { - return pool_->allocate_async(bytes, alignment, stream); - } catch (...) { - // If the pool is exhausted, fall back to the upstream memory resource - } - } - - return upstream_mr_.allocate_async(bytes, alignment, stream); - } - - void do_deallocate_async(void* ptr, - std::size_t bytes, - std::size_t alignment, - cuda::stream_ref stream) noexcept - { - if (bytes <= pool_size_ && ptr >= pool_begin_ && ptr <= pool_end_) { - pool_->deallocate_async(ptr, bytes, alignment, stream); - } else { - upstream_mr_.deallocate_async(ptr, bytes, alignment, stream); - } - } - - void* allocate_async(std::size_t bytes, cuda::stream_ref stream) - { - return do_allocate_async(bytes, rmm::RMM_DEFAULT_HOST_ALIGNMENT, stream); - } - - void* allocate_async(std::size_t bytes, std::size_t alignment, cuda::stream_ref stream) - { - return do_allocate_async(bytes, alignment, stream); - } - - void* allocate(std::size_t bytes, std::size_t alignment = rmm::RMM_DEFAULT_HOST_ALIGNMENT) - { - auto const result = do_allocate_async(bytes, alignment, stream_); - stream_.wait(); - return result; - } - - void deallocate_async(void* ptr, std::size_t bytes, cuda::stream_ref stream) noexcept - { - return do_deallocate_async(ptr, bytes, rmm::RMM_DEFAULT_HOST_ALIGNMENT, stream); - } - - void deallocate_async(void* ptr, - std::size_t bytes, - std::size_t alignment, - cuda::stream_ref stream) noexcept - { - return do_deallocate_async(ptr, bytes, alignment, stream); - } - - void deallocate(void* ptr, - std::size_t bytes, - std::size_t alignment = rmm::RMM_DEFAULT_HOST_ALIGNMENT) noexcept - { - deallocate_async(ptr, bytes, alignment, stream_); - stream_.wait(); - } - - bool operator==(fixed_pinned_pool_memory_resource const& other) const - { - return pool_ == other.pool_ and stream_ == other.stream_; - } - - bool operator!=(fixed_pinned_pool_memory_resource const& other) const - { - return !operator==(other); - } - - [[maybe_unused]] friend void get_property(fixed_pinned_pool_memory_resource const&, - cuda::mr::device_accessible) noexcept - { - } - - [[maybe_unused]] friend void get_property(fixed_pinned_pool_memory_resource const&, - cuda::mr::host_accessible) noexcept - { - } -}; - -static_assert(cuda::mr::resource_with, - ""); - -} // namespace - -CUDF_EXPORT rmm::host_async_resource_ref& make_default_pinned_mr(std::optional config_size) -{ - static fixed_pinned_pool_memory_resource mr = [config_size]() { - auto const size = [&config_size]() -> size_t { - if (auto const env_val = getenv("LIBCUDF_PINNED_POOL_SIZE"); env_val != nullptr) { - return std::atol(env_val); - } - - if (config_size.has_value()) { return *config_size; } - - size_t free{}, total{}; - CUDF_CUDA_TRY(cudaMemGetInfo(&free, &total)); - // 0.5% of the total device memory, capped at 100MB - return std::min(total / 200, size_t{100} * 1024 * 1024); - }(); - - // rmm requires the pool size to be a multiple of 256 bytes - 
auto const aligned_size = (size + 255) & ~255; - CUDF_LOG_INFO("Pinned pool size = {}", aligned_size); - - // make the pool with max size equal to the initial size - return fixed_pinned_pool_memory_resource{aligned_size}; - }(); - - static rmm::host_async_resource_ref mr_ref{mr}; - return mr_ref; -} - -CUDF_EXPORT std::mutex& host_mr_mutex() -{ - static std::mutex map_lock; - return map_lock; -} - -// Must be called with the host_mr_mutex mutex held -CUDF_EXPORT rmm::host_async_resource_ref& make_host_mr(std::optional const& opts, - bool* did_configure = nullptr) -{ - static rmm::host_async_resource_ref* mr_ref = nullptr; - bool configured = false; - if (mr_ref == nullptr) { - configured = true; - mr_ref = &make_default_pinned_mr(opts ? opts->pool_size : std::nullopt); - } - - // If the user passed an out param to detect whether this call configured a resource - // set the result - if (did_configure != nullptr) { *did_configure = configured; } - - return *mr_ref; -} - -// Must be called with the host_mr_mutex mutex held -CUDF_EXPORT rmm::host_async_resource_ref& host_mr() -{ - static rmm::host_async_resource_ref mr_ref = make_host_mr(std::nullopt); - return mr_ref; -} - -rmm::host_async_resource_ref set_host_memory_resource(rmm::host_async_resource_ref mr) -{ - std::scoped_lock lock{host_mr_mutex()}; - auto last_mr = host_mr(); - host_mr() = mr; - return last_mr; -} - -rmm::host_async_resource_ref get_host_memory_resource() -{ - std::scoped_lock lock{host_mr_mutex()}; - return host_mr(); -} - -bool config_default_host_memory_resource(host_mr_options const& opts) -{ - std::scoped_lock lock{host_mr_mutex()}; - auto did_configure = false; - make_host_mr(opts, &did_configure); - return did_configure; -} - -} // namespace cudf::io +} // namespace cudf::io::detail diff --git a/cpp/src/io/utilities/hostdevice_vector.hpp b/cpp/src/io/utilities/hostdevice_vector.hpp index 0883ac3609f..1ae27a2f4ae 100644 --- a/cpp/src/io/utilities/hostdevice_vector.hpp +++ b/cpp/src/io/utilities/hostdevice_vector.hpp @@ -16,11 +16,10 @@ #pragma once -#include "config_utils.hpp" #include "hostdevice_span.hpp" -#include -#include +#include +#include #include #include #include @@ -53,7 +52,7 @@ class hostdevice_vector { } explicit hostdevice_vector(size_t initial_size, size_t max_size, rmm::cuda_stream_view stream) - : h_data({cudf::io::get_host_memory_resource(), stream}), d_data(max_size, stream) + : h_data{make_pinned_vector_async(0, stream)}, d_data(max_size, stream) { CUDF_EXPECTS(initial_size <= max_size, "initial_size cannot be larger than max_size"); @@ -173,7 +172,7 @@ class hostdevice_vector { } private: - cudf::detail::rmm_host_vector h_data; + cudf::detail::host_vector h_data; rmm::device_uvector d_data; }; diff --git a/cpp/src/utilities/pinned_memory.cpp b/cpp/src/utilities/pinned_memory.cpp new file mode 100644 index 00000000000..5d2e3ac332a --- /dev/null +++ b/cpp/src/utilities/pinned_memory.cpp @@ -0,0 +1,216 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +namespace cudf { + +namespace { +class fixed_pinned_pool_memory_resource { + using upstream_mr = rmm::mr::pinned_host_memory_resource; + using host_pooled_mr = rmm::mr::pool_memory_resource; + + private: + upstream_mr upstream_mr_{}; + size_t pool_size_{0}; + // Raw pointer to avoid a segfault when the pool is destroyed on exit + host_pooled_mr* pool_{nullptr}; + void* pool_begin_{nullptr}; + void* pool_end_{nullptr}; + cuda::stream_ref stream_{cudf::detail::global_cuda_stream_pool().get_stream().value()}; + + public: + fixed_pinned_pool_memory_resource(size_t size) + : pool_size_{size}, pool_{new host_pooled_mr(upstream_mr_, size, size)} + { + if (pool_size_ == 0) { return; } + + // Allocate full size from the pinned pool to figure out the beginning and end address + pool_begin_ = pool_->allocate_async(pool_size_, stream_); + pool_end_ = static_cast(static_cast(pool_begin_) + pool_size_); + pool_->deallocate_async(pool_begin_, pool_size_, stream_); + } + + void* allocate_async(std::size_t bytes, std::size_t alignment, cuda::stream_ref stream) + { + if (bytes <= pool_size_) { + try { + return pool_->allocate_async(bytes, alignment, stream); + } catch (...) { + // If the pool is exhausted, fall back to the upstream memory resource + } + } + + return upstream_mr_.allocate_async(bytes, alignment, stream); + } + + void* allocate_async(std::size_t bytes, cuda::stream_ref stream) + { + return allocate_async(bytes, rmm::RMM_DEFAULT_HOST_ALIGNMENT, stream); + } + + void* allocate(std::size_t bytes, std::size_t alignment = rmm::RMM_DEFAULT_HOST_ALIGNMENT) + { + auto const result = allocate_async(bytes, alignment, stream_); + stream_.wait(); + return result; + } + + void deallocate_async(void* ptr, + std::size_t bytes, + std::size_t alignment, + cuda::stream_ref stream) noexcept + { + if (bytes <= pool_size_ && ptr >= pool_begin_ && ptr < pool_end_) { + pool_->deallocate_async(ptr, bytes, alignment, stream); + } else { + upstream_mr_.deallocate_async(ptr, bytes, alignment, stream); + } + } + + void deallocate_async(void* ptr, std::size_t bytes, cuda::stream_ref stream) noexcept + { + return deallocate_async(ptr, bytes, rmm::RMM_DEFAULT_HOST_ALIGNMENT, stream); + } + + void deallocate(void* ptr, + std::size_t bytes, + std::size_t alignment = rmm::RMM_DEFAULT_HOST_ALIGNMENT) noexcept + { + deallocate_async(ptr, bytes, alignment, stream_); + stream_.wait(); + } + + bool operator==(fixed_pinned_pool_memory_resource const& other) const + { + return pool_ == other.pool_ and stream_ == other.stream_; + } + + bool operator!=(fixed_pinned_pool_memory_resource const& other) const + { + return !operator==(other); + } + + friend void get_property(fixed_pinned_pool_memory_resource const&, + cuda::mr::device_accessible) noexcept + { + } + + friend void get_property(fixed_pinned_pool_memory_resource const&, + cuda::mr::host_accessible) noexcept + { + } +}; + +static_assert(cuda::mr::resource_with, + "Pinned pool mr must be accessible from both host and device"); + +CUDF_EXPORT rmm::host_device_async_resource_ref& make_default_pinned_mr( + std::optional config_size) +{ + static fixed_pinned_pool_memory_resource mr = [config_size]() { + auto const size = [&config_size]() -> size_t { + if (auto const env_val = getenv("LIBCUDF_PINNED_POOL_SIZE"); env_val != nullptr) { + return std::atol(env_val); + } + + if 
(config_size.has_value()) { return *config_size; } + + auto const total = rmm::available_device_memory().second; + // 0.5% of the total device memory, capped at 100MB + return std::min(total / 200, size_t{100} * 1024 * 1024); + }(); + + // rmm requires the pool size to be a multiple of 256 bytes + auto const aligned_size = rmm::align_up(size, rmm::RMM_DEFAULT_HOST_ALIGNMENT); + CUDF_LOG_INFO("Pinned pool size = {}", aligned_size); + + // make the pool with max size equal to the initial size + return fixed_pinned_pool_memory_resource{aligned_size}; + }(); + + static rmm::host_device_async_resource_ref mr_ref{mr}; + return mr_ref; +} + +CUDF_EXPORT std::mutex& host_mr_mutex() +{ + static std::mutex map_lock; + return map_lock; +} + +// Must be called with the host_mr_mutex mutex held +CUDF_EXPORT rmm::host_device_async_resource_ref& make_host_mr( + std::optional const& opts, bool* did_configure = nullptr) +{ + static rmm::host_device_async_resource_ref* mr_ref = nullptr; + bool configured = false; + if (mr_ref == nullptr) { + configured = true; + mr_ref = &make_default_pinned_mr(opts ? opts->pool_size : std::nullopt); + } + + // If the user passed an out param to detect whether this call configured a resource + // set the result + if (did_configure != nullptr) { *did_configure = configured; } + + return *mr_ref; +} + +// Must be called with the host_mr_mutex mutex held +CUDF_EXPORT rmm::host_device_async_resource_ref& host_mr() +{ + static rmm::host_device_async_resource_ref mr_ref = make_host_mr(std::nullopt); + return mr_ref; +} + +} // namespace + +rmm::host_device_async_resource_ref set_pinned_memory_resource( + rmm::host_device_async_resource_ref mr) +{ + std::scoped_lock lock{host_mr_mutex()}; + auto last_mr = host_mr(); + host_mr() = mr; + return last_mr; +} + +rmm::host_device_async_resource_ref get_pinned_memory_resource() +{ + std::scoped_lock lock{host_mr_mutex()}; + return host_mr(); +} + +bool config_default_pinned_memory_resource(pinned_mr_options const& opts) +{ + std::scoped_lock lock{host_mr_mutex()}; + auto did_configure = false; + make_host_mr(opts, &did_configure); + return did_configure; +} + +} // namespace cudf diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 826f879ddc0..f6d762cc2ec 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -380,15 +380,16 @@ ConfigureTest( # * utilities tests ------------------------------------------------------------------------------- ConfigureTest( UTILITIES_TEST - utilities_tests/type_list_tests.cpp utilities_tests/column_debug_tests.cpp utilities_tests/column_utilities_tests.cpp utilities_tests/column_wrapper_tests.cpp + utilities_tests/default_stream_tests.cpp utilities_tests/io_utilities_tests.cpp utilities_tests/lists_column_wrapper_tests.cpp utilities_tests/logger_tests.cpp - utilities_tests/default_stream_tests.cpp + utilities_tests/pinned_memory_tests.cpp utilities_tests/type_check_tests.cpp + utilities_tests/type_list_tests.cpp ) # ################################################################################################## diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index 57aa2721756..4c01a1fb87b 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ -28,13 +28,13 @@ #include #include #include -#include #include #include #include #include #include #include +#include #include @@ -2068,7 +2068,7 @@ TEST_F(JsonReaderTest, JSONLinesRecoveringSync) size_t{128} * 1024 * 1024}; // Set new resource - auto last_mr = 
cudf::io::set_host_memory_resource(mr); + auto last_mr = cudf::set_pinned_memory_resource(mr); /** * @brief Spark has the specific need to ignore extra characters that come after the first record @@ -2158,7 +2158,7 @@ TEST_F(JsonReaderTest, JSONLinesRecoveringSync) float64_wrapper{c_data.cbegin(), c_data.cend(), c_validity.cbegin()}); // Restore original memory source - cudf::io::set_host_memory_resource(last_mr); + cudf::set_pinned_memory_resource(last_mr); } TEST_F(JsonReaderTest, MixedTypes) diff --git a/cpp/tests/utilities_tests/io_utilities_tests.cpp b/cpp/tests/utilities_tests/io_utilities_tests.cpp index e5a153bf781..9ed8f18f5cc 100644 --- a/cpp/tests/utilities_tests/io_utilities_tests.cpp +++ b/cpp/tests/utilities_tests/io_utilities_tests.cpp @@ -16,14 +16,6 @@ #include #include -#include - -#include -#include - -#include -#include -#include #include @@ -32,43 +24,6 @@ using cudf::io::detail::base64_encode; class IoUtilitiesTest : public cudf::test::BaseFixture {}; -TEST(IoUtilitiesTest, HostMemoryGetAndSet) -{ - // Global environment for temporary files - auto const temp_env = static_cast( - ::testing::AddGlobalTestEnvironment(new cudf::test::TempDirTestEnvironment)); - - // pinned/pooled host memory resource - using host_pooled_mr = rmm::mr::pool_memory_resource; - host_pooled_mr mr(std::make_shared().get(), - size_t{128} * 1024 * 1024); - - // set new resource - auto last_mr = cudf::io::get_host_memory_resource(); - cudf::io::set_host_memory_resource(mr); - - constexpr int num_rows = 32 * 1024; - auto valids = - cudf::detail::make_counting_transform_iterator(0, [&](int index) { return index % 2; }); - auto values = thrust::make_counting_iterator(0); - - cudf::test::fixed_width_column_wrapper col(values, values + num_rows, valids); - - cudf::table_view expected({col}); - auto filepath = temp_env->get_temp_filepath("IoUtilsMemTest.parquet"); - cudf::io::parquet_writer_options out_args = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected); - cudf::io::write_parquet(out_args); - - cudf::io::parquet_reader_options const read_opts = - cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); - auto const result = cudf::io::read_parquet(read_opts); - CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, expected); - - // reset memory resource back - cudf::io::set_host_memory_resource(last_mr); -} - TEST(IoUtilitiesTest, Base64EncodeAndDecode) { // a vector of lorem ipsum strings diff --git a/cpp/tests/utilities_tests/pinned_memory_tests.cpp b/cpp/tests/utilities_tests/pinned_memory_tests.cpp new file mode 100644 index 00000000000..df9103640f4 --- /dev/null +++ b/cpp/tests/utilities_tests/pinned_memory_tests.cpp @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include + +#include +#include + +#include +#include +#include + +class PinnedMemoryTest : public cudf::test::BaseFixture {}; + +TEST(PinnedMemoryTest, MemoryResourceGetAndSet) +{ + // Global environment for temporary files + auto const temp_env = static_cast( + ::testing::AddGlobalTestEnvironment(new cudf::test::TempDirTestEnvironment)); + + // pinned/pooled host memory resource + using host_pooled_mr = rmm::mr::pool_memory_resource; + host_pooled_mr mr(std::make_shared().get(), + 4 * 1024 * 1024); + + // set new resource + auto last_mr = cudf::get_pinned_memory_resource(); + cudf::set_pinned_memory_resource(mr); + + constexpr int num_rows = 32 * 1024; + auto valids = + cudf::detail::make_counting_transform_iterator(0, [&](int index) { return index % 2; }); + auto values = thrust::make_counting_iterator(0); + + cudf::test::fixed_width_column_wrapper col(values, values + num_rows, valids); + + cudf::table_view expected({col}); + auto filepath = temp_env->get_temp_filepath("MemoryResourceGetAndSetTest.parquet"); + cudf::io::parquet_writer_options out_args = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected); + cudf::io::write_parquet(out_args); + + cudf::io::parquet_reader_options const read_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); + auto const result = cudf::io::read_parquet(read_opts); + CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, expected); + + // reset memory resource back + cudf::set_pinned_memory_resource(last_mr); +} diff --git a/java/src/main/java/ai/rapids/cudf/PinnedMemoryPool.java b/java/src/main/java/ai/rapids/cudf/PinnedMemoryPool.java index 83b801db7fb..df0d9dc7c3e 100644 --- a/java/src/main/java/ai/rapids/cudf/PinnedMemoryPool.java +++ b/java/src/main/java/ai/rapids/cudf/PinnedMemoryPool.java @@ -128,9 +128,9 @@ public static synchronized void initialize(long poolSize, int gpuId) { * * @param poolSize size of the pool to initialize. 
* @param gpuId gpu id to set to get memory pool from, -1 means to use default - * @param setCuioHostMemoryResource true if this pinned pool should be used by cuIO for host memory + * @param setCudfPinnedPoolMemoryResource true if this pinned pool should be used by cuDF for pinned memory */ - public static synchronized void initialize(long poolSize, int gpuId, boolean setCuioHostMemoryResource) { + public static synchronized void initialize(long poolSize, int gpuId, boolean setCudfPinnedPoolMemoryResource) { if (isInitialized()) { throw new IllegalStateException("Can only initialize the pool once."); } @@ -139,7 +139,7 @@ public static synchronized void initialize(long poolSize, int gpuId, boolean set t.setDaemon(true); return t; }); - initFuture = initService.submit(() -> new PinnedMemoryPool(poolSize, gpuId, setCuioHostMemoryResource)); + initFuture = initService.submit(() -> new PinnedMemoryPool(poolSize, gpuId, setCudfPinnedPoolMemoryResource)); initService.shutdown(); } @@ -216,15 +216,15 @@ public static long getTotalPoolSizeBytes() { return 0; } - private PinnedMemoryPool(long poolSize, int gpuId, boolean setCuioHostMemoryResource) { + private PinnedMemoryPool(long poolSize, int gpuId, boolean setCudfPinnedPoolMemoryResource) { if (gpuId > -1) { // set the gpu device to use Cuda.setDevice(gpuId); Cuda.freeZero(); } this.poolHandle = Rmm.newPinnedPoolMemoryResource(poolSize, poolSize); - if (setCuioHostMemoryResource) { - Rmm.setCuioPinnedPoolMemoryResource(this.poolHandle); + if (setCudfPinnedPoolMemoryResource) { + Rmm.setCudfPinnedPoolMemoryResource(this.poolHandle); } this.poolSize = poolSize; } diff --git a/java/src/main/java/ai/rapids/cudf/Rmm.java b/java/src/main/java/ai/rapids/cudf/Rmm.java index 4dee1b7aa24..ed029c918e4 100755 --- a/java/src/main/java/ai/rapids/cudf/Rmm.java +++ b/java/src/main/java/ai/rapids/cudf/Rmm.java @@ -597,7 +597,7 @@ static native long newEventHandlerResourceAdaptor(long handle, long trackerHandl public static native long newPinnedPoolMemoryResource(long initSize, long maxSize); - public static native long setCuioPinnedPoolMemoryResource(long poolPtr); + public static native long setCudfPinnedPoolMemoryResource(long poolPtr); public static native void releasePinnedPoolMemoryResource(long poolPtr); diff --git a/java/src/main/native/src/RmmJni.cpp b/java/src/main/native/src/RmmJni.cpp index fa78f6ca4e2..8bd0f7793b4 100644 --- a/java/src/main/native/src/RmmJni.cpp +++ b/java/src/main/native/src/RmmJni.cpp @@ -16,7 +16,7 @@ #include "cudf_jni_apis.hpp" -#include +#include #include #include @@ -395,15 +395,17 @@ class java_debug_event_handler_memory_resource final : public java_event_handler } }; -inline auto& prior_cuio_host_mr() +inline auto& prior_cudf_pinned_mr() { - static rmm::host_async_resource_ref _prior_cuio_host_mr = cudf::io::get_host_memory_resource(); - return _prior_cuio_host_mr; + static rmm::host_device_async_resource_ref _prior_cudf_pinned_mr = + cudf::get_pinned_memory_resource(); + return _prior_cudf_pinned_mr; } /** * This is a pinned fallback memory resource that will try to allocate `pool` - * and if that fails, attempt to allocate from the prior resource used by cuIO `prior_cuio_host_mr`. + * and if that fails, attempt to allocate from the prior resource used by cuDF + * `prior_cudf_pinned_mr`. 
 *
 * We detect whether a pointer to free is inside of the pool by checking its address (see
 * constructor)
@@ -433,7 +435,7 @@ class pinned_fallback_host_memory_resource {
   /**
    * @brief Allocates pinned host memory of size at least \p bytes bytes from either the
-   * _pool argument provided, or prior_cuio_host_mr.
+   * _pool argument provided, or prior_cudf_pinned_mr.
    *
    * @throws rmm::bad_alloc if the requested allocation could not be fulfilled due to any other
    * reason.
@@ -450,7 +452,7 @@ class pinned_fallback_host_memory_resource {
       return _pool->allocate(bytes, alignment);
     } catch (const std::exception& unused) {
       // try to allocate using the underlying pinned resource
-      return prior_cuio_host_mr().allocate(bytes, alignment);
+      return prior_cudf_pinned_mr().allocate(bytes, alignment);
     }
     // we should not reach here
     return nullptr;
   }

   /**
    * @brief Deallocate memory pointed to by \p ptr of size \p bytes bytes. We attempt
    * to deallocate from _pool, if ptr is detected to be in the pool address range,
-   * otherwise we deallocate from `prior_cuio_host_mr`.
+   * otherwise we deallocate from `prior_cudf_pinned_mr`.
    *
    * @param ptr Pointer to be deallocated.
    * @param bytes Size of the allocation.
@@ -472,7 +474,7 @@
     if (ptr >= pool_begin_ && ptr <= pool_end_) {
       _pool->deallocate(ptr, bytes, alignment);
     } else {
-      prior_cuio_host_mr().deallocate(ptr, bytes, alignment);
+      prior_cudf_pinned_mr().deallocate(ptr, bytes, alignment);
     }
   }

@@ -1025,7 +1027,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_newPinnedPoolMemoryResource(JNIE
   CATCH_STD(env, 0)
 }

-JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_setCuioPinnedPoolMemoryResource(JNIEnv* env,
+JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_setCudfPinnedPoolMemoryResource(JNIEnv* env,
                                                                                jclass clazz,
                                                                                jlong pool_ptr)
 {
@@ -1035,7 +1037,7 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_setCuioPinnedPoolMemoryResource(J
   // create a pinned fallback pool that will allocate pinned memory
   // if the regular pinned pool is exhausted
   pinned_fallback_mr.reset(new pinned_fallback_host_memory_resource(pool));
-  prior_cuio_host_mr() = cudf::io::set_host_memory_resource(*pinned_fallback_mr);
+  prior_cudf_pinned_mr() = cudf::set_pinned_memory_resource(*pinned_fallback_mr);
 }
 CATCH_STD(env, )
 }
@@ -1047,8 +1049,8 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_releasePinnedPoolMemoryResource(J
   try {
     cudf::jni::auto_set_device(env);
     // set the cuio host memory resource to what it was before, or the same
-    // if we didn't overwrite it with setCuioPinnedPoolMemoryResource
-    cudf::io::set_host_memory_resource(prior_cuio_host_mr());
+    // if we didn't overwrite it with setCudfPinnedPoolMemoryResource
+    cudf::set_pinned_memory_resource(prior_cudf_pinned_mr());
     pinned_fallback_mr.reset();
     delete reinterpret_cast(pool_ptr);
   }
@@ -1088,7 +1090,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_allocFromFallbackPinnedPool(JNIE
                                                                             jlong size)
 {
   cudf::jni::auto_set_device(env);
-  void* ret = cudf::io::get_host_memory_resource().allocate(size);
+  void* ret = cudf::get_pinned_memory_resource().allocate(size);
   return reinterpret_cast(ret);
 }
@@ -1101,7 +1103,7 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_freeFromFallbackPinnedPool(JNIEnv
   try {
     cudf::jni::auto_set_device(env);
     void* cptr = reinterpret_cast(ptr);
-    cudf::io::get_host_memory_resource().deallocate(cptr, size);
+    cudf::get_pinned_memory_resource().deallocate(cptr, size);
   }
   CATCH_STD(env, )
 }
@@ -1112,7 +1114,7 @@ JNIEXPORT jboolean JNICALL Java_ai_rapids_cudf_Rmm_configureDefaultCudfPinnedPoo
 {
   try {
     cudf::jni::auto_set_device(env);
-    return cudf::io::config_default_host_memory_resource(cudf::io::host_mr_options{size});
+    return cudf::config_default_pinned_memory_resource(cudf::pinned_mr_options{size});
   }
   CATCH_STD(env, false)
 }
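The hunks above complete the move of the pinned host memory resource from cudf::io into the
top-level cudf namespace. Below is a minimal sketch of how a client installs a pooled pinned
resource through the renamed API, mirroring the fixture in pinned_memory_tests.cpp; the header
paths are assumptions on my part, not part of the patch.

```cpp
#include <cudf/utilities/pinned_memory.hpp>  // assumed home of the renamed API

#include <rmm/mr/device/pool_memory_resource.hpp>
#include <rmm/mr/pinned_host_memory_resource.hpp>

void install_pooled_pinned_resource()
{
  // A pool of pinned host memory sized up front, so steady-state I/O avoids
  // repeated cudaHostAlloc calls; same shape as the test fixture above.
  using host_pooled_mr = rmm::mr::pool_memory_resource<rmm::mr::pinned_host_memory_resource>;
  static rmm::mr::pinned_host_memory_resource upstream{};
  static host_pooled_mr pool{&upstream, 128ul * 1024 * 1024};

  // Install the pool; the previous resource is returned so callers can
  // restore it later, as JsonReaderTest.JSONLinesRecoveringSync does.
  auto const last_mr = cudf::set_pinned_memory_resource(pool);
  (void)last_mr;
}
```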
From 2b1029908af97b74304169631189dd57f382f072 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani
Date: Wed, 12 Jun 2024 01:14:31 -0700
Subject: [PATCH 086/340] Apply clang-tidy autofixes (#15894)

This changeset is large, but it's not very substantial. It's all the
automated fixes produced by clang-tidy using our script. The bulk of the
changes are either adding `[[nodiscard]]` to many functions or changing
const ref args to pass by value and then move in cases where the
parameter is only used to set a value. There are also some places where
clang-tidy preferred either more or less namespacing of objects
depending on the current namespace. Short sketches of the two main
rewrites are interleaved with the header diffs below.

The goal is to enable clang-tidy in CI, which we made progress towards
in #9860 but stalled in #10064. This PR contains the first set of
changes that will be required for such a check to pass.

I've marked this PR as breaking because some of the functions now marked
as `[[nodiscard]]` are public APIs, so if consumers were ignoring the
return values they will now see warnings, and if they are compiling with
warnings as errors then the builds will break.

Contributes to #584

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Nghia Truong (https://github.com/ttnghia)

URL: https://github.com/rapidsai/cudf/pull/15894
---
 .pre-commit-config.yaml                        |   8 +
 cpp/include/cudf/ast/expressions.hpp           |   7 +-
 .../cudf/column/column_device_view.cuh         |  10 +-
 .../cudf/detail/aggregation/aggregation.hpp    |  27 +-
 cpp/include/cudf/detail/contiguous_split.hpp   |   2 +-
 .../cudf/detail/normalizing_iterator.cuh       |   8 +-
 cpp/include/cudf/detail/structs/utilities.hpp  |  24 +-
 .../cudf/detail/utilities/host_vector.hpp      |   4 +-
 .../cudf/detail/utilities/stream_pool.hpp      |   2 +-
 cpp/include/cudf/fixed_point/fixed_point.hpp   |   6 +-
 cpp/include/cudf/interop.hpp                   |   4 +-
 cpp/include/cudf/interop/detail/arrow.hpp      |   7 +-
 cpp/include/cudf/io/arrow_io_source.hpp        |   8 +-
 cpp/include/cudf/io/csv.hpp                    |  22 +-
 cpp/include/cudf/io/detail/parquet.hpp         |   2 +-
 cpp/include/cudf/io/json.hpp                   |  42 +-
 cpp/include/cudf/io/orc.hpp                    |  26 +-
 cpp/include/cudf/io/parquet.hpp                |   6 +-
 cpp/include/cudf/io/types.hpp                  |   5 +-
 cpp/include/cudf/join.hpp                      |  33 +-
 cpp/include/cudf/scalar/scalar.hpp             |  19 +-
 .../cudf/strings/regex/regex_program.hpp       |  14 +-
 cpp/include/cudf/strings/string_view.cuh       |   8 +-
 cpp/include/cudf/table/table.hpp               |   2 +-
 cpp/include/cudf/table/table_view.hpp          |   4 +-
 cpp/include/cudf/utilities/error.hpp           |   8 +-
 cpp/include/cudf/utilities/span.hpp            |  24 +-
 cpp/include/cudf/utilities/thread_pool.hpp     |   6 +-
 cpp/include/cudf/wrappers/dictionary.hpp       |   2 +-
 cpp/include/cudf/wrappers/durations.hpp        |  16 +-
 cpp/include/cudf/wrappers/timestamps.hpp       |  16 +-
 cpp/include/cudf_test/base_fixture.hpp         |   2 +-
 cpp/include/cudf_test/column_wrapper.hpp       |  15 +-
 .../stream_checking_resource_adaptor.hpp       |   2 +-
 cpp/src/binaryop/binaryop.cpp                  |   2 +-
 cpp/src/binaryop/compiled/operation.cuh        |   8 +-
 cpp/src/binaryop/compiled/util.cpp             |   4 +-
 cpp/src/copying/pack.cpp                       |   2 +-
 cpp/src/datetime/timezone.cpp                  |   2 +-
 cpp/src/interop/arrow_utilities.cpp            |   2 +-
 cpp/src/interop/arrow_utilities.hpp            |   2 +-
 cpp/src/interop/detail/arrow_allocator.cpp     |   2 +-
 cpp/src/interop/from_arrow_host.cu             |   4 +-
cpp/src/io/avro/avro.cpp | 6 +- cpp/src/io/comp/uncomp.cpp | 8 +- cpp/src/io/functions.cpp | 8 +- cpp/src/io/json/nested_json_gpu.cu | 8 +- cpp/src/io/json/read_json.cu | 2 +- cpp/src/io/orc/orc.hpp | 2 +- cpp/src/io/orc/orc_field_writer.hpp | 6 +- cpp/src/io/orc/reader_impl_chunking.cu | 2 +- cpp/src/io/orc/reader_impl_decode.cu | 2 +- .../io/parquet/compact_protocol_reader.cpp | 2 +- .../io/parquet/compact_protocol_writer.hpp | 4 +- cpp/src/io/parquet/ipc/Schema_generated.h | 416 +++++++++--------- cpp/src/io/parquet/page_string_decode.cu | 10 +- cpp/src/io/parquet/page_string_utils.cuh | 4 +- cpp/src/io/parquet/parquet.hpp | 30 +- cpp/src/io/parquet/parquet_gpu.hpp | 33 +- cpp/src/io/parquet/predicate_pushdown.cpp | 4 +- cpp/src/io/parquet/reader_impl_chunking.cu | 2 +- cpp/src/io/parquet/reader_impl_helpers.cpp | 26 +- cpp/src/io/parquet/reader_impl_preprocess.cu | 8 +- cpp/src/io/statistics/byte_array_view.cuh | 6 +- cpp/src/io/utilities/arrow_io_source.cpp | 6 +- cpp/src/io/utilities/column_buffer.cpp | 20 +- cpp/src/io/utilities/column_buffer.hpp | 21 +- cpp/src/io/utilities/data_casting.cu | 4 +- cpp/src/io/utilities/data_sink.cpp | 8 +- cpp/src/io/utilities/datasource.cpp | 2 +- cpp/src/io/utilities/file_io_utilities.cpp | 8 +- cpp/src/io/utilities/hostdevice_span.hpp | 2 +- cpp/src/io/utilities/hostdevice_vector.hpp | 2 +- cpp/src/io/utilities/output_builder.cuh | 4 +- cpp/src/io/utilities/string_parsing.hpp | 6 +- cpp/src/io/utilities/type_inference.cu | 2 +- cpp/src/jit/cache.cpp | 4 +- cpp/src/jit/parser.cpp | 17 +- cpp/src/jit/parser.hpp | 8 +- cpp/src/reductions/reductions.cpp | 6 +- .../detail/optimized_unbounded_window.cpp | 2 +- cpp/src/strings/regex/regcomp.cpp | 26 +- cpp/src/strings/regex/regex.cuh | 45 +- cpp/src/strings/regex/regex.inl | 7 +- cpp/src/strings/regex/regexec.cpp | 14 +- cpp/src/transform/transform.cpp | 2 +- cpp/src/utilities/stream_pool.cpp | 4 +- .../binop-compiled-fixed_point-test.cpp | 8 +- cpp/tests/bitmask/is_element_valid_tests.cpp | 8 +- cpp/tests/column/column_view_shallow_test.cpp | 3 +- cpp/tests/copying/concatenate_tests.cpp | 79 ++-- cpp/tests/copying/copy_tests.cpp | 5 +- cpp/tests/copying/gather_str_tests.cpp | 27 +- cpp/tests/copying/gather_struct_tests.cpp | 4 +- cpp/tests/copying/get_value_tests.cpp | 12 +- cpp/tests/copying/pack_tests.cpp | 86 ++-- cpp/tests/copying/scatter_list_tests.cpp | 11 +- cpp/tests/copying/scatter_struct_tests.cpp | 9 +- cpp/tests/copying/scatter_tests.cpp | 47 +- cpp/tests/copying/shift_tests.cpp | 57 +-- cpp/tests/copying/slice_tests.cpp | 69 ++- cpp/tests/copying/split_tests.cpp | 123 ++++-- cpp/tests/dictionary/decode_test.cpp | 5 +- cpp/tests/dictionary/encode_test.cpp | 5 +- cpp/tests/dictionary/factories_test.cpp | 6 +- cpp/tests/dictionary/fill_test.cpp | 10 +- cpp/tests/dictionary/gather_test.cpp | 5 +- cpp/tests/dictionary/remove_keys_test.cpp | 14 +- cpp/tests/dictionary/scatter_test.cpp | 19 +- cpp/tests/dictionary/search_test.cpp | 6 +- cpp/tests/dictionary/set_keys_test.cpp | 12 +- cpp/tests/dictionary/slice_test.cpp | 15 +- cpp/tests/groupby/argmax_tests.cpp | 5 +- cpp/tests/groupby/argmin_tests.cpp | 7 +- cpp/tests/groupby/collect_set_tests.cpp | 4 +- cpp/tests/groupby/correlation_tests.cpp | 8 +- cpp/tests/groupby/count_scan_tests.cpp | 4 +- cpp/tests/groupby/count_tests.cpp | 7 +- cpp/tests/groupby/covariance_tests.cpp | 8 +- cpp/tests/groupby/groupby_test_util.cpp | 4 +- cpp/tests/groupby/groups_tests.cpp | 5 +- cpp/tests/groupby/keys_tests.cpp | 8 +- cpp/tests/groupby/m2_tests.cpp | 4 +- 
cpp/tests/groupby/max_scan_tests.cpp | 4 +- cpp/tests/groupby/max_tests.cpp | 25 +- cpp/tests/groupby/mean_tests.cpp | 7 +- cpp/tests/groupby/median_tests.cpp | 7 +- cpp/tests/groupby/merge_lists_tests.cpp | 4 +- cpp/tests/groupby/merge_m2_tests.cpp | 6 +- cpp/tests/groupby/merge_sets_tests.cpp | 4 +- cpp/tests/groupby/min_scan_tests.cpp | 4 +- cpp/tests/groupby/min_tests.cpp | 25 +- cpp/tests/groupby/nth_element_tests.cpp | 40 +- cpp/tests/groupby/nunique_tests.cpp | 19 +- cpp/tests/groupby/product_scan_tests.cpp | 2 +- cpp/tests/groupby/product_tests.cpp | 4 +- cpp/tests/groupby/quantile_tests.cpp | 7 +- cpp/tests/groupby/rank_scan_tests.cpp | 12 +- cpp/tests/groupby/replace_nulls_tests.cpp | 10 +- cpp/tests/groupby/shift_tests.cpp | 23 +- cpp/tests/groupby/std_tests.cpp | 12 +- cpp/tests/groupby/sum_of_squares_tests.cpp | 7 +- cpp/tests/groupby/sum_scan_tests.cpp | 4 +- cpp/tests/groupby/sum_tests.cpp | 5 +- cpp/tests/groupby/var_tests.cpp | 12 +- cpp/tests/hashing/md5_test.cpp | 32 +- cpp/tests/hashing/murmurhash3_x86_32_test.cpp | 106 ++++- cpp/tests/hashing/sha1_test.cpp | 8 +- cpp/tests/hashing/sha224_test.cpp | 8 +- cpp/tests/hashing/sha256_test.cpp | 8 +- cpp/tests/hashing/sha384_test.cpp | 8 +- cpp/tests/hashing/sha512_test.cpp | 8 +- cpp/tests/interop/dlpack_test.cpp | 2 +- cpp/tests/interop/from_arrow_device_test.cpp | 14 +- cpp/tests/interop/from_arrow_host_test.cpp | 6 +- cpp/tests/interop/from_arrow_test.cpp | 43 +- cpp/tests/interop/nanoarrow_utils.hpp | 14 +- cpp/tests/interop/to_arrow_device_test.cpp | 26 +- cpp/tests/io/csv_test.cpp | 4 +- cpp/tests/io/json_chunked_reader.cpp | 4 +- .../io/json_quote_normalization_test.cpp | 2 +- cpp/tests/io/json_test.cpp | 4 +- cpp/tests/io/json_tree.cpp | 8 +- cpp/tests/io/orc_chunked_reader_test.cu | 4 +- cpp/tests/io/orc_test.cpp | 8 +- cpp/tests/io/parquet_chunked_writer_test.cpp | 36 +- cpp/tests/io/parquet_reader_test.cpp | 54 ++- cpp/tests/io/parquet_v2_test.cpp | 79 ++-- cpp/tests/io/parquet_writer_test.cpp | 20 +- cpp/tests/join/distinct_join_tests.cpp | 76 ++-- cpp/tests/join/join_tests.cpp | 342 +++++++------- cpp/tests/join/semi_anti_join_tests.cpp | 43 +- cpp/tests/json/json_tests.cpp | 6 +- .../large_strings/large_strings_fixture.cpp | 9 +- cpp/tests/lists/contains_tests.cpp | 2 +- cpp/tests/lists/count_elements_tests.cpp | 10 +- cpp/tests/lists/explode_tests.cpp | 68 +-- cpp/tests/lists/sort_lists_tests.cpp | 8 +- cpp/tests/merge/merge_dictionary_test.cpp | 18 +- cpp/tests/merge/merge_string_test.cpp | 63 ++- .../partitioning/hash_partition_test.cpp | 2 +- cpp/tests/partitioning/round_robin_test.cpp | 73 +-- .../quantiles/percentile_approx_test.cpp | 11 +- cpp/tests/quantiles/quantile_test.cpp | 2 +- cpp/tests/quantiles/quantiles_test.cpp | 12 +- cpp/tests/reductions/collect_ops_tests.cpp | 47 +- cpp/tests/reductions/list_rank_test.cpp | 85 +++- cpp/tests/reductions/reduction_tests.cpp | 131 +++--- cpp/tests/reductions/scan_tests.cpp | 15 +- .../reductions/segmented_reduction_tests.cpp | 69 +-- cpp/tests/reshape/byte_cast_tests.cpp | 16 +- cpp/tests/rolling/collect_ops_test.cpp | 30 +- cpp/tests/rolling/grouped_rolling_test.cpp | 110 +++-- .../rolling/range_rolling_window_test.cpp | 24 +- cpp/tests/round/round_tests.cpp | 5 +- cpp/tests/scalar/scalar_test.cpp | 4 +- cpp/tests/search/search_dictionary_test.cpp | 30 +- cpp/tests/sort/is_sorted_tests.cpp | 8 +- cpp/tests/sort/rank_test.cpp | 91 ++-- cpp/tests/sort/stable_sort_tests.cpp | 8 +- .../distinct_count_tests.cpp | 37 +- .../stream_compaction/distinct_tests.cpp | 4 
+- .../stream_compaction/drop_nans_tests.cpp | 38 +- .../stream_compaction/drop_nulls_tests.cpp | 67 +-- .../stable_distinct_tests.cpp | 4 +- cpp/tests/stream_compaction/unique_tests.cpp | 72 +-- cpp/tests/streams/interop_test.cpp | 1 + cpp/tests/streams/io/orc_test.cpp | 4 +- cpp/tests/streams/io/parquet_test.cpp | 4 +- cpp/tests/streams/lists_test.cpp | 5 +- cpp/tests/streams/reduction_test.cpp | 16 +- cpp/tests/streams/replace_test.cpp | 9 +- cpp/tests/streams/strings/filter_test.cpp | 4 +- cpp/tests/strings/case_tests.cpp | 50 ++- cpp/tests/strings/chars_types_tests.cpp | 51 ++- .../strings/combine/concatenate_tests.cpp | 11 +- .../strings/combine/join_strings_tests.cpp | 6 +- cpp/tests/strings/contains_tests.cpp | 16 +- cpp/tests/strings/datetime_tests.cpp | 6 +- cpp/tests/strings/extract_tests.cpp | 23 +- cpp/tests/strings/fill_tests.cpp | 6 +- cpp/tests/strings/find_multiple_tests.cpp | 2 +- cpp/tests/strings/find_tests.cpp | 102 +++-- cpp/tests/strings/findall_tests.cpp | 6 +- cpp/tests/strings/fixed_point_tests.cpp | 6 +- cpp/tests/strings/integers_tests.cpp | 24 +- cpp/tests/strings/ipv4_tests.cpp | 7 +- cpp/tests/strings/like_tests.cpp | 7 +- cpp/tests/strings/pad_tests.cpp | 5 +- cpp/tests/strings/replace_regex_tests.cpp | 6 +- cpp/tests/strings/replace_tests.cpp | 12 +- cpp/tests/strings/reverse_tests.cpp | 18 +- cpp/tests/strings/slice_tests.cpp | 8 +- cpp/tests/strings/split_tests.cpp | 42 +- cpp/tests/strings/strip_tests.cpp | 5 +- cpp/tests/strings/translate_tests.cpp | 4 +- cpp/tests/structs/structs_column_tests.cpp | 2 +- cpp/tests/structs/utilities_tests.cpp | 4 +- cpp/tests/table/row_operators_tests.cpp | 8 +- cpp/tests/text/bpe_tests.cpp | 2 +- cpp/tests/text/jaccard_tests.cpp | 15 +- cpp/tests/text/normalize_tests.cpp | 6 +- cpp/tests/text/replace_tests.cpp | 2 +- cpp/tests/text/stemmer_tests.cpp | 2 +- cpp/tests/text/subword_tests.cpp | 2 +- cpp/tests/text/tokenize_tests.cpp | 6 +- cpp/tests/transform/nans_to_null_test.cpp | 4 +- cpp/tests/transform/one_hot_encode_tests.cpp | 9 +- cpp/tests/unary/cast_tests.cpp | 15 +- cpp/tests/unary/math_ops_test.cpp | 3 +- cpp/tests/utilities/column_utilities.cu | 2 +- cpp/tests/utilities/identify_stream_usage.cpp | 2 +- cpp/tests/utilities_tests/logger_tests.cpp | 4 +- cpp/tests/utilities_tests/type_list_tests.cpp | 54 +-- java/src/main/native/include/jni_utils.hpp | 26 +- java/src/main/native/src/ColumnVectorJni.cpp | 14 +- java/src/main/native/src/ColumnViewJni.cpp | 44 +- java/src/main/native/src/RmmJni.cpp | 8 +- java/src/main/native/src/ScalarJni.cpp | 4 +- java/src/main/native/src/TableJni.cpp | 28 +- .../main/native/src/jni_writer_data_sink.hpp | 4 +- 261 files changed, 2911 insertions(+), 2151 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 4cdcac88091..cc08b832e69 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -56,12 +56,20 @@ repos: - id: clang-format types_or: [c, c++, cuda] args: ["-fallback-style=none", "-style=file", "-i"] + exclude: | + (?x)^( + ^cpp/src/io/parquet/ipc/Schema_generated.h| + ^cpp/src/io/parquet/ipc/Message_generated.h| + ^cpp/include/cudf_test/cxxopts.hpp| + ) - repo: https://github.com/sirosen/texthooks rev: 0.6.6 hooks: - id: fix-smartquotes exclude: | (?x)^( + ^cpp/src/io/parquet/ipc/Schema_generated.h| + ^cpp/src/io/parquet/ipc/Message_generated.h| ^cpp/include/cudf_test/cxxopts.hpp| ^python/cudf/cudf/tests/data/subword_tokenizer_data/.*| ^python/cudf/cudf/tests/text/test_text_methods.py diff --git a/cpp/include/cudf/ast/expressions.hpp 
b/cpp/include/cudf/ast/expressions.hpp index 26916e49012..918271e3e4f 100644 --- a/cpp/include/cudf/ast/expressions.hpp +++ b/cpp/include/cudf/ast/expressions.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -478,7 +478,10 @@ class operation : public expression { * * @return Vector of operands */ - std::vector> get_operands() const { return operands; } + [[nodiscard]] std::vector> get_operands() const + { + return operands; + } /** * @copydoc expression::accept diff --git a/cpp/include/cudf/column/column_device_view.cuh b/cpp/include/cudf/column/column_device_view.cuh index 19722d127cb..787e9c2c479 100644 --- a/cpp/include/cudf/column/column_device_view.cuh +++ b/cpp/include/cudf/column/column_device_view.cuh @@ -442,7 +442,7 @@ class alignas(16) column_device_view : public detail::column_device_view_base { * @return string_view instance representing this element at this index */ template )> - __device__ T element(size_type element_index) const noexcept + __device__ [[nodiscard]] T element(size_type element_index) const noexcept { size_type index = element_index + offset(); // account for this view's _offset char const* d_strings = static_cast(_data); @@ -501,7 +501,7 @@ class alignas(16) column_device_view : public detail::column_device_view_base { * @return dictionary32 instance representing this element at this index */ template )> - __device__ T element(size_type element_index) const noexcept + __device__ [[nodiscard]] T element(size_type element_index) const noexcept { size_type index = element_index + offset(); // account for this view's _offset auto const indices = d_children[0]; @@ -519,7 +519,7 @@ class alignas(16) column_device_view : public detail::column_device_view_base { * @return numeric::fixed_point representing the element at this index */ template ())> - __device__ T element(size_type element_index) const noexcept + __device__ [[nodiscard]] T element(size_type element_index) const noexcept { using namespace numeric; using rep = typename T::rep; @@ -858,7 +858,7 @@ class alignas(16) column_device_view : public detail::column_device_view_base { */ [[nodiscard]] __device__ device_span children() const noexcept { - return device_span(d_children, _num_children); + return {d_children, static_cast(_num_children)}; } /** @@ -1032,7 +1032,7 @@ class alignas(16) mutable_column_device_view : public detail::column_device_view * @return Reference to the element at the specified index */ template ())> - __device__ T& element(size_type element_index) const noexcept + __device__ [[nodiscard]] T& element(size_type element_index) const noexcept { return data()[element_index]; } diff --git a/cpp/include/cudf/detail/aggregation/aggregation.hpp b/cpp/include/cudf/detail/aggregation/aggregation.hpp index 87c0f8ec7f1..edee83783b8 100644 --- a/cpp/include/cudf/detail/aggregation/aggregation.hpp +++ b/cpp/include/cudf/detail/aggregation/aggregation.hpp @@ -24,6 +24,7 @@ #include #include +#include namespace cudf { namespace detail { @@ -510,7 +511,7 @@ class quantile_aggregation final : public groupby_aggregation, public reduce_agg void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); } private: - size_t hash_impl() const + [[nodiscard]] size_t hash_impl() const { return std::hash{}(static_cast(_interpolation)) ^ std::accumulate( @@ -596,7 +597,10 @@ class 
nunique_aggregation final : public groupby_aggregation, void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); } private: - size_t hash_impl() const { return std::hash{}(static_cast(_null_handling)); } + [[nodiscard]] size_t hash_impl() const + { + return std::hash{}(static_cast(_null_handling)); + } }; /** @@ -638,7 +642,7 @@ class nth_element_aggregation final : public groupby_aggregation, void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); } private: - size_t hash_impl() const + [[nodiscard]] size_t hash_impl() const { return std::hash{}(_n) ^ std::hash{}(static_cast(_null_handling)); } @@ -763,7 +767,10 @@ class collect_list_aggregation final : public rolling_aggregation, void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); } private: - size_t hash_impl() const { return std::hash{}(static_cast(_null_handling)); } + [[nodiscard]] size_t hash_impl() const + { + return std::hash{}(static_cast(_null_handling)); + } }; /** @@ -813,7 +820,7 @@ class collect_set_aggregation final : public rolling_aggregation, void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); } protected: - size_t hash_impl() const + [[nodiscard]] size_t hash_impl() const { return std::hash{}(static_cast(_null_handling) ^ static_cast(_nulls_equal) ^ static_cast(_nans_equal)); @@ -866,10 +873,10 @@ class lead_lag_aggregation final : public rolling_aggregation { class udf_aggregation final : public rolling_aggregation { public: udf_aggregation(aggregation::Kind type, - std::string const& user_defined_aggregator, + std::string user_defined_aggregator, data_type output_type) : aggregation{type}, - _source{user_defined_aggregator}, + _source{std::move(user_defined_aggregator)}, _operator_name{(type == aggregation::PTX) ? "rolling_udf_ptx" : "rolling_udf_cuda"}, _function_name{"rolling_udf"}, _output_type{output_type} @@ -973,7 +980,7 @@ class merge_sets_aggregation final : public groupby_aggregation, public reduce_a void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); } protected: - size_t hash_impl() const + [[nodiscard]] size_t hash_impl() const { return std::hash{}(static_cast(_nulls_equal) ^ static_cast(_nans_equal)); } @@ -1046,7 +1053,7 @@ class covariance_aggregation final : public groupby_aggregation { void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); } protected: - size_t hash_impl() const + [[nodiscard]] size_t hash_impl() const { return std::hash{}(_min_periods) ^ std::hash{}(_ddof); } @@ -1088,7 +1095,7 @@ class correlation_aggregation final : public groupby_aggregation { void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); } protected: - size_t hash_impl() const + [[nodiscard]] size_t hash_impl() const { return std::hash{}(static_cast(_type)) ^ std::hash{}(_min_periods); } diff --git a/cpp/include/cudf/detail/contiguous_split.hpp b/cpp/include/cudf/detail/contiguous_split.hpp index de00b61cdca..1467ed1aa67 100644 --- a/cpp/include/cudf/detail/contiguous_split.hpp +++ b/cpp/include/cudf/detail/contiguous_split.hpp @@ -104,7 +104,7 @@ class metadata_builder { * * @returns A vector containing the serialized column metadata */ - std::vector build() const; + [[nodiscard]] std::vector build() const; /** * @brief Clear the internal buffer containing all added metadata. 
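The udf_aggregation constructor above shows the second rewrite named in the commit message:
accept the string by value and move it into the member instead of copying from a const
reference. A minimal sketch of the pattern, on a hypothetical class rather than libcudf code:

```cpp
#include <string>
#include <utility>

class widget {
 public:
  // Before the rewrite: explicit widget(std::string const& name) : _name{name} {}
  // After: rvalue arguments are moved into the member (no copy); lvalue
  // arguments pay exactly one copy, made at the call site into `name`.
  explicit widget(std::string name) : _name{std::move(name)} {}

 private:
  std::string _name;
};
```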
diff --git a/cpp/include/cudf/detail/normalizing_iterator.cuh b/cpp/include/cudf/detail/normalizing_iterator.cuh index 32df13104e0..308fd188b09 100644 --- a/cpp/include/cudf/detail/normalizing_iterator.cuh +++ b/cpp/include/cudf/detail/normalizing_iterator.cuh @@ -51,7 +51,7 @@ struct alignas(16) base_normalator { */ CUDF_HOST_DEVICE inline Derived& operator++() { - Derived& derived = static_cast(*this); + auto& derived = static_cast(*this); derived.p_ += width_; return derived; } @@ -71,7 +71,7 @@ struct alignas(16) base_normalator { */ CUDF_HOST_DEVICE inline Derived& operator--() { - Derived& derived = static_cast(*this); + auto& derived = static_cast(*this); derived.p_ -= width_; return derived; } @@ -91,7 +91,7 @@ struct alignas(16) base_normalator { */ CUDF_HOST_DEVICE inline Derived& operator+=(difference_type offset) { - Derived& derived = static_cast(*this); + auto& derived = static_cast(*this); derived.p_ += offset * width_; return derived; } @@ -121,7 +121,7 @@ struct alignas(16) base_normalator { */ CUDF_HOST_DEVICE inline Derived& operator-=(difference_type offset) { - Derived& derived = static_cast(*this); + auto& derived = static_cast(*this); derived.p_ -= offset * width_; return derived; } diff --git a/cpp/include/cudf/detail/structs/utilities.hpp b/cpp/include/cudf/detail/structs/utilities.hpp index e736514ac29..beedc009c84 100644 --- a/cpp/include/cudf/detail/structs/utilities.hpp +++ b/cpp/include/cudf/detail/structs/utilities.hpp @@ -25,6 +25,8 @@ #include #include +#include + namespace cudf::structs::detail { enum class column_nullability { @@ -112,12 +114,12 @@ class flattened_table { * @param columns_ Newly allocated columns to back the table_view * @param nullable_data_ Newly generated temporary data that needs to be kept alive */ - flattened_table(table_view const& flattened_columns_, + flattened_table(table_view flattened_columns_, std::vector const& orders_, std::vector const& null_orders_, std::vector>&& columns_, temporary_nullable_data&& nullable_data_) - : _flattened_columns{flattened_columns_}, + : _flattened_columns{std::move(flattened_columns_)}, _orders{orders_}, _null_orders{null_orders_}, _columns{std::move(columns_)}, @@ -170,11 +172,11 @@ class flattened_table { * orders, flattened null precedence, alongside the supporting columns and device_buffers * for the flattened table. */ -[[nodiscard]] std::unique_ptr flatten_nested_columns( +[[nodiscard]] std::unique_ptr flatten_nested_columns( table_view const& input, - std::vector const& column_order, - std::vector const& null_precedence, - column_nullability nullability, + std::vector const& column_order, + std::vector const& null_precedence, + cudf::structs::detail::column_nullability nullability, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); @@ -194,11 +196,11 @@ class flattened_table { * @param mr Device memory resource used to allocate new device memory * @return A new column with potentially new null mask */ -[[nodiscard]] std::unique_ptr superimpose_nulls(bitmask_type const* null_mask, - size_type null_count, - std::unique_ptr&& input, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr); +[[nodiscard]] std::unique_ptr superimpose_nulls(bitmask_type const* null_mask, + cudf::size_type null_count, + std::unique_ptr&& input, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); /** * @brief Push down nulls from the given input column into its children columns, using bitwise AND. 
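The `[[nodiscard]]` additions running through these headers are what makes the change
potentially breaking: a caller that drops the return value now gets a warning, which -Werror
promotes to an error. A small sketch of the failure mode, with a hypothetical function:

```cpp
#include <vector>

[[nodiscard]] std::vector<int> make_values() { return {1, 2, 3}; }

void caller()
{
  make_values();  // warning: ignoring return value declared [[nodiscard]];
                  // fails the build under -Werror
  auto const values = make_values();  // fine: the result is consumed
  (void)values;
}
```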
diff --git a/cpp/include/cudf/detail/utilities/host_vector.hpp b/cpp/include/cudf/detail/utilities/host_vector.hpp index 6a115177ab5..2d14d0306cd 100644 --- a/cpp/include/cudf/detail/utilities/host_vector.hpp +++ b/cpp/include/cudf/detail/utilities/host_vector.hpp @@ -82,7 +82,7 @@ class rmm_host_allocator { using size_type = std::size_t; ///< The type used for the size of the allocation using difference_type = std::ptrdiff_t; ///< The type of the distance between two pointers - typedef cuda::std::true_type propagate_on_container_move_assignment; + using propagate_on_container_move_assignment = cuda::std::true_type; /** * @brief converts a `rmm_host_allocator` to `rmm_host_allocator` @@ -147,7 +147,7 @@ class rmm_host_allocator { * @return The maximum number of objects that may be allocated * by a single call to \p allocate(). */ - constexpr inline size_type max_size() const + [[nodiscard]] constexpr inline size_type max_size() const { return (std::numeric_limits::max)() / sizeof(T); } diff --git a/cpp/include/cudf/detail/utilities/stream_pool.hpp b/cpp/include/cudf/detail/utilities/stream_pool.hpp index e19cc3ec2f7..64c1d4ae514 100644 --- a/cpp/include/cudf/detail/utilities/stream_pool.hpp +++ b/cpp/include/cudf/detail/utilities/stream_pool.hpp @@ -73,7 +73,7 @@ class cuda_stream_pool { * * @return the number of stream objects in the pool */ - virtual std::size_t get_stream_pool_size() const = 0; + [[nodiscard]] virtual std::size_t get_stream_pool_size() const = 0; }; /** diff --git a/cpp/include/cudf/fixed_point/fixed_point.hpp b/cpp/include/cudf/fixed_point/fixed_point.hpp index e39d75757e8..6c3c3b4da07 100644 --- a/cpp/include/cudf/fixed_point/fixed_point.hpp +++ b/cpp/include/cudf/fixed_point/fixed_point.hpp @@ -291,14 +291,14 @@ class fixed_point { * * @return The underlying value of the `fixed_point` number */ - CUDF_HOST_DEVICE inline rep value() const { return _value; } + CUDF_HOST_DEVICE [[nodiscard]] inline rep value() const { return _value; } /** * @brief Method that returns the scale of the `fixed_point` number * * @return The scale of the `fixed_point` number */ - CUDF_HOST_DEVICE inline scale_type scale() const { return _scale; } + CUDF_HOST_DEVICE [[nodiscard]] inline scale_type scale() const { return _scale; } /** * @brief Explicit conversion operator to `bool` @@ -573,7 +573,7 @@ class fixed_point { * @param scale The `scale` of the returned `fixed_point` number * @return `fixed_point` number with a new `scale` */ - CUDF_HOST_DEVICE inline fixed_point rescaled(scale_type scale) const + CUDF_HOST_DEVICE [[nodiscard]] inline fixed_point rescaled(scale_type scale) const { if (scale == _scale) { return *this; } Rep const value = detail::shift(_value, scale_type{scale - _scale}); diff --git a/cpp/include/cudf/interop.hpp b/cpp/include/cudf/interop.hpp index f3ff0009d5c..56ec62fa6e1 100644 --- a/cpp/include/cudf/interop.hpp +++ b/cpp/include/cudf/interop.hpp @@ -40,6 +40,8 @@ #include +#include + struct DLManagedTensor; struct ArrowDeviceArray; @@ -121,7 +123,7 @@ struct column_metadata { * * @param _name Name of the column */ - column_metadata(std::string const& _name) : name(_name) {} + column_metadata(std::string _name) : name(std::move(_name)) {} column_metadata() = default; }; diff --git a/cpp/include/cudf/interop/detail/arrow.hpp b/cpp/include/cudf/interop/detail/arrow.hpp index 8043ecf5422..906d48f636b 100644 --- a/cpp/include/cudf/interop/detail/arrow.hpp +++ b/cpp/include/cudf/interop/detail/arrow.hpp @@ -24,8 +24,12 @@ #define ARROW_C_DEVICE_DATA_INTERFACE // Device 
type for the allocated memory -typedef int32_t ArrowDeviceType; +using ArrowDeviceType = int32_t; +// The Arrow spec specifies using macros rather than enums here to avoid being +// susceptible to changes in the underlying type chosen by the compiler, but +// clang-tidy doesn't like this. +// NOLINTBEGIN // CPU device, same as using ArrowArray directly #define ARROW_DEVICE_CPU 1 // CUDA GPU Device @@ -34,6 +38,7 @@ typedef int32_t ArrowDeviceType; #define ARROW_DEVICE_CUDA_HOST 3 // CUDA managed/unified memory allocated by cudaMallocManaged #define ARROW_DEVICE_CUDA_MANAGED 13 +// NOLINTEND struct ArrowDeviceArray { struct ArrowArray array; diff --git a/cpp/include/cudf/io/arrow_io_source.hpp b/cpp/include/cudf/io/arrow_io_source.hpp index 5f79f05c5a1..d7a48c34e12 100644 --- a/cpp/include/cudf/io/arrow_io_source.hpp +++ b/cpp/include/cudf/io/arrow_io_source.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -23,6 +23,7 @@ #include #include +#include namespace cudf::io { /** @@ -49,7 +50,10 @@ class arrow_io_source : public datasource { * * @param file The `arrow` object from which the data is read */ - explicit arrow_io_source(std::shared_ptr file) : arrow_file(file) {} + explicit arrow_io_source(std::shared_ptr file) + : arrow_file(std::move(file)) + { + } /** * @brief Returns a buffer with a subset of data from the `arrow` source. diff --git a/cpp/include/cudf/io/csv.hpp b/cpp/include/cudf/io/csv.hpp index a20f75cecd7..68bb7fba00e 100644 --- a/cpp/include/cudf/io/csv.hpp +++ b/cpp/include/cudf/io/csv.hpp @@ -27,6 +27,7 @@ #include #include #include +#include #include #include @@ -431,7 +432,8 @@ class csv_reader_options { * * @return Per-column types */ - std::variant, std::map> const& get_dtypes() const + [[nodiscard]] std::variant, std::map> const& + get_dtypes() const { return _dtypes; } @@ -441,49 +443,49 @@ class csv_reader_options { * * @return Additional values to recognize as boolean true values */ - std::vector const& get_true_values() const { return _true_values; } + [[nodiscard]] std::vector const& get_true_values() const { return _true_values; } /** * @brief Returns additional values to recognize as boolean false values. * * @return Additional values to recognize as boolean false values */ - std::vector const& get_false_values() const { return _false_values; } + [[nodiscard]] std::vector const& get_false_values() const { return _false_values; } /** * @brief Returns additional values to recognize as null values. * * @return Additional values to recognize as null values */ - std::vector const& get_na_values() const { return _na_values; } + [[nodiscard]] std::vector const& get_na_values() const { return _na_values; } /** * @brief Whether to keep the built-in default NA values. * * @return `true` if the built-in default NA values are kept */ - bool is_enabled_keep_default_na() const { return _keep_default_na; } + [[nodiscard]] bool is_enabled_keep_default_na() const { return _keep_default_na; } /** * @brief Whether to disable null filter. * * @return `true` if null filter is enabled */ - bool is_enabled_na_filter() const { return _na_filter; } + [[nodiscard]] bool is_enabled_na_filter() const { return _na_filter; } /** * @brief Whether to parse dates as DD/MM versus MM/DD. 
* * @return True if dates are parsed as DD/MM, false if MM/DD */ - bool is_enabled_dayfirst() const { return _dayfirst; } + [[nodiscard]] bool is_enabled_dayfirst() const { return _dayfirst; } /** * @brief Returns timestamp_type to which all timestamp columns will be cast. * * @return timestamp_type to which all timestamp columns will be cast */ - data_type get_timestamp_type() const { return _timestamp_type; } + [[nodiscard]] data_type get_timestamp_type() const { return _timestamp_type; } /** * @brief Sets compression format of the source. @@ -1399,8 +1401,8 @@ class csv_writer_options { * @param sink The sink used for writer output * @param table Table to be written to output */ - explicit csv_writer_options(sink_info const& sink, table_view const& table) - : _sink(sink), _table(table), _rows_per_chunk(table.num_rows()) + explicit csv_writer_options(sink_info sink, table_view const& table) + : _sink(std::move(sink)), _table(table), _rows_per_chunk(table.num_rows()) { } diff --git a/cpp/include/cudf/io/detail/parquet.hpp b/cpp/include/cudf/io/detail/parquet.hpp index 978216d971e..21c870cb75e 100644 --- a/cpp/include/cudf/io/detail/parquet.hpp +++ b/cpp/include/cudf/io/detail/parquet.hpp @@ -160,7 +160,7 @@ class chunked_reader : private reader { * destructor needs to be defined in a separate source file which can access to that object's * declaration. */ - ~chunked_reader(); + ~chunked_reader() override; /** * @copydoc cudf::io::chunked_parquet_reader::has_next diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp index 65ba8f25577..8de690482f9 100644 --- a/cpp/include/cudf/io/json.hpp +++ b/cpp/include/cudf/io/json.hpp @@ -26,6 +26,7 @@ #include #include +#include #include #include @@ -166,9 +167,9 @@ class json_reader_options { * * @returns Data types of the columns */ - std::variant, - std::map, - std::map> const& + [[nodiscard]] std::variant, + std::map, + std::map> const& get_dtypes() const { return _dtypes; @@ -179,28 +180,28 @@ class json_reader_options { * * @return Compression format of the source */ - compression_type get_compression() const { return _compression; } + [[nodiscard]] compression_type get_compression() const { return _compression; } /** * @brief Returns number of bytes to skip from source start. * * @return Number of bytes to skip from source start */ - size_t get_byte_range_offset() const { return _byte_range_offset; } + [[nodiscard]] size_t get_byte_range_offset() const { return _byte_range_offset; } /** * @brief Returns number of bytes to read. * * @return Number of bytes to read */ - size_t get_byte_range_size() const { return _byte_range_size; } + [[nodiscard]] size_t get_byte_range_size() const { return _byte_range_size; } /** * @brief Returns number of bytes to read with padding. 
* * @return Number of bytes to read with padding */ - size_t get_byte_range_size_with_padding() const + [[nodiscard]] size_t get_byte_range_size_with_padding() const { if (_byte_range_size == 0) { return 0; @@ -214,7 +215,7 @@ class json_reader_options { * * @return Number of bytes to pad */ - size_t get_byte_range_padding() const + [[nodiscard]] size_t get_byte_range_padding() const { auto const num_columns = std::visit([](auto const& dtypes) { return dtypes.size(); }, _dtypes); @@ -236,67 +237,68 @@ class json_reader_options { * * @return Delimiter separating records in JSON lines */ - char get_delimiter() const { return _delimiter; } + [[nodiscard]] char get_delimiter() const { return _delimiter; } /** * @brief Whether to read the file as a json object per line. * * @return `true` if reading the file as a json object per line */ - bool is_enabled_lines() const { return _lines; } + [[nodiscard]] bool is_enabled_lines() const { return _lines; } /** * @brief Whether to parse mixed types as a string column. * * @return `true` if mixed types are parsed as a string column */ - bool is_enabled_mixed_types_as_string() const { return _mixed_types_as_string; } + [[nodiscard]] bool is_enabled_mixed_types_as_string() const { return _mixed_types_as_string; } /** * @brief Whether to prune columns on read, selected based on the @ref set_dtypes option. * * When set as true, if the reader options include @ref set_dtypes, then * the reader will only return those columns which are mentioned in @ref set_dtypes. - * If false, then all columns are returned, independent of the @ref set_dtypes setting. + * If false, then all columns are returned, independent of the @ref set_dtypes + * setting. * * @return True if column pruning is enabled */ - bool is_enabled_prune_columns() const { return _prune_columns; } + [[nodiscard]] bool is_enabled_prune_columns() const { return _prune_columns; } /** * @brief Whether to parse dates as DD/MM versus MM/DD. * * @returns true if dates are parsed as DD/MM, false if MM/DD */ - bool is_enabled_dayfirst() const { return _dayfirst; } + [[nodiscard]] bool is_enabled_dayfirst() const { return _dayfirst; } /** * @brief Whether the reader should keep quotes of string values. * * @returns true if the reader should keep quotes, false otherwise */ - bool is_enabled_keep_quotes() const { return _keep_quotes; } + [[nodiscard]] bool is_enabled_keep_quotes() const { return _keep_quotes; } /** * @brief Whether the reader should normalize single quotes around strings * * @returns true if the reader should normalize single quotes, false otherwise */ - bool is_enabled_normalize_single_quotes() const { return _normalize_single_quotes; } + [[nodiscard]] bool is_enabled_normalize_single_quotes() const { return _normalize_single_quotes; } /** * @brief Whether the reader should normalize unquoted whitespace characters * * @returns true if the reader should normalize whitespace, false otherwise */ - bool is_enabled_normalize_whitespace() const { return _normalize_whitespace; } + [[nodiscard]] bool is_enabled_normalize_whitespace() const { return _normalize_whitespace; } /** * @brief Queries the JSON reader's behavior on invalid JSON lines. * * @returns An enum that specifies the JSON reader's behavior on invalid JSON lines. */ - json_recovery_mode_t recovery_mode() const { return _recovery_mode; } + [[nodiscard]] json_recovery_mode_t recovery_mode() const { return _recovery_mode; } /** * @brief Set data types for columns to be read. 
@@ -717,8 +719,8 @@ class json_writer_options { * @param sink The sink used for writer output * @param table Table to be written to output */ - explicit json_writer_options(sink_info const& sink, table_view const& table) - : _sink(sink), _table(table), _rows_per_chunk(table.num_rows()) + explicit json_writer_options(sink_info sink, table_view table) + : _sink(std::move(sink)), _table(std::move(table)), _rows_per_chunk(table.num_rows()) { } diff --git a/cpp/include/cudf/io/orc.hpp b/cpp/include/cudf/io/orc.hpp index 8140f8897b7..623c1d9fc72 100644 --- a/cpp/include/cudf/io/orc.hpp +++ b/cpp/include/cudf/io/orc.hpp @@ -28,6 +28,7 @@ #include #include #include +#include #include namespace cudf { @@ -125,7 +126,7 @@ class orc_reader_options { * * @return Number of rows to skip from the start */ - int64_t get_skip_rows() const { return _skip_rows; } + [[nodiscard]] int64_t get_skip_rows() const { return _skip_rows; } /** * @brief Returns number of row to read. @@ -133,35 +134,38 @@ class orc_reader_options { * @return Number of rows to read; `nullopt` if the option hasn't been set (in which case the file * is read until the end) */ - std::optional const& get_num_rows() const { return _num_rows; } + [[nodiscard]] std::optional const& get_num_rows() const { return _num_rows; } /** * @brief Whether to use row index to speed-up reading. * * @return `true` if row index is used to speed-up reading */ - bool is_enabled_use_index() const { return _use_index; } + [[nodiscard]] bool is_enabled_use_index() const { return _use_index; } /** * @brief Whether to use numpy-compatible dtypes. * * @return `true` if numpy-compatible dtypes are used */ - bool is_enabled_use_np_dtypes() const { return _use_np_dtypes; } + [[nodiscard]] bool is_enabled_use_np_dtypes() const { return _use_np_dtypes; } /** * @brief Returns timestamp type to which timestamp column will be cast. * * @return Timestamp type to which timestamp column will be cast */ - data_type get_timestamp_type() const { return _timestamp_type; } + [[nodiscard]] data_type get_timestamp_type() const { return _timestamp_type; } /** * @brief Returns fully qualified names of columns that should be read as 128-bit Decimal. 
* * @return Fully qualified names of columns that should be read as 128-bit Decimal */ - std::vector const& get_decimal128_columns() const { return _decimal128_columns; } + [[nodiscard]] std::vector const& get_decimal128_columns() const + { + return _decimal128_columns; + } // Setters @@ -603,8 +607,8 @@ class orc_writer_options { * @param sink The sink used for writer output * @param table Table to be written to output */ - explicit orc_writer_options(sink_info const& sink, table_view const& table) - : _sink(sink), _table(table) + explicit orc_writer_options(sink_info sink, table_view table) + : _sink(std::move(sink)), _table(std::move(table)) { } @@ -676,7 +680,7 @@ class orc_writer_options { * * @return Row index stride */ - auto get_row_index_stride() const + [[nodiscard]] auto get_row_index_stride() const { auto const unaligned_stride = std::min(_row_index_stride, get_stripe_size_rows()); return unaligned_stride - unaligned_stride % 8; @@ -1048,7 +1052,7 @@ class chunked_orc_writer_options { * * @param sink The sink used for writer output */ - chunked_orc_writer_options(sink_info const& sink) : _sink(sink) {} + chunked_orc_writer_options(sink_info sink) : _sink(std::move(sink)) {} public: /** @@ -1107,7 +1111,7 @@ class chunked_orc_writer_options { * * @return Row index stride */ - auto get_row_index_stride() const + [[nodiscard]] auto get_row_index_stride() const { auto const unaligned_stride = std::min(_row_index_stride, get_stripe_size_rows()); return unaligned_stride - unaligned_stride % 8; diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp index 51eeed5b721..431f14af522 100644 --- a/cpp/include/cudf/io/parquet.hpp +++ b/cpp/include/cudf/io/parquet.hpp @@ -187,7 +187,7 @@ class parquet_reader_options { * * @return Timestamp type used to cast timestamp columns */ - data_type get_timestamp_type() const { return _timestamp_type; } + [[nodiscard]] data_type get_timestamp_type() const { return _timestamp_type; } /** * @brief Sets names of the columns to be read. 
@@ -626,7 +626,7 @@ class parquet_writer_options_base { * * @param sink The sink used for writer output */ - explicit parquet_writer_options_base(sink_info const& sink) : _sink(sink) {} + explicit parquet_writer_options_base(sink_info sink) : _sink(std::move(sink)) {} public: /** @@ -1287,7 +1287,7 @@ class chunked_parquet_writer_options : public parquet_writer_options_base { * * @param sink Sink used for writer output */ - explicit chunked_parquet_writer_options(sink_info const& sink); + explicit chunked_parquet_writer_options(sink_info sink); friend chunked_parquet_writer_options_builder; diff --git a/cpp/include/cudf/io/types.hpp b/cpp/include/cudf/io/types.hpp index 150e997f533..0dab1c606de 100644 --- a/cpp/include/cudf/io/types.hpp +++ b/cpp/include/cudf/io/types.hpp @@ -30,6 +30,7 @@ #include #include #include +#include #include namespace cudf { @@ -247,10 +248,10 @@ struct column_name_info { * @param _is_nullable True if column is nullable * @param _is_binary True if column is binary data */ - column_name_info(std::string const& _name, + column_name_info(std::string _name, std::optional _is_nullable = std::nullopt, std::optional _is_binary = std::nullopt) - : name(_name), is_nullable(_is_nullable), is_binary(_is_binary) + : name(std::move(_name)), is_nullable(_is_nullable), is_binary(_is_binary) { } diff --git a/cpp/include/cudf/join.hpp b/cpp/include/cudf/join.hpp index 825f758adbd..ba485bd6372 100644 --- a/cpp/include/cudf/join.hpp +++ b/cpp/include/cudf/join.hpp @@ -336,8 +336,8 @@ class hash_join { * the result of performing an inner join between two tables with `build` and `probe` * as the join keys . */ - std::pair>, - std::unique_ptr>> + [[nodiscard]] std::pair>, + std::unique_ptr>> inner_join(cudf::table_view const& probe, std::optional output_size = {}, rmm::cuda_stream_view stream = cudf::get_default_stream(), @@ -359,10 +359,10 @@ class hash_join { * * @return A pair of columns [`left_indices`, `right_indices`] that can be used to construct * the result of performing a left join between two tables with `build` and `probe` - * as the join keys . + * as the join keys. */ - std::pair>, - std::unique_ptr>> + [[nodiscard]] std::pair>, + std::unique_ptr>> left_join(cudf::table_view const& probe, std::optional output_size = {}, rmm::cuda_stream_view stream = cudf::get_default_stream(), @@ -386,8 +386,8 @@ class hash_join { * the result of performing a full join between two tables with `build` and `probe` * as the join keys . */ - std::pair>, - std::unique_ptr>> + [[nodiscard]] std::pair>, + std::unique_ptr>> full_join(cudf::table_view const& probe, std::optional output_size = {}, rmm::cuda_stream_view stream = cudf::get_default_stream(), @@ -440,7 +440,7 @@ class hash_join { * @return The exact number of output when performing a full join between two tables with `build` * and `probe` as the join keys . */ - std::size_t full_join_size( + [[nodiscard]] std::size_t full_join_size( cudf::table_view const& probe, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()) const; @@ -492,12 +492,12 @@ class distinct_hash_join { * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned indices' device memory. * - * @return A pair of columns [`build_indices`, `probe_indices`] that can be used to construct - * the result of performing an inner join between two tables with `build` and `probe` - * as the join keys. 
+ * @return A pair of columns [`build_indices`, `probe_indices`] that can be used to + * construct the result of performing an inner join between two tables + * with `build` and `probe` as the join keys. */ - std::pair>, - std::unique_ptr>> + [[nodiscard]] std::pair>, + std::unique_ptr>> inner_join(rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()) const; @@ -512,10 +512,11 @@ class distinct_hash_join { * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device * memory. - * @return A `build_indices` column that can be used to construct the result of performing a left - * join between two tables with `build` and `probe` as the join keys. + * @return A `build_indices` column that can be used to construct the result of + * performing a left join between two tables with `build` and `probe` as the join + * keys. */ - std::unique_ptr> left_join( + [[nodiscard]] std::unique_ptr> left_join( rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()) const; diff --git a/cpp/include/cudf/scalar/scalar.hpp b/cpp/include/cudf/scalar/scalar.hpp index da1d0d743a7..d78907b473a 100644 --- a/cpp/include/cudf/scalar/scalar.hpp +++ b/cpp/include/cudf/scalar/scalar.hpp @@ -187,7 +187,7 @@ class fixed_width_scalar : public scalar { * @param stream CUDA stream used for device memory operations. * @return Value of the scalar */ - T value(rmm::cuda_stream_view stream = cudf::get_default_stream()) const; + [[nodiscard]] T value(rmm::cuda_stream_view stream = cudf::get_default_stream()) const; /** * @brief Returns a raw pointer to the value in device memory. @@ -199,7 +199,7 @@ class fixed_width_scalar : public scalar { * @brief Returns a const raw pointer to the value in device memory. * @return A const raw pointer to the value in device memory */ - T const* data() const; + [[nodiscard]] T const* data() const; protected: rmm::device_scalar _data; ///< device memory containing the value @@ -245,8 +245,8 @@ class numeric_scalar : public detail::fixed_width_scalar { static_assert(is_numeric(), "Unexpected non-numeric type."); public: - numeric_scalar() = delete; - ~numeric_scalar() = default; + numeric_scalar() = delete; + ~numeric_scalar() override = default; /** * @brief Move constructor for numeric_scalar. @@ -393,7 +393,7 @@ class fixed_point_scalar : public scalar { * @param stream CUDA stream used for device memory operations. * @return The value of the scalar */ - rep_type value(rmm::cuda_stream_view stream = cudf::get_default_stream()) const; + [[nodiscard]] rep_type value(rmm::cuda_stream_view stream = cudf::get_default_stream()) const; /** * @brief Get the decimal32, decimal64 or decimal128. @@ -401,7 +401,8 @@ class fixed_point_scalar : public scalar { * @param stream CUDA stream used for device memory operations. * @return The decimal32, decimal64 or decimal128 value */ - T fixed_point_value(rmm::cuda_stream_view stream = cudf::get_default_stream()) const; + [[nodiscard]] T fixed_point_value( + rmm::cuda_stream_view stream = cudf::get_default_stream()) const; /** * @brief Explicit conversion operator to get the value of the scalar on the host. @@ -418,7 +419,7 @@ class fixed_point_scalar : public scalar { * @brief Returns a const raw pointer to the value in device memory. 
* @return a const raw pointer to the value in device memory */ - rep_type const* data() const; + [[nodiscard]] rep_type const* data() const; protected: rmm::device_scalar _data; ///< device memory containing the value @@ -565,8 +566,8 @@ class chrono_scalar : public detail::fixed_width_scalar { static_assert(is_chrono(), "Unexpected non-chrono type"); public: - chrono_scalar() = delete; - ~chrono_scalar() = default; + chrono_scalar() = delete; + ~chrono_scalar() override = default; /** * @brief Move constructor for chrono_scalar. diff --git a/cpp/include/cudf/strings/regex/regex_program.hpp b/cpp/include/cudf/strings/regex/regex_program.hpp index bdf541f455f..95c86ae0f8a 100644 --- a/cpp/include/cudf/strings/regex/regex_program.hpp +++ b/cpp/include/cudf/strings/regex/regex_program.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -74,35 +74,35 @@ struct regex_program { * * @return regex pattern as a string */ - std::string pattern() const; + [[nodiscard]] std::string pattern() const; /** * @brief Return the regex_flags used to create this instance * * @return regex flags setting */ - regex_flags flags() const; + [[nodiscard]] regex_flags flags() const; /** * @brief Return the capture_groups used to create this instance * * @return capture groups setting */ - capture_groups capture() const; + [[nodiscard]] capture_groups capture() const; /** * @brief Return the number of instructions in this instance * * @return Number of instructions */ - int32_t instructions_count() const; + [[nodiscard]] int32_t instructions_count() const; /** * @brief Return the number of capture groups in this instance * * @return Number of groups */ - int32_t groups_count() const; + [[nodiscard]] int32_t groups_count() const; /** * @brief Return the size of the working memory for the regex execution @@ -110,7 +110,7 @@ struct regex_program { * @param num_strings Number of strings for computation * @return Size of the working memory in bytes */ - std::size_t compute_working_memory_size(int32_t num_strings) const; + [[nodiscard]] std::size_t compute_working_memory_size(int32_t num_strings) const; ~regex_program(); diff --git a/cpp/include/cudf/strings/string_view.cuh b/cpp/include/cudf/strings/string_view.cuh index 74df1ea1887..93cc787683b 100644 --- a/cpp/include/cudf/strings/string_view.cuh +++ b/cpp/include/cudf/strings/string_view.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -110,7 +110,7 @@ static __constant__ char max_string_sentinel[5]{"\xF7\xBF\xBF\xBF"}; * * @return An empty string */ -CUDF_HOST_DEVICE inline string_view string_view::min() { return string_view(); } +CUDF_HOST_DEVICE inline string_view string_view::min() { return {}; } /** * @brief Return maximum value associated with the string type @@ -130,7 +130,7 @@ CUDF_HOST_DEVICE inline string_view string_view::max() CUDF_CUDA_TRY( cudaGetSymbolAddress((void**)&psentinel, cudf::strings::detail::max_string_sentinel)); #endif - return string_view(psentinel, 4); + return {psentinel, 4}; } __device__ inline size_type string_view::length() const @@ -439,7 +439,7 @@ __device__ inline string_view string_view::substr(size_type pos, size_type count auto const itr = begin() + pos; auto const spos = itr.byte_offset(); auto const epos = count >= 0 ? (itr + count).byte_offset() : size_bytes(); - return string_view(data() + spos, epos - spos); + return {data() + spos, epos - spos}; } __device__ inline size_type string_view::character_offset(size_type bytepos) const diff --git a/cpp/include/cudf/table/table.hpp b/cpp/include/cudf/table/table.hpp index 8efe6eb8c72..c4f14af53fb 100644 --- a/cpp/include/cudf/table/table.hpp +++ b/cpp/include/cudf/table/table.hpp @@ -144,7 +144,7 @@ class table { */ template - table_view select(InputIterator begin, InputIterator end) const + [[nodiscard]] table_view select(InputIterator begin, InputIterator end) const { std::vector columns(std::distance(begin, end)); std::transform( diff --git a/cpp/include/cudf/table/table_view.hpp b/cpp/include/cudf/table/table_view.hpp index ad12b1eef4e..a71e0558dec 100644 --- a/cpp/include/cudf/table/table_view.hpp +++ b/cpp/include/cudf/table/table_view.hpp @@ -123,7 +123,7 @@ class table_view_base { * @param column_index The index of the desired column * @return A reference to the desired column */ - ColumnView const& column(size_type column_index) const; + [[nodiscard]] ColumnView const& column(size_type column_index) const; /** * @brief Returns the number of columns @@ -224,7 +224,7 @@ class table_view : public detail::table_view_base { * specified by the elements of `column_indices` */ template - table_view select(InputIterator begin, InputIterator end) const + [[nodiscard]] table_view select(InputIterator begin, InputIterator end) const { std::vector columns(std::distance(begin, end)); std::transform(begin, end, columns.begin(), [this](auto index) { return this->column(index); }); diff --git a/cpp/include/cudf/utilities/error.hpp b/cpp/include/cudf/utilities/error.hpp index 719d44a9ab3..f019f516b84 100644 --- a/cpp/include/cudf/utilities/error.hpp +++ b/cpp/include/cudf/utilities/error.hpp @@ -48,7 +48,7 @@ struct stacktrace_recorder { * * @return The pointer to a null-terminated string storing the output stacktrace */ - char const* stacktrace() const { return _stacktrace.c_str(); } + [[nodiscard]] char const* stacktrace() const { return _stacktrace.c_str(); } protected: std::string const _stacktrace; //!< The whole stacktrace stored as one string. @@ -78,7 +78,7 @@ struct logic_error : public std::logic_error, public stacktrace_recorder { // TODO Add an error code member? 
This would be useful for translating an // exception to an error code in a pure-C API - ~logic_error() + ~logic_error() override { // Needed so that the first instance of the implicit destructor for any TU isn't 'constructed' // from a host+device function marking the implicit version also as host+device @@ -106,7 +106,7 @@ struct cuda_error : public std::runtime_error, public stacktrace_recorder { * * @return CUDA error code */ - cudaError_t error_code() const { return _cudaError; } + [[nodiscard]] cudaError_t error_code() const { return _cudaError; } protected: cudaError_t _cudaError; //!< CUDA error code @@ -237,7 +237,7 @@ inline void throw_cuda_error(cudaError_t error, char const* file, unsigned int l // Calls cudaGetLastError to clear the error status. It is nearly certain that a fatal error // occurred if it still returns the same error after a cleanup. cudaGetLastError(); - auto const last = cudaFree(0); + auto const last = cudaFree(nullptr); auto const msg = std::string{"CUDA error encountered at: " + std::string{file} + ":" + std::to_string(line) + ": " + std::to_string(error) + " " + cudaGetErrorName(error) + " " + cudaGetErrorString(error)}; diff --git a/cpp/include/cudf/utilities/span.hpp b/cpp/include/cudf/utilities/span.hpp index 47e92d61a9f..3b35e60e034 100644 --- a/cpp/include/cudf/utilities/span.hpp +++ b/cpp/include/cudf/utilities/span.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -28,6 +28,7 @@ #include #include #include +#include namespace cudf { /** @@ -90,7 +91,7 @@ class span_base { * * @return Reference to the first element in the span */ - constexpr reference front() const { return _data[0]; } + [[nodiscard]] constexpr reference front() const { return _data[0]; } // not noexcept due to undefined behavior when size = 0 /** * @brief Returns a reference to the last element in the span. @@ -99,7 +100,7 @@ class span_base { * * @return Reference to the last element in the span */ - constexpr reference back() const { return _data[_size - 1]; } + [[nodiscard]] constexpr reference back() const { return _data[_size - 1]; } // not noexcept due to undefined behavior when idx < 0 || idx >= size /** * @brief Returns a reference to the idx-th element of the sequence. @@ -119,7 +120,7 @@ class span_base { * * @return An iterator to the first element of the span */ - constexpr iterator begin() const noexcept { return _data; } + [[nodiscard]] constexpr iterator begin() const noexcept { return _data; } /** * @brief Returns an iterator to the element following the last element of the span. * @@ -127,13 +128,13 @@ class span_base { * * @return An iterator to the element following the last element of the span */ - constexpr iterator end() const noexcept { return _data + _size; } + [[nodiscard]] constexpr iterator end() const noexcept { return _data + _size; } /** * @brief Returns a pointer to the beginning of the sequence. * * @return A pointer to the first element of the span */ - constexpr pointer data() const noexcept { return _data; } + [[nodiscard]] constexpr pointer data() const noexcept { return _data; } /** * @brief Returns the number of elements in the span. @@ -160,7 +161,10 @@ class span_base { * @param count Number of elements from the beginning of this span to put in the subspan. 
* @return A subspan of the first N elements of the sequence */ - constexpr Derived first(size_type count) const noexcept { return Derived(_data, count); } + [[nodiscard]] constexpr Derived first(size_type count) const noexcept + { + return Derived(_data, count); + } /** * @brief Obtains a subspan consisting of the last N elements of the sequence @@ -168,7 +172,7 @@ class span_base { * @param count Number of elements from the end of this span to put in the subspan * @return A subspan of the last N elements of the sequence */ - constexpr Derived last(size_type count) const noexcept + [[nodiscard]] constexpr Derived last(size_type count) const noexcept { return Derived(_data + _size - count, count); } @@ -180,7 +184,7 @@ class span_base { * @param count The number of elements in the subspan * @return A subspan of the sequence, of requested count and offset */ - constexpr Derived subspan(size_type offset, size_type count) const noexcept + [[nodiscard]] constexpr Derived subspan(size_type offset, size_type count) const noexcept { return Derived(_data + offset, count); } @@ -365,7 +369,7 @@ class base_2dspan { * @param data Pointer to the data * @param size Size of the 2D span as pair */ - base_2dspan(T* data, size_type size) noexcept : _data{data}, _size{size} {} + base_2dspan(T* data, size_type size) noexcept : _data{data}, _size{std::move(size)} {} /** * @brief Returns a pointer to the beginning of the sequence. diff --git a/cpp/include/cudf/utilities/thread_pool.hpp b/cpp/include/cudf/utilities/thread_pool.hpp index 74a2531710b..c8c3eb097c4 100644 --- a/cpp/include/cudf/utilities/thread_pool.hpp +++ b/cpp/include/cudf/utilities/thread_pool.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -201,8 +201,8 @@ class thread_pool { running = false; destroy_threads(); thread_count = _thread_count ? _thread_count : std::thread::hardware_concurrency(); - threads.reset(new std::thread[thread_count]); - paused = was_paused; + threads = std::make_unique(thread_count); + paused = was_paused; create_threads(); running = true; } diff --git a/cpp/include/cudf/wrappers/dictionary.hpp b/cpp/include/cudf/wrappers/dictionary.hpp index 37264c5a33c..95f4ac00a53 100644 --- a/cpp/include/cudf/wrappers/dictionary.hpp +++ b/cpp/include/cudf/wrappers/dictionary.hpp @@ -87,7 +87,7 @@ struct dictionary_wrapper { * * @return The value of this dictionary wrapper */ - CUDF_HOST_DEVICE inline value_type value() const { return _value; } + CUDF_HOST_DEVICE [[nodiscard]] inline value_type value() const { return _value; } /** * @brief Returns the maximum value of the value type. diff --git a/cpp/include/cudf/wrappers/durations.hpp b/cpp/include/cudf/wrappers/durations.hpp index 62aa22c2788..840dba4f4ba 100644 --- a/cpp/include/cudf/wrappers/durations.hpp +++ b/cpp/include/cudf/wrappers/durations.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -56,13 +56,13 @@ using duration_us = cuda::std::chrono::duration; -static_assert(sizeof(duration_D) == sizeof(typename duration_D::rep), ""); -static_assert(sizeof(duration_h) == sizeof(typename duration_h::rep), ""); -static_assert(sizeof(duration_m) == sizeof(typename duration_m::rep), ""); -static_assert(sizeof(duration_s) == sizeof(typename duration_s::rep), ""); -static_assert(sizeof(duration_ms) == sizeof(typename duration_ms::rep), ""); -static_assert(sizeof(duration_us) == sizeof(typename duration_us::rep), ""); -static_assert(sizeof(duration_ns) == sizeof(typename duration_ns::rep), ""); +static_assert(sizeof(duration_D) == sizeof(typename duration_D::rep)); +static_assert(sizeof(duration_h) == sizeof(typename duration_h::rep)); +static_assert(sizeof(duration_m) == sizeof(typename duration_m::rep)); +static_assert(sizeof(duration_s) == sizeof(typename duration_s::rep)); +static_assert(sizeof(duration_ms) == sizeof(typename duration_ms::rep)); +static_assert(sizeof(duration_us) == sizeof(typename duration_us::rep)); +static_assert(sizeof(duration_ns) == sizeof(typename duration_ns::rep)); /** @} */ // end of group } // namespace cudf diff --git a/cpp/include/cudf/wrappers/timestamps.hpp b/cpp/include/cudf/wrappers/timestamps.hpp index 0341ac6ede4..5194a3e8f96 100644 --- a/cpp/include/cudf/wrappers/timestamps.hpp +++ b/cpp/include/cudf/wrappers/timestamps.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -73,13 +73,13 @@ using timestamp_us = detail::timestamp; */ using timestamp_ns = detail::timestamp; -static_assert(sizeof(timestamp_D) == sizeof(typename timestamp_D::rep), ""); -static_assert(sizeof(timestamp_h) == sizeof(typename timestamp_h::rep), ""); -static_assert(sizeof(timestamp_m) == sizeof(typename timestamp_m::rep), ""); -static_assert(sizeof(timestamp_s) == sizeof(typename timestamp_s::rep), ""); -static_assert(sizeof(timestamp_ms) == sizeof(typename timestamp_ms::rep), ""); -static_assert(sizeof(timestamp_us) == sizeof(typename timestamp_us::rep), ""); -static_assert(sizeof(timestamp_ns) == sizeof(typename timestamp_ns::rep), ""); +static_assert(sizeof(timestamp_D) == sizeof(typename timestamp_D::rep)); +static_assert(sizeof(timestamp_h) == sizeof(typename timestamp_h::rep)); +static_assert(sizeof(timestamp_m) == sizeof(typename timestamp_m::rep)); +static_assert(sizeof(timestamp_s) == sizeof(typename timestamp_s::rep)); +static_assert(sizeof(timestamp_ms) == sizeof(typename timestamp_ms::rep)); +static_assert(sizeof(timestamp_us) == sizeof(typename timestamp_us::rep)); +static_assert(sizeof(timestamp_ns) == sizeof(typename timestamp_ns::rep)); /** @} */ // end of group } // namespace cudf diff --git a/cpp/include/cudf_test/base_fixture.hpp b/cpp/include/cudf_test/base_fixture.hpp index 18f75bbc842..0e35ff64af4 100644 --- a/cpp/include/cudf_test/base_fixture.hpp +++ b/cpp/include/cudf_test/base_fixture.hpp @@ -66,7 +66,7 @@ class BaseFixtureWithParam : public ::testing::TestWithParam { * all tests inheriting from this fixture * @return pointer to memory resource */ - rmm::device_async_resource_ref mr() const { return _mr; } + [[nodiscard]] rmm::device_async_resource_ref mr() const { return _mr; } }; /** diff --git a/cpp/include/cudf_test/column_wrapper.hpp b/cpp/include/cudf_test/column_wrapper.hpp index dc873658abf..47d17988775 100644 --- 
a/cpp/include/cudf_test/column_wrapper.hpp +++ b/cpp/include/cudf_test/column_wrapper.hpp @@ -1121,14 +1121,20 @@ class dictionary_column_wrapper : public detail::column_wrapper { * * @return column_view to keys column */ - column_view keys() const { return cudf::dictionary_column_view{wrapped->view()}.keys(); } + [[nodiscard]] column_view keys() const + { + return cudf::dictionary_column_view{wrapped->view()}.keys(); + } /** * @brief Access indices column view * * @return column_view to indices column */ - column_view indices() const { return cudf::dictionary_column_view{wrapped->view()}.indices(); } + [[nodiscard]] column_view indices() const + { + return cudf::dictionary_column_view{wrapped->view()}.indices(); + } /** * @brief Default constructor initializes an empty dictionary column of strings @@ -1792,7 +1798,10 @@ class lists_column_wrapper : public detail::column_wrapper { return {std::move(cols), std::move(stubs)}; } - column_view get_view() const { return root ? lists_column_view(*wrapped).child() : *wrapped; } + [[nodiscard]] column_view get_view() const + { + return root ? lists_column_view(*wrapped).child() : *wrapped; + } int depth = 0; bool root = false; diff --git a/cpp/include/cudf_test/stream_checking_resource_adaptor.hpp b/cpp/include/cudf_test/stream_checking_resource_adaptor.hpp index cafde6ca7d5..5a077e86a0f 100644 --- a/cpp/include/cudf_test/stream_checking_resource_adaptor.hpp +++ b/cpp/include/cudf_test/stream_checking_resource_adaptor.hpp @@ -110,7 +110,7 @@ class stream_checking_resource_adaptor final : public rmm::mr::device_memory_res * @param other The other resource to compare to * @return Whether or not the two resources are equivalent */ - bool do_is_equal(device_memory_resource const& other) const noexcept override + [[nodiscard]] bool do_is_equal(device_memory_resource const& other) const noexcept override { if (this == &other) { return true; } auto cast = dynamic_cast const*>(&other); diff --git a/cpp/src/binaryop/binaryop.cpp b/cpp/src/binaryop/binaryop.cpp index ac31f9045fe..8ac1491547d 100644 --- a/cpp/src/binaryop/binaryop.cpp +++ b/cpp/src/binaryop/binaryop.cpp @@ -153,7 +153,7 @@ void binary_operation(mutable_column_view& out, cudf::jit::get_program_cache(*binaryop_jit_kernel_cu_jit) .get_kernel(kernel_name, {}, {{"binaryop/jit/operation-udf.hpp", cuda_source}}, {"-arch=sm_."}) - ->configure_1d_max_occupancy(0, 0, 0, stream.value()) + ->configure_1d_max_occupancy(0, 0, nullptr, stream.value()) ->launch(out.size(), cudf::jit::get_data_ptr(out), cudf::jit::get_data_ptr(lhs), diff --git a/cpp/src/binaryop/compiled/operation.cuh b/cpp/src/binaryop/compiled/operation.cuh index 43b4bd232c4..57113785a29 100644 --- a/cpp/src/binaryop/compiled/operation.cuh +++ b/cpp/src/binaryop/compiled/operation.cuh @@ -173,8 +173,8 @@ struct PMod { __device__ inline auto operator()(TypeLhs x, TypeRhs y) { using common_t = std::common_type_t; - common_t xconv = static_cast(x); - common_t yconv = static_cast(y); + auto xconv = static_cast(x); + auto yconv = static_cast(y); auto rem = xconv % yconv; if constexpr (std::is_signed_v) if (rem < 0) rem = (rem + yconv) % yconv; @@ -188,8 +188,8 @@ struct PMod { __device__ inline auto operator()(TypeLhs x, TypeRhs y) { using common_t = std::common_type_t; - common_t xconv = static_cast(x); - common_t yconv = static_cast(y); + auto xconv = static_cast(x); + auto yconv = static_cast(y); auto rem = std::fmod(xconv, yconv); if (rem < 0) rem = std::fmod(rem + yconv, yconv); return rem; diff --git a/cpp/src/binaryop/compiled/util.cpp 
b/cpp/src/binaryop/compiled/util.cpp index 02f4e480ecb..2b6a4f58895 100644 --- a/cpp/src/binaryop/compiled/util.cpp +++ b/cpp/src/binaryop/compiled/util.cpp @@ -123,7 +123,7 @@ struct is_supported_operation_functor { template struct nested_support_functor { template - inline constexpr bool call(data_type out_type) const + [[nodiscard]] inline constexpr bool call(data_type out_type) const { return is_binary_operation_supported{}.template operator()( out_type); @@ -163,7 +163,7 @@ struct is_supported_operation_functor { }; template - inline constexpr bool bool_op(data_type out) const + [[nodiscard]] inline constexpr bool bool_op(data_type out) const { return out.id() == type_id::BOOL8 and is_binary_operation_supported{}.template operator()(); diff --git a/cpp/src/copying/pack.cpp b/cpp/src/copying/pack.cpp index b0208a58896..819ad593c0a 100644 --- a/cpp/src/copying/pack.cpp +++ b/cpp/src/copying/pack.cpp @@ -181,7 +181,7 @@ class metadata_builder_impl { col_type, col_size, col_null_count, data_offset, null_mask_offset, num_children); } - std::vector build() const + [[nodiscard]] std::vector build() const { auto output = std::vector(metadata.size() * sizeof(detail::serialized_column)); std::memcpy(output.data(), metadata.data(), output.size()); diff --git a/cpp/src/datetime/timezone.cpp b/cpp/src/datetime/timezone.cpp index a3471485293..1b0d201501b 100644 --- a/cpp/src/datetime/timezone.cpp +++ b/cpp/src/datetime/timezone.cpp @@ -221,7 +221,7 @@ class posix_parser { /** * @brief Returns the remaining number of characters in the input. */ - auto remaining_char_cnt() const { return end - cur; } + [[nodiscard]] auto remaining_char_cnt() const { return end - cur; } /** * @brief Returns the next character in the input. diff --git a/cpp/src/interop/arrow_utilities.cpp b/cpp/src/interop/arrow_utilities.cpp index 05beecfbf9b..dd9e9600a87 100644 --- a/cpp/src/interop/arrow_utilities.cpp +++ b/cpp/src/interop/arrow_utilities.cpp @@ -23,7 +23,7 @@ namespace cudf { namespace detail { -data_type arrow_to_cudf_type(const ArrowSchemaView* arrow_view) +data_type arrow_to_cudf_type(ArrowSchemaView const* arrow_view) { switch (arrow_view->type) { case NANOARROW_TYPE_NA: return data_type(type_id::EMPTY); diff --git a/cpp/src/interop/arrow_utilities.hpp b/cpp/src/interop/arrow_utilities.hpp index defddb4dc42..4e2628ab689 100644 --- a/cpp/src/interop/arrow_utilities.hpp +++ b/cpp/src/interop/arrow_utilities.hpp @@ -37,7 +37,7 @@ static constexpr int fixed_width_data_buffer_idx = 1; * @param arrow_view SchemaView to pull the logical and storage types from * @return Column type id */ -data_type arrow_to_cudf_type(const ArrowSchemaView* arrow_view); +data_type arrow_to_cudf_type(ArrowSchemaView const* arrow_view); /** * @brief Map cudf column type id to ArrowType id diff --git a/cpp/src/interop/detail/arrow_allocator.cpp b/cpp/src/interop/detail/arrow_allocator.cpp index 3e6a337457a..2a19a5360fe 100644 --- a/cpp/src/interop/detail/arrow_allocator.cpp +++ b/cpp/src/interop/detail/arrow_allocator.cpp @@ -38,7 +38,7 @@ T enable_hugepage(T&& buf) } #ifdef MADV_HUGEPAGE - const auto pagesize = sysconf(_SC_PAGESIZE); + auto const pagesize = sysconf(_SC_PAGESIZE); void* addr = const_cast(buf->data()); if (addr == nullptr) { return std::move(buf); } auto length{static_cast(buf->size())}; diff --git a/cpp/src/interop/from_arrow_host.cu b/cpp/src/interop/from_arrow_host.cu index 36bb35d9419..854a1d68fdc 100644 --- a/cpp/src/interop/from_arrow_host.cu +++ b/cpp/src/interop/from_arrow_host.cu @@ -140,7 +140,7 @@ std::unique_ptr 
dispatch_copy_from_arrow_host::operator()(ArrowSch bool skip_mask) { auto data_buffer = input->buffers[fixed_width_data_buffer_idx]; - const auto buffer_length = bitmask_allocation_size_bytes(input->length + input->offset); + auto const buffer_length = bitmask_allocation_size_bytes(input->length + input->offset); auto data = rmm::device_buffer(buffer_length, stream, mr); CUDF_CUDA_TRY(cudaMemcpyAsync(data.data(), @@ -322,7 +322,7 @@ template <> std::unique_ptr dispatch_copy_from_arrow_host::operator()( ArrowSchemaView* schema, ArrowArray const* input, data_type type, bool skip_mask) { - const void* offset_buffers[2] = {nullptr, input->buffers[fixed_width_data_buffer_idx]}; + void const* offset_buffers[2] = {nullptr, input->buffers[fixed_width_data_buffer_idx]}; ArrowArray offsets_array = { .length = input->offset + input->length + 1, .null_count = 0, diff --git a/cpp/src/io/avro/avro.cpp b/cpp/src/io/avro/avro.cpp index 221cdf93042..2041f03cd81 100644 --- a/cpp/src/io/avro/avro.cpp +++ b/cpp/src/io/avro/avro.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -485,8 +485,8 @@ std::string schema_parser::get_str() char const* cur = start; while (cur < m_end && *cur++ != '"') ; - int32_t len = static_cast(cur - start - 1); - m_cur = cur; + auto len = static_cast(cur - start - 1); + m_cur = cur; return s.assign(start, std::max(len, 0)); } diff --git a/cpp/src/io/comp/uncomp.cpp b/cpp/src/io/comp/uncomp.cpp index 3e5d966282d..ab516dd585d 100644 --- a/cpp/src/io/comp/uncomp.cpp +++ b/cpp/src/io/comp/uncomp.cpp @@ -305,7 +305,7 @@ std::vector decompress(compression_type compression, host_spannum_entries; i++) { - zip_cdfh_s const* cdfh = reinterpret_cast( + auto const* cdfh = reinterpret_cast( reinterpret_cast(za.cdfh) + cdfh_ofs); int cdfh_len = sizeof(zip_cdfh_s) + cdfh->fname_len + cdfh->extra_len + cdfh->comment_len; if (cdfh_ofs + cdfh_len > za.eocd->cdir_size || cdfh->sig != 0x0201'4b50) { @@ -314,8 +314,8 @@ std::vector decompress(compression_type compression, host_spancomp_method == 8 && cdfh->comp_size > 0 && cdfh->uncomp_size > 0) { - size_t lfh_ofs = cdfh->hdr_ofs; - zip_lfh_s const* lfh = reinterpret_cast(raw + lfh_ofs); + size_t lfh_ofs = cdfh->hdr_ofs; + auto const* lfh = reinterpret_cast(raw + lfh_ofs); if (lfh_ofs + sizeof(zip_lfh_s) <= src.size() && lfh->sig == 0x0403'4b50 && lfh_ofs + sizeof(zip_lfh_s) + lfh->fname_len + lfh->extra_len <= src.size()) { if (lfh->comp_method == 8 && lfh->comp_size > 0 && lfh->uncomp_size > 0) { @@ -340,7 +340,7 @@ std::vector decompress(compression_type compression, host_span 4) { - bz2_file_header_s const* fhdr = reinterpret_cast(raw); + auto const* fhdr = reinterpret_cast(raw); // Check for BZIP2 file signature "BZh1" to "BZh9" if (fhdr->sig[0] == 'B' && fhdr->sig[1] == 'Z' && fhdr->sig[2] == 'h' && fhdr->blksz >= '1' && fhdr->blksz <= '9') { diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index 1ed8ee5ce06..5daa55d4552 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -306,14 +306,14 @@ raw_orc_statistics read_raw_orc_statistics(source_info const& src_info, // Get file-level statistics, statistics of each column of file for (auto const& stats : metadata.ff.statistics) { - result.file_stats.push_back(std::string(stats.cbegin(), stats.cend())); + result.file_stats.emplace_back(stats.cbegin(), 
stats.cend()); } // Get stripe-level statistics for (auto const& stripes_stats : metadata.md.stripeStats) { result.stripes_stats.emplace_back(); for (auto const& stats : stripes_stats.colStats) { - result.stripes_stats.back().push_back(std::string(stats.cbegin(), stats.cend())); + result.stripes_stats.back().emplace_back(stats.cbegin(), stats.cend()); } } @@ -1026,8 +1026,8 @@ parquet_writer_options_builder& parquet_writer_options_builder::column_chunks_fi return *this; } -chunked_parquet_writer_options::chunked_parquet_writer_options(sink_info const& sink) - : parquet_writer_options_base(sink) +chunked_parquet_writer_options::chunked_parquet_writer_options(sink_info sink) + : parquet_writer_options_base(std::move(sink)) { } diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index b243e4ba006..031edfde4f6 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -245,7 +245,7 @@ struct TransduceToken { RelativeOffsetT const relative_offset, SymbolT const read_symbol) const { - const bool is_end_of_invalid_line = + bool const is_end_of_invalid_line = (state_id == static_cast(TT_INV) && match_id == static_cast(dfa_symbol_group_id::DELIMITER)); @@ -265,15 +265,15 @@ struct TransduceToken { // Number of tokens emitted on invalid lines constexpr int32_t num_inv_tokens = 2; - const bool is_delimiter = match_id == static_cast(dfa_symbol_group_id::DELIMITER); + bool const is_delimiter = match_id == static_cast(dfa_symbol_group_id::DELIMITER); // If state is either invalid or we're entering an invalid state, we discard tokens - const bool is_part_of_invalid_line = + bool const is_part_of_invalid_line = (match_id != static_cast(dfa_symbol_group_id::ERROR) && state_id == static_cast(TT_VLD)); // Indicates whether we transition from an invalid line to a potentially valid line - const bool is_end_of_invalid_line = (state_id == static_cast(TT_INV) && is_delimiter); + bool const is_end_of_invalid_line = (state_id == static_cast(TT_INV) && is_delimiter); int32_t const emit_count = is_end_of_invalid_line ? num_inv_tokens : (is_part_of_invalid_line && !is_delimiter ? 1 : 0); diff --git a/cpp/src/io/json/read_json.cu b/cpp/src/io/json/read_json.cu index df5c7bc21e1..e999be8f83a 100644 --- a/cpp/src/io/json/read_json.cu +++ b/cpp/src/io/json/read_json.cu @@ -85,7 +85,7 @@ device_span ingest_raw_input(device_span buffer, sources.end(), prefsum_source_sizes.begin(), std::plus{}, - [](const std::unique_ptr& s) { return s->size(); }); + [](std::unique_ptr const& s) { return s->size(); }); auto upper = std::upper_bound(prefsum_source_sizes.begin(), prefsum_source_sizes.end(), range_offset); size_t start_source = std::distance(prefsum_source_sizes.begin(), upper); diff --git a/cpp/src/io/orc/orc.hpp b/cpp/src/io/orc/orc.hpp index fd55cbb6846..e1403acd455 100644 --- a/cpp/src/io/orc/orc.hpp +++ b/cpp/src/io/orc/orc.hpp @@ -511,7 +511,7 @@ class ProtobufWriter { TypeKind kind, ColStatsBlob const* stats); - std::size_t size() const { return m_buff.size(); } + [[nodiscard]] std::size_t size() const { return m_buff.size(); } uint8_t const* data() { return m_buff.data(); } std::vector& buffer() { return m_buff; } diff --git a/cpp/src/io/orc/orc_field_writer.hpp b/cpp/src/io/orc/orc_field_writer.hpp index 4862562d526..731e9d7687e 100644 --- a/cpp/src/io/orc/orc_field_writer.hpp +++ b/cpp/src/io/orc/orc_field_writer.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -31,10 +31,10 @@ namespace io { namespace orc { struct ProtobufWriter::ProtobufFieldWriter { - int struct_size; + int struct_size{0}; ProtobufWriter* p; - ProtobufFieldWriter(ProtobufWriter* pbw) : struct_size(0), p(pbw) {} + ProtobufFieldWriter(ProtobufWriter* pbw) : p(pbw) {} /** * @brief Function to write a unsigned integer to the internal buffer diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index 43301826003..01ee5ad177d 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -537,7 +537,7 @@ void reader_impl::load_next_stripe_data(read_mode mode) _file_itm_data.selected_stripes.begin() + stripe_start, _file_itm_data.selected_stripes.begin() + stripe_start + stripe_count, std::size_t{0}, - [](std::size_t count, const auto& stripe) { return count + stripe.stripe_info->numberOfRows; }); + [](std::size_t count, auto const& stripe) { return count + stripe.stripe_info->numberOfRows; }); // Decoding range needs to be reset to start from the first position in `decode_stripe_ranges`. _chunk_read_data.curr_decode_stripe_range = 0; diff --git a/cpp/src/io/orc/reader_impl_decode.cu b/cpp/src/io/orc/reader_impl_decode.cu index da9fb802a0a..72eb41b1360 100644 --- a/cpp/src/io/orc/reader_impl_decode.cu +++ b/cpp/src/io/orc/reader_impl_decode.cu @@ -810,7 +810,7 @@ void reader_impl::decompress_and_decode_stripes(read_mode mode) cudf::detail::hostdevice_2dvector(stripe_count, num_lvl_columns, _stream); memset(chunks.base_host_ptr(), 0, chunks.size_bytes()); - const bool use_index = + bool const use_index = _options.use_index && // Do stripes have row group index _metadata.is_row_grp_idx_present() && diff --git a/cpp/src/io/parquet/compact_protocol_reader.cpp b/cpp/src/io/parquet/compact_protocol_reader.cpp index c9212334a96..192833507b0 100644 --- a/cpp/src/io/parquet/compact_protocol_reader.cpp +++ b/cpp/src/io/parquet/compact_protocol_reader.cpp @@ -42,7 +42,7 @@ class parquet_field { public: virtual ~parquet_field() = default; - int field() const { return _field_val; } + [[nodiscard]] int field() const { return _field_val; } }; std::string field_type_string(FieldType type) diff --git a/cpp/src/io/parquet/compact_protocol_writer.hpp b/cpp/src/io/parquet/compact_protocol_writer.hpp index c2e6178acbf..d4778b1ea15 100644 --- a/cpp/src/io/parquet/compact_protocol_writer.hpp +++ b/cpp/src/io/parquet/compact_protocol_writer.hpp @@ -64,11 +64,11 @@ class CompactProtocolWriter { class CompactProtocolFieldWriter { CompactProtocolWriter& writer; size_t struct_start_pos; - int current_field_value; + int current_field_value{0}; public: CompactProtocolFieldWriter(CompactProtocolWriter& caller) - : writer(caller), struct_start_pos(writer.m_buf.size()), current_field_value(0) + : writer(caller), struct_start_pos(writer.m_buf.size()) { } diff --git a/cpp/src/io/parquet/ipc/Schema_generated.h b/cpp/src/io/parquet/ipc/Schema_generated.h index 27141b4af31..c091204417a 100644 --- a/cpp/src/io/parquet/ipc/Schema_generated.h +++ b/cpp/src/io/parquet/ipc/Schema_generated.h @@ -139,13 +139,13 @@ inline const MetadataVersion (&EnumValuesMetadataVersion())[5] return values; } -inline const char* const* EnumNamesMetadataVersion() +inline char const* const* EnumNamesMetadataVersion() { - static const char* const names[6] = {"V1", "V2", "V3", "V4", "V5", nullptr}; + static char const* const names[6] = 
{"V1", "V2", "V3", "V4", "V5", nullptr}; return names; } -inline const char* EnumNameMetadataVersion(MetadataVersion e) +inline char const* EnumNameMetadataVersion(MetadataVersion e) { if (::flatbuffers::IsOutRange(e, MetadataVersion_V1, MetadataVersion_V5)) return ""; const size_t index = static_cast(e); @@ -190,14 +190,14 @@ inline const Feature (&EnumValuesFeature())[3] return values; } -inline const char* const* EnumNamesFeature() +inline char const* const* EnumNamesFeature() { - static const char* const names[4] = { + static char const* const names[4] = { "UNUSED", "DICTIONARY_REPLACEMENT", "COMPRESSED_BODY", nullptr}; return names; } -inline const char* EnumNameFeature(Feature e) +inline char const* EnumNameFeature(Feature e) { if (::flatbuffers::IsOutRange(e, Feature_UNUSED, Feature_COMPRESSED_BODY)) return ""; const size_t index = static_cast(e); @@ -217,13 +217,13 @@ inline const UnionMode (&EnumValuesUnionMode())[2] return values; } -inline const char* const* EnumNamesUnionMode() +inline char const* const* EnumNamesUnionMode() { - static const char* const names[3] = {"Sparse", "Dense", nullptr}; + static char const* const names[3] = {"Sparse", "Dense", nullptr}; return names; } -inline const char* EnumNameUnionMode(UnionMode e) +inline char const* EnumNameUnionMode(UnionMode e) { if (::flatbuffers::IsOutRange(e, UnionMode_Sparse, UnionMode_Dense)) return ""; const size_t index = static_cast(e); @@ -244,13 +244,13 @@ inline const Precision (&EnumValuesPrecision())[3] return values; } -inline const char* const* EnumNamesPrecision() +inline char const* const* EnumNamesPrecision() { - static const char* const names[4] = {"HALF", "SINGLE", "DOUBLE", nullptr}; + static char const* const names[4] = {"HALF", "SINGLE", "DOUBLE", nullptr}; return names; } -inline const char* EnumNamePrecision(Precision e) +inline char const* EnumNamePrecision(Precision e) { if (::flatbuffers::IsOutRange(e, Precision_HALF, Precision_DOUBLE)) return ""; const size_t index = static_cast(e); @@ -270,13 +270,13 @@ inline const DateUnit (&EnumValuesDateUnit())[2] return values; } -inline const char* const* EnumNamesDateUnit() +inline char const* const* EnumNamesDateUnit() { - static const char* const names[3] = {"DAY", "MILLISECOND", nullptr}; + static char const* const names[3] = {"DAY", "MILLISECOND", nullptr}; return names; } -inline const char* EnumNameDateUnit(DateUnit e) +inline char const* EnumNameDateUnit(DateUnit e) { if (::flatbuffers::IsOutRange(e, DateUnit_DAY, DateUnit_MILLISECOND)) return ""; const size_t index = static_cast(e); @@ -299,14 +299,14 @@ inline const TimeUnit (&EnumValuesTimeUnit())[4] return values; } -inline const char* const* EnumNamesTimeUnit() +inline char const* const* EnumNamesTimeUnit() { - static const char* const names[5] = { + static char const* const names[5] = { "SECOND", "MILLISECOND", "MICROSECOND", "NANOSECOND", nullptr}; return names; } -inline const char* EnumNameTimeUnit(TimeUnit e) +inline char const* EnumNameTimeUnit(TimeUnit e) { if (::flatbuffers::IsOutRange(e, TimeUnit_SECOND, TimeUnit_NANOSECOND)) return ""; const size_t index = static_cast(e); @@ -328,13 +328,13 @@ inline const IntervalUnit (&EnumValuesIntervalUnit())[3] return values; } -inline const char* const* EnumNamesIntervalUnit() +inline char const* const* EnumNamesIntervalUnit() { - static const char* const names[4] = {"YEAR_MONTH", "DAY_TIME", "MONTH_DAY_NANO", nullptr}; + static char const* const names[4] = {"YEAR_MONTH", "DAY_TIME", "MONTH_DAY_NANO", nullptr}; return names; } -inline const char* 
EnumNameIntervalUnit(IntervalUnit e) +inline char const* EnumNameIntervalUnit(IntervalUnit e) { if (::flatbuffers::IsOutRange(e, IntervalUnit_YEAR_MONTH, IntervalUnit_MONTH_DAY_NANO)) return ""; const size_t index = static_cast(e); @@ -389,9 +389,9 @@ inline const Type (&EnumValuesType())[27] return values; } -inline const char* const* EnumNamesType() +inline char const* const* EnumNamesType() { - static const char* const names[28] = { + static char const* const names[28] = { "NONE", "Null", "Int", "FloatingPoint", "Binary", "Utf8", "Bool", "Decimal", "Date", "Time", "Timestamp", "Interval", @@ -402,7 +402,7 @@ inline const char* const* EnumNamesType() return names; } -inline const char* EnumNameType(Type e) +inline char const* EnumNameType(Type e) { if (::flatbuffers::IsOutRange(e, Type_NONE, Type_LargeListView)) return ""; const size_t index = static_cast(e); @@ -544,10 +544,10 @@ struct TypeTraits { static const Type enum_value = Type_LargeListView; }; -bool VerifyType(::flatbuffers::Verifier& verifier, const void* obj, Type type); +bool VerifyType(::flatbuffers::Verifier& verifier, void const* obj, Type type); bool VerifyTypeVector(::flatbuffers::Verifier& verifier, - const ::flatbuffers::Vector<::flatbuffers::Offset>* values, - const ::flatbuffers::Vector* types); + ::flatbuffers::Vector<::flatbuffers::Offset> const* values, + ::flatbuffers::Vector const* types); /// ---------------------------------------------------------------------- /// Dictionary encoding metadata @@ -566,13 +566,13 @@ inline const DictionaryKind (&EnumValuesDictionaryKind())[1] return values; } -inline const char* const* EnumNamesDictionaryKind() +inline char const* const* EnumNamesDictionaryKind() { - static const char* const names[2] = {"DenseArray", nullptr}; + static char const* const names[2] = {"DenseArray", nullptr}; return names; } -inline const char* EnumNameDictionaryKind(DictionaryKind e) +inline char const* EnumNameDictionaryKind(DictionaryKind e) { if (::flatbuffers::IsOutRange(e, DictionaryKind_DenseArray, DictionaryKind_DenseArray)) return ""; const size_t index = static_cast(e); @@ -594,13 +594,13 @@ inline const Endianness (&EnumValuesEndianness())[2] return values; } -inline const char* const* EnumNamesEndianness() +inline char const* const* EnumNamesEndianness() { - static const char* const names[3] = {"Little", "Big", nullptr}; + static char const* const names[3] = {"Little", "Big", nullptr}; return names; } -inline const char* EnumNameEndianness(Endianness e) +inline char const* EnumNameEndianness(Endianness e) { if (::flatbuffers::IsOutRange(e, Endianness_Little, Endianness_Big)) return ""; const size_t index = static_cast(e); @@ -652,7 +652,7 @@ struct NullBuilder { } ::flatbuffers::Offset Finish() { - const auto end = fbb_.EndTable(start_); + auto const end = fbb_.EndTable(start_); auto o = ::flatbuffers::Offset(end); return o; } @@ -685,7 +685,7 @@ struct Struct_Builder { } ::flatbuffers::Offset Finish() { - const auto end = fbb_.EndTable(start_); + auto const end = fbb_.EndTable(start_); auto o = ::flatbuffers::Offset(end); return o; } @@ -715,7 +715,7 @@ struct ListBuilder { } ::flatbuffers::Offset Finish() { - const auto end = fbb_.EndTable(start_); + auto const end = fbb_.EndTable(start_); auto o = ::flatbuffers::Offset(end); return o; } @@ -747,7 +747,7 @@ struct LargeListBuilder { } ::flatbuffers::Offset Finish() { - const auto end = fbb_.EndTable(start_); + auto const end = fbb_.EndTable(start_); auto o = ::flatbuffers::Offset(end); return o; } @@ -780,7 +780,7 @@ struct 
ListViewBuilder { } ::flatbuffers::Offset Finish() { - const auto end = fbb_.EndTable(start_); + auto const end = fbb_.EndTable(start_); auto o = ::flatbuffers::Offset(end); return o; } @@ -812,7 +812,7 @@ struct LargeListViewBuilder { } ::flatbuffers::Offset Finish() { - const auto end = fbb_.EndTable(start_); + auto const end = fbb_.EndTable(start_); auto o = ::flatbuffers::Offset(end); return o; } @@ -851,7 +851,7 @@ struct FixedSizeListBuilder { } ::flatbuffers::Offset Finish() { - const auto end = fbb_.EndTable(start_); + auto const end = fbb_.EndTable(start_); auto o = ::flatbuffers::Offset(end); return o; } @@ -916,7 +916,7 @@ struct MapBuilder { } ::flatbuffers::Offset Finish() { - const auto end = fbb_.EndTable(start_); + auto const end = fbb_.EndTable(start_); auto o = ::flatbuffers::Offset(end); return o; } @@ -941,9 +941,9 @@ struct Union FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { { return static_cast(GetField(VT_MODE, 0)); } - const ::flatbuffers::Vector* typeIds() const + ::flatbuffers::Vector const* typeIds() const { - return GetPointer*>(VT_TYPEIDS); + return GetPointer<::flatbuffers::Vector const*>(VT_TYPEIDS); } bool Verify(::flatbuffers::Verifier& verifier) const { @@ -971,7 +971,7 @@ struct UnionBuilder { } ::flatbuffers::Offset Finish() { - const auto end = fbb_.EndTable(start_); + auto const end = fbb_.EndTable(start_); auto o = ::flatbuffers::Offset(end); return o; } @@ -991,7 +991,7 @@ inline ::flatbuffers::Offset CreateUnion( inline ::flatbuffers::Offset CreateUnionDirect( ::flatbuffers::FlatBufferBuilder& _fbb, cudf::io::parquet::flatbuf::UnionMode mode = cudf::io::parquet::flatbuf::UnionMode_Sparse, - const std::vector* typeIds = nullptr) + std::vector const* typeIds = nullptr) { auto typeIds__ = typeIds ? _fbb.CreateVector(*typeIds) : 0; return cudf::io::parquet::flatbuf::CreateUnion(_fbb, mode, typeIds__); @@ -1027,7 +1027,7 @@ struct IntBuilder { } ::flatbuffers::Offset Finish() { - const auto end = fbb_.EndTable(start_); + auto const end = fbb_.EndTable(start_); auto o = ::flatbuffers::Offset(end); return o; } @@ -1071,7 +1071,7 @@ struct FloatingPointBuilder { } ::flatbuffers::Offset Finish() { - const auto end = fbb_.EndTable(start_); + auto const end = fbb_.EndTable(start_); auto o = ::flatbuffers::Offset(end); return o; } @@ -1105,7 +1105,7 @@ struct Utf8Builder { } ::flatbuffers::Offset Finish() { - const auto end = fbb_.EndTable(start_); + auto const end = fbb_.EndTable(start_); auto o = ::flatbuffers::Offset(end); return o; } @@ -1136,7 +1136,7 @@ struct BinaryBuilder { } ::flatbuffers::Offset Finish() { - const auto end = fbb_.EndTable(start_); + auto const end = fbb_.EndTable(start_); auto o = ::flatbuffers::Offset(end); return o; } @@ -1168,7 +1168,7 @@ struct LargeUtf8Builder { } ::flatbuffers::Offset Finish() { - const auto end = fbb_.EndTable(start_); + auto const end = fbb_.EndTable(start_); auto o = ::flatbuffers::Offset(end); return o; } @@ -1200,7 +1200,7 @@ struct LargeBinaryBuilder { } ::flatbuffers::Offset Finish() { - const auto end = fbb_.EndTable(start_); + auto const end = fbb_.EndTable(start_); auto o = ::flatbuffers::Offset(end); return o; } @@ -1237,7 +1237,7 @@ struct Utf8ViewBuilder { } ::flatbuffers::Offset Finish() { - const auto end = fbb_.EndTable(start_); + auto const end = fbb_.EndTable(start_); auto o = ::flatbuffers::Offset(end); return o; } @@ -1274,7 +1274,7 @@ struct BinaryViewBuilder { } ::flatbuffers::Offset Finish() { - const auto end = fbb_.EndTable(start_); + auto const end = fbb_.EndTable(start_); 
auto o = ::flatbuffers::Offset(end); return o; } @@ -1312,7 +1312,7 @@ struct FixedSizeBinaryBuilder { } ::flatbuffers::Offset Finish() { - const auto end = fbb_.EndTable(start_); + auto const end = fbb_.EndTable(start_); auto o = ::flatbuffers::Offset(end); return o; } @@ -1344,7 +1344,7 @@ struct BoolBuilder { } ::flatbuffers::Offset Finish() { - const auto end = fbb_.EndTable(start_); + auto const end = fbb_.EndTable(start_); auto o = ::flatbuffers::Offset(end); return o; } @@ -1379,7 +1379,7 @@ struct RunEndEncodedBuilder { } ::flatbuffers::Offset Finish() { - const auto end = fbb_.EndTable(start_); + auto const end = fbb_.EndTable(start_); auto o = ::flatbuffers::Offset(end); return o; } @@ -1437,7 +1437,7 @@ struct DecimalBuilder { } ::flatbuffers::Offset Finish() { - const auto end = fbb_.EndTable(start_); + auto const end = fbb_.EndTable(start_); auto o = ::flatbuffers::Offset(end); return o; } @@ -1489,7 +1489,7 @@ struct DateBuilder { } ::flatbuffers::Offset Finish() { - const auto end = fbb_.EndTable(start_); + auto const end = fbb_.EndTable(start_); auto o = ::flatbuffers::Offset(end); return o; } @@ -1548,7 +1548,7 @@ struct TimeBuilder { } ::flatbuffers::Offset
groupby(table_view const& keys, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - auto const num_keys = keys.num_rows(); + // convert to int64_t to avoid potential overflow with large `keys` + auto const num_keys = static_cast(keys.num_rows()); auto const null_keys_are_equal = null_equality::EQUAL; auto const has_null = nullate::DYNAMIC{cudf::has_nested_nulls(keys)}; diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index dc6eb55fc6a..050bcbb268f 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -7838,11 +7838,12 @@ void testSumWithStrings() { .build(); Table result = t.groupBy(0).aggregate( GroupByAggregation.sum().onColumn(1)); + Table sorted = result.orderBy(OrderByArg.asc(0)); Table expected = new Table.TestBuilder() .column("1-URGENT", "3-MEDIUM") .column(5289L + 5303L, 5203L + 5206L) .build()) { - assertTablesAreEqual(expected, result); + assertTablesAreEqual(expected, sorted); } } diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index d08268eea3a..77b54a583d3 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -1308,7 +1308,7 @@ def pipe(self, func, *args, **kwargs): To get the difference between each groups maximum and minimum value in one pass, you can do - >>> df.groupby('A').pipe(lambda x: x.max() - x.min()) + >>> df.groupby('A', sort=True).pipe(lambda x: x.max() - x.min()) B A a 2 From fc4b3d3ecbf95ee9afdcd509554bbeb5367a3059 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 18 Jun 2024 09:02:05 -1000 Subject: [PATCH 127/340] Reduce deep copies in Index ops (#16054) 1. Changed `Index.rename(inplace=False)` to shallow copy which matches pandas behavior. Let me know if there's a reason why we should deep copy here. 2. Made `RangeIndex.unique` return a shallow copy like pandas. 3. Made `Index.dropna` with no NA's shallow copy like pandas. 
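For context, the shallow-copy behaviour being matched here is observable through buffer identity: renaming returns a new index object that shares the old one's values buffer instead of copying it. A minimal host-side sketch in pandas (illustrative only, not cudf's implementation):

import numpy as np
import pandas as pd

idx = pd.Index([1, 2, 3], name="old")
renamed = idx.rename("new")  # inplace=False is the default

# Metadata differs, but the values buffer is shared (shallow copy).
assert renamed.name == "new" and idx.name == "old"
assert np.shares_memory(idx.to_numpy(copy=False), renamed.to_numpy(copy=False))

The cudf tests added below check the same property on device data via column identity (`idx._column is result._column`).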
Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/16054 --- python/cudf/cudf/core/_base_index.py | 6 +++--- python/cudf/cudf/core/index.py | 5 +++-- python/cudf/cudf/tests/test_index.py | 25 +++++++++++++++++++++++-- 3 files changed, 29 insertions(+), 7 deletions(-) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index ad73cd57f7d..caf07b286cd 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -1120,7 +1120,7 @@ def difference(self, other, sort=None): res_name = _get_result_name(self.name, other.name) if is_mixed_with_object_dtype(self, other) or len(other) == 0: - difference = self.copy().unique() + difference = self.unique() difference.name = res_name if sort is True: return difference.sort_values() @@ -1744,7 +1744,7 @@ def rename(self, name, inplace=False): self.name = name return None else: - out = self.copy(deep=True) + out = self.copy(deep=False) out.name = name return out @@ -2068,7 +2068,7 @@ def dropna(self, how="any"): raise ValueError(f"{how=} must be 'any' or 'all'") try: if not self.hasnans: - return self.copy() + return self.copy(deep=False) except NotImplementedError: pass # This is to be consistent with IndexedFrame.dropna to handle nans diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 1c5d05d2d87..71658695b80 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -528,7 +528,7 @@ def memory_usage(self, deep: bool = False) -> int: def unique(self) -> Self: # RangeIndex always has unique values - return self + return self.copy() @_cudf_nvtx_annotate def __mul__(self, other): @@ -3197,7 +3197,8 @@ def _get_nearest_indexer( ) right_indexer = _get_indexer_basic( index=index, - positions=positions.copy(deep=True), + # positions no longer used so don't copy + positions=positions, method="backfill", target_col=target_col, tolerance=tolerance, diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index 3d6c71ebc1b..a59836df5ba 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -252,10 +252,10 @@ def test_index_rename_inplace(): pds = pd.Index([1, 2, 3], name="asdf") gds = Index(pds) - # inplace=False should yield a deep copy + # inplace=False should yield a shallow copy gds_renamed_deep = gds.rename("new_name", inplace=False) - assert gds_renamed_deep._values.data_ptr != gds._values.data_ptr + assert gds_renamed_deep._values.data_ptr == gds._values.data_ptr # inplace=True returns none expected_ptr = gds._values.data_ptr @@ -3214,6 +3214,27 @@ def test_rangeindex_dropna(): assert_eq(result, expected) +def test_rangeindex_unique_shallow_copy(): + ri_pandas = pd.RangeIndex(1) + result = ri_pandas.unique() + assert result is not ri_pandas + + ri_cudf = cudf.RangeIndex(1) + result = ri_cudf.unique() + assert result is not ri_cudf + assert_eq(result, ri_cudf) + + +def test_rename_shallow_copy(): + idx = pd.Index([1]) + result = idx.rename("a") + assert idx.to_numpy(copy=False) is result.to_numpy(copy=False) + + idx = cudf.Index([1]) + result = idx.rename("a") + assert idx._column is result._column + + @pytest.mark.parametrize("data", [range(2), [10, 11, 12]]) def test_index_contains_hashable(data): gidx = cudf.Index(data) From 2ddbe2a0665066fe8a5021b23c9268ce91ce67a2 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Tue, 18 Jun 2024 20:06:04 
+0100 Subject: [PATCH 128/340] Test behaviour of containers (#15994) This ensures we cover all implementation. Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/15994 --- .../cudf_polars/containers/column.py | 2 +- .../cudf_polars/tests/containers/__init__.py | 6 ++ .../tests/containers/test_column.py | 70 ++++++++++++++ .../tests/containers/test_dataframe.py | 92 +++++++++++++++++++ 4 files changed, 169 insertions(+), 1 deletion(-) create mode 100644 python/cudf_polars/tests/containers/__init__.py create mode 100644 python/cudf_polars/tests/containers/test_column.py create mode 100644 python/cudf_polars/tests/containers/test_dataframe.py diff --git a/python/cudf_polars/cudf_polars/containers/column.py b/python/cudf_polars/cudf_polars/containers/column.py index 156dd395d64..28685f0c4ed 100644 --- a/python/cudf_polars/cudf_polars/containers/column.py +++ b/python/cudf_polars/cudf_polars/containers/column.py @@ -130,7 +130,7 @@ def copy(self) -> Self: def mask_nans(self) -> Self: """Return a copy of self with nans masked out.""" if self.nan_count > 0: - raise NotImplementedError + raise NotImplementedError("Need to port transform.hpp to pylibcudf") return self.copy() @functools.cached_property diff --git a/python/cudf_polars/tests/containers/__init__.py b/python/cudf_polars/tests/containers/__init__.py new file mode 100644 index 00000000000..4611d642f14 --- /dev/null +++ b/python/cudf_polars/tests/containers/__init__.py @@ -0,0 +1,6 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +__all__: list[str] = [] diff --git a/python/cudf_polars/tests/containers/test_column.py b/python/cudf_polars/tests/containers/test_column.py new file mode 100644 index 00000000000..3291d8db161 --- /dev/null +++ b/python/cudf_polars/tests/containers/test_column.py @@ -0,0 +1,70 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +import pyarrow +import pytest + +import cudf._lib.pylibcudf as plc + +from cudf_polars.containers import Column + + +def test_non_scalar_access_raises(): + column = Column( + plc.column_factories.make_numeric_column( + plc.DataType(plc.TypeId.INT8), 2, plc.MaskState.ALL_VALID + ) + ) + with pytest.raises(ValueError): + _ = column.obj_scalar + + +@pytest.mark.parametrize("length", [0, 1]) +def test_length_leq_one_always_sorted(length): + column = Column( + plc.column_factories.make_numeric_column( + plc.DataType(plc.TypeId.INT8), length, plc.MaskState.ALL_VALID + ) + ) + assert column.is_sorted == plc.types.Sorted.YES + column.set_sorted( + is_sorted=plc.types.Sorted.NO, + order=plc.types.Order.ASCENDING, + null_order=plc.types.NullOrder.AFTER, + ) + assert column.is_sorted == plc.types.Sorted.YES + + +def test_shallow_copy(): + column = Column( + plc.column_factories.make_numeric_column( + plc.DataType(plc.TypeId.INT8), 2, plc.MaskState.ALL_VALID + ) + ) + copy = column.copy() + copy = copy.set_sorted( + is_sorted=plc.types.Sorted.YES, + order=plc.types.Order.ASCENDING, + null_order=plc.types.NullOrder.AFTER, + ) + assert column.is_sorted == plc.types.Sorted.NO + assert copy.is_sorted == plc.types.Sorted.YES + + +@pytest.mark.parametrize("typeid", [plc.TypeId.INT8, plc.TypeId.FLOAT32]) +def test_mask_nans(typeid): + dtype = plc.DataType(typeid) + values = pyarrow.array([0, 0, 0], type=plc.interop.to_arrow(dtype)) + column = Column(plc.interop.from_arrow(values)) + masked = column.mask_nans() + assert column.obj is masked.obj + + +def test_mask_nans_float_with_nan_notimplemented(): + dtype = plc.DataType(plc.TypeId.FLOAT32) + values = pyarrow.array([0, 0, float("nan")], type=plc.interop.to_arrow(dtype)) + column = Column(plc.interop.from_arrow(values)) + with pytest.raises(NotImplementedError): + _ = column.mask_nans() diff --git a/python/cudf_polars/tests/containers/test_dataframe.py b/python/cudf_polars/tests/containers/test_dataframe.py new file mode 100644 index 00000000000..2e385e39eef --- /dev/null +++ b/python/cudf_polars/tests/containers/test_dataframe.py @@ -0,0 +1,92 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +import pytest + +import cudf._lib.pylibcudf as plc + +from cudf_polars.containers import DataFrame, NamedColumn + + +def test_select_missing_raises(): + df = DataFrame( + [ + NamedColumn( + plc.column_factories.make_numeric_column( + plc.DataType(plc.TypeId.INT8), 2, plc.MaskState.ALL_VALID + ), + "a", + ) + ] + ) + with pytest.raises(ValueError): + df.select(["b", "a"]) + + +def test_replace_missing_raises(): + df = DataFrame( + [ + NamedColumn( + plc.column_factories.make_numeric_column( + plc.DataType(plc.TypeId.INT8), 2, plc.MaskState.ALL_VALID + ), + "a", + ) + ] + ) + replacement = df.columns[0].copy(new_name="b") + with pytest.raises(ValueError): + df.replace_columns(replacement) + + +def test_from_table_wrong_names(): + table = plc.Table( + [ + plc.column_factories.make_numeric_column( + plc.DataType(plc.TypeId.INT8), 1, plc.MaskState.ALL_VALID + ) + ] + ) + with pytest.raises(ValueError): + DataFrame.from_table(table, ["a", "b"]) + + +def test_sorted_like_raises_mismatching_names(): + df = DataFrame( + [ + NamedColumn( + plc.column_factories.make_numeric_column( + plc.DataType(plc.TypeId.INT8), 2, plc.MaskState.ALL_VALID + ), + "a", + ) + ] + ) + like = df.copy().rename_columns({"a": "b"}) + with pytest.raises(ValueError): + df.sorted_like(like) + + +def test_shallow_copy(): + column = NamedColumn( + plc.column_factories.make_numeric_column( + plc.DataType(plc.TypeId.INT8), 2, plc.MaskState.ALL_VALID + ), + "a", + ) + column.set_sorted( + is_sorted=plc.types.Sorted.YES, + order=plc.types.Order.ASCENDING, + null_order=plc.types.NullOrder.AFTER, + ) + df = DataFrame([column]) + copy = df.copy() + copy.columns[0].set_sorted( + is_sorted=plc.types.Sorted.NO, + order=plc.types.Order.ASCENDING, + null_order=plc.types.NullOrder.AFTER, + ) + assert df.columns[0].is_sorted == plc.types.Sorted.YES + assert copy.columns[0].is_sorted == plc.types.Sorted.NO From 9bc794aa355c8e4c42fbc611fe9d496c20a4db90 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Tue, 18 Jun 2024 20:06:45 +0100 Subject: [PATCH 129/340] Coverage of binops where one or both operands are a scalar (#15998) Just needed the tests here. Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/15998 --- .../tests/expressions/test_numeric_binops.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/python/cudf_polars/tests/expressions/test_numeric_binops.py b/python/cudf_polars/tests/expressions/test_numeric_binops.py index 7eefc59d927..b6bcd0026fa 100644 --- a/python/cudf_polars/tests/expressions/test_numeric_binops.py +++ b/python/cudf_polars/tests/expressions/test_numeric_binops.py @@ -99,3 +99,15 @@ def test_numeric_binop(df, binop): q = df.select(binop(left, right)) assert_gpu_result_equal(q) + + +@pytest.mark.parametrize("left_scalar", [False, True]) +@pytest.mark.parametrize("right_scalar", [False, True]) +def test_binop_with_scalar(left_scalar, right_scalar): + df = pl.LazyFrame({"a": [1, 2, 3], "b": [5, 6, 7]}) + + lop = pl.lit(2) if left_scalar else pl.col("a") + rop = pl.lit(6) if right_scalar else pl.col("b") + q = df.select(lop / rop) + + assert_gpu_result_equal(q) From c83e5b3fdd7f9fe8a08c4f6874fbf847bba70c53 Mon Sep 17 00:00:00 2001 From: Shruti Shivakumar Date: Tue, 18 Jun 2024 16:22:44 -0400 Subject: [PATCH 130/340] Fix JSON multi-source reading when total source size exceeds `INT_MAX` bytes (#15930) Fixes #15917. 
- [X] Batched read and parse operations - [x] Fail when any single source file exceeds `INT_MAX` bytes. This case will be handled with a chunked reader later. Authors: - Shruti Shivakumar (https://github.com/shrshi) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - Karthikeyan (https://github.com/karthikeyann) URL: https://github.com/rapidsai/cudf/pull/15930 --- cpp/include/cudf/io/types.hpp | 13 +++ cpp/src/io/json/read_json.cu | 121 +++++++++++++++++++++---- cpp/tests/CMakeLists.txt | 1 + cpp/tests/large_strings/json_tests.cpp | 58 ++++++++++++ 4 files changed, 177 insertions(+), 16 deletions(-) create mode 100644 cpp/tests/large_strings/json_tests.cpp diff --git a/cpp/include/cudf/io/types.hpp b/cpp/include/cudf/io/types.hpp index 0dab1c606de..0c96268f6c7 100644 --- a/cpp/include/cudf/io/types.hpp +++ b/cpp/include/cudf/io/types.hpp @@ -256,6 +256,19 @@ struct column_name_info { } column_name_info() = default; + + /** + * @brief Compares two column name info structs for equality + * + * @param rhs column name info struct to compare against + * @return boolean indicating if this and rhs are equal + */ + bool operator==(column_name_info const& rhs) const + { + return ((name == rhs.name) && (is_nullable == rhs.is_nullable) && + (is_binary == rhs.is_binary) && (type_length == rhs.type_length) && + (children == rhs.children)); + }; }; /** diff --git a/cpp/src/io/json/read_json.cu b/cpp/src/io/json/read_json.cu index e999be8f83a..74001e5e01a 100644 --- a/cpp/src/io/json/read_json.cu +++ b/cpp/src/io/json/read_json.cu @@ -18,7 +18,9 @@ #include "io/json/nested_json.hpp" #include "read_json.hpp" +#include #include +#include #include #include #include @@ -76,7 +78,7 @@ device_span ingest_raw_input(device_span buffer, auto constexpr num_delimiter_chars = 1; if (compression == compression_type::NONE) { - std::vector delimiter_map{}; + std::vector delimiter_map{}; std::vector prefsum_source_sizes(sources.size()); std::vector> h_buffers; delimiter_map.reserve(sources.size()); @@ -84,7 +86,7 @@ device_span ingest_raw_input(device_span buffer, std::transform_inclusive_scan(sources.begin(), sources.end(), prefsum_source_sizes.begin(), - std::plus{}, + std::plus{}, [](std::unique_ptr const& s) { return s->size(); }); auto upper = std::upper_bound(prefsum_source_sizes.begin(), prefsum_source_sizes.end(), range_offset); @@ -259,6 +261,33 @@ datasource::owning_buffer> get_record_range_raw_input( readbufspan.size() - first_delim_pos - shift_for_nonzero_offset); } +table_with_metadata read_batch(host_span> sources, + json_reader_options const& reader_opts, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + datasource::owning_buffer> bufview = + get_record_range_raw_input(sources, reader_opts, stream); + + // If input JSON buffer has single quotes and option to normalize single quotes is enabled, + // invoke pre-processing FST + if (reader_opts.is_enabled_normalize_single_quotes()) { + normalize_single_quotes(bufview, stream, rmm::mr::get_current_device_resource()); + } + + // If input JSON buffer has unquoted spaces and tabs and option to normalize whitespaces is + // enabled, invoke pre-processing FST + if (reader_opts.is_enabled_normalize_whitespace()) { + normalize_whitespace(bufview, stream, rmm::mr::get_current_device_resource()); + } + + auto buffer = + cudf::device_span(reinterpret_cast(bufview.data()), bufview.size()); + stream.synchronize(); + return device_parse_nested_json(buffer, reader_opts, stream, mr); +} + table_with_metadata 
read_json(host_span> sources, json_reader_options const& reader_opts, rmm::cuda_stream_view stream, @@ -278,25 +307,85 @@ table_with_metadata read_json(host_span> sources, "Multiple inputs are supported only for JSON Lines format"); } - datasource::owning_buffer> bufview = - get_record_range_raw_input(sources, reader_opts, stream); + std::for_each(sources.begin(), sources.end(), [](auto const& source) { + CUDF_EXPECTS(source->size() < std::numeric_limits::max(), + "The size of each source file must be less than INT_MAX bytes"); + }); - // If input JSON buffer has single quotes and option to normalize single quotes is enabled, - // invoke pre-processing FST - if (reader_opts.is_enabled_normalize_single_quotes()) { - normalize_single_quotes(bufview, stream, rmm::mr::get_current_device_resource()); + constexpr size_t batch_size_ub = std::numeric_limits::max(); + size_t const chunk_offset = reader_opts.get_byte_range_offset(); + size_t chunk_size = reader_opts.get_byte_range_size(); + chunk_size = !chunk_size ? sources_size(sources, 0, 0) : chunk_size; + + // Identify the position of starting source file from which to begin batching based on + // byte range offset. If the offset is larger than the sum of all source + // sizes, then start_source is total number of source files i.e. no file is read + size_t const start_source = [&]() { + size_t sum = 0; + for (size_t src_idx = 0; src_idx < sources.size(); ++src_idx) { + if (sum + sources[src_idx]->size() > chunk_offset) return src_idx; + sum += sources[src_idx]->size(); + } + return sources.size(); + }(); + + // Construct batches of source files, with starting position of batches indicated by + // batch_positions. The size of each batch i.e. the sum of sizes of the source files in the batch + // is capped at INT_MAX bytes. + size_t cur_size = 0; + std::vector batch_positions; + std::vector batch_sizes; + batch_positions.push_back(0); + for (size_t i = start_source; i < sources.size(); i++) { + cur_size += sources[i]->size(); + if (cur_size >= batch_size_ub) { + batch_positions.push_back(i); + batch_sizes.push_back(cur_size - sources[i]->size()); + cur_size = sources[i]->size(); + } } + batch_positions.push_back(sources.size()); + batch_sizes.push_back(cur_size); - // If input JSON buffer has unquoted spaces and tabs and option to normalize whitespaces is - // enabled, invoke pre-processing FST - if (reader_opts.is_enabled_normalize_whitespace()) { - normalize_whitespace(bufview, stream, rmm::mr::get_current_device_resource()); + // If there is a single batch, then we can directly return the table without the + // unnecessary concatenate + if (batch_sizes.size() == 1) return read_batch(sources, reader_opts, stream, mr); + + std::vector partial_tables; + json_reader_options batched_reader_opts{reader_opts}; + + // Dispatch individual batches to read_batch and push the resulting table into + // partial_tables array. Note that the reader options need to be updated for each + // batch to adjust byte range offset and byte range size. 
+  for (size_t i = 0; i < batch_sizes.size(); i++) {
+    batched_reader_opts.set_byte_range_size(std::min(batch_sizes[i], chunk_size));
+    partial_tables.emplace_back(read_batch(
+      host_span>(sources.begin() + batch_positions[i],
+                                        batch_positions[i + 1] - batch_positions[i]),
+      batched_reader_opts,
+      stream,
+      rmm::mr::get_current_device_resource()));
+    if (chunk_size <= batch_sizes[i]) break;
+    chunk_size -= batch_sizes[i];
+    batched_reader_opts.set_byte_range_offset(0);
   }
 
-  auto buffer =
-    cudf::device_span(reinterpret_cast(bufview.data()), bufview.size());
-  stream.synchronize();
-  return device_parse_nested_json(buffer, reader_opts, stream, mr);
+  auto expects_schema_equality =
+    std::all_of(partial_tables.begin() + 1,
+                partial_tables.end(),
+                [&gt = partial_tables[0].metadata.schema_info](auto& ptbl) {
+                  return ptbl.metadata.schema_info == gt;
+                });
+  CUDF_EXPECTS(expects_schema_equality,
+               "Mismatch in JSON schema across batches in multi-source multi-batch reading");
+
+  auto partial_table_views = std::vector(partial_tables.size());
+  std::transform(partial_tables.begin(),
+                 partial_tables.end(),
+                 partial_table_views.begin(),
+                 [](auto const& table) { return table.tbl->view(); });
+  return table_with_metadata{cudf::concatenate(partial_table_views, stream, mr),
+                             {partial_tables[0].metadata.schema_info}};
 }
 
 }  // namespace cudf::io::json::detail
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index 329edbe4d36..eda470d2309 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -570,6 +570,7 @@ ConfigureTest(
   LARGE_STRINGS_TEST
   large_strings/concatenate_tests.cpp
   large_strings/case_tests.cpp
+  large_strings/json_tests.cpp
   large_strings/large_strings_fixture.cpp
   large_strings/merge_tests.cpp
   large_strings/parquet_tests.cpp
diff --git a/cpp/tests/large_strings/json_tests.cpp b/cpp/tests/large_strings/json_tests.cpp
new file mode 100644
index 00000000000..bf16d131ba7
--- /dev/null
+++ b/cpp/tests/large_strings/json_tests.cpp
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#include "large_strings_fixture.hpp" + +#include +#include + +struct JsonLargeReaderTest : public cudf::test::StringsLargeTest {}; + +TEST_F(JsonLargeReaderTest, MultiBatch) +{ + std::string json_string = R"( + { "a": { "y" : 6}, "b" : [1, 2, 3], "c": 11 } + { "a": { "y" : 6}, "b" : [4, 5 ], "c": 12 } + { "a": { "y" : 6}, "b" : [6 ], "c": 13 } + { "a": { "y" : 6}, "b" : [7 ], "c": 14 })"; + constexpr size_t expected_file_size = std::numeric_limits::max() / 2; + std::size_t const log_repetitions = + static_cast(std::ceil(std::log2(expected_file_size / json_string.size()))); + + json_string.reserve(json_string.size() * (1UL << log_repetitions)); + std::size_t numrows = 4; + for (std::size_t i = 0; i < log_repetitions; i++) { + json_string += json_string; + numrows <<= 1; + } + + constexpr int num_sources = 2; + std::vector> hostbufs( + num_sources, cudf::host_span(json_string.data(), json_string.size())); + + // Initialize parsing options (reading json lines) + cudf::io::json_reader_options json_lines_options = + cudf::io::json_reader_options::builder( + cudf::io::source_info{ + cudf::host_span>(hostbufs.data(), hostbufs.size())}) + .lines(true) + .compression(cudf::io::compression_type::NONE) + .recovery_mode(cudf::io::json_recovery_mode_t::FAIL); + + // Read full test data via existing, nested JSON lines reader + cudf::io::table_with_metadata current_reader_table = cudf::io::read_json(json_lines_options); + ASSERT_EQ(current_reader_table.tbl->num_rows(), numrows * num_sources); +} From f536e3017205be8b09f3dc2cfd448dc9c5a94d5d Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Wed, 19 Jun 2024 16:50:48 +0100 Subject: [PATCH 131/340] Add basic tests of dataframe scan (#16003) Also assert that unsupported file scan operations raise. Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - https://github.com/brandon-b-miller URL: https://github.com/rapidsai/cudf/pull/16003 --- python/cudf_polars/cudf_polars/dsl/ir.py | 4 +- .../cudf_polars/testing/asserts.py | 34 ++++++++++++++- python/cudf_polars/docs/overview.md | 18 ++++++++ .../cudf_polars/tests/test_dataframescan.py | 43 +++++++++++++++++++ python/cudf_polars/tests/test_scan.py | 13 +++++- python/cudf_polars/tests/testing/__init__.py | 6 +++ .../cudf_polars/tests/testing/test_asserts.py | 35 +++++++++++++++ 7 files changed, 150 insertions(+), 3 deletions(-) create mode 100644 python/cudf_polars/tests/test_dataframescan.py create mode 100644 python/cudf_polars/tests/testing/__init__.py create mode 100644 python/cudf_polars/tests/testing/test_asserts.py diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index 83957e4286d..3ccefac6b0a 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -196,7 +196,9 @@ def __post_init__(self) -> None: if self.file_options.n_rows is not None: raise NotImplementedError("row limit in scan") if self.typ not in ("csv", "parquet"): - raise NotImplementedError(f"Unhandled scan type: {self.typ}") + raise NotImplementedError( + f"Unhandled scan type: {self.typ}" + ) # pragma: no cover; polars raises on the rust side for now def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" diff --git a/python/cudf_polars/cudf_polars/testing/asserts.py b/python/cudf_polars/cudf_polars/testing/asserts.py index 3edaa427432..a9a4ae5f0a6 100644 --- a/python/cudf_polars/cudf_polars/testing/asserts.py +++ b/python/cudf_polars/cudf_polars/testing/asserts.py 
@@ -11,6 +11,7 @@ from polars.testing.asserts import assert_frame_equal from cudf_polars.callback import execute_with_cudf +from cudf_polars.dsl.translate import translate_ir if TYPE_CHECKING: from collections.abc import Mapping @@ -19,7 +20,7 @@ from cudf_polars.typing import OptimizationArgs -__all__: list[str] = ["assert_gpu_result_equal"] +__all__: list[str] = ["assert_gpu_result_equal", "assert_ir_translation_raises"] def assert_gpu_result_equal( @@ -84,3 +85,34 @@ def assert_gpu_result_equal( atol=atol, categorical_as_str=categorical_as_str, ) + + +def assert_ir_translation_raises(q: pl.LazyFrame, *exceptions: type[Exception]) -> None: + """ + Assert that translation of a query raises an exception. + + Parameters + ---------- + q + Query to translate. + exceptions + Exceptions that one expects might be raised. + + Returns + ------- + None + If translation successfully raised the specified exceptions. + + Raises + ------ + AssertionError + If the specified exceptions were not raised. + """ + try: + _ = translate_ir(q._ldf.visit()) + except exceptions: + return + except Exception as e: + raise AssertionError(f"Translation DID NOT RAISE {exceptions}") from e + else: + raise AssertionError(f"Translation DID NOT RAISE {exceptions}") diff --git a/python/cudf_polars/docs/overview.md b/python/cudf_polars/docs/overview.md index b50d01c26db..874bb849747 100644 --- a/python/cudf_polars/docs/overview.md +++ b/python/cudf_polars/docs/overview.md @@ -224,6 +224,24 @@ def test_whatever(): assert_gpu_result_equal(query) ``` +## Test coverage and asserting failure modes + +Where translation of a query should fail due to the feature being +unsupported we should test this. To assert that _translation_ raises +an exception (usually `NotImplementedError`), use the utility function +`assert_ir_translation_raises`: + +```python +from cudf_polars.testing.asserts import assert_ir_translation_raises + + +def test_whatever(): + unsupported_query = ... + assert_ir_translation_raises(unsupported_query, NotImplementedError) +``` + +This test will fail if translation does not raise. + # Debugging If the callback execution fails during the polars `collect` call, we diff --git a/python/cudf_polars/tests/test_dataframescan.py b/python/cudf_polars/tests/test_dataframescan.py new file mode 100644 index 00000000000..1ffe06ac562 --- /dev/null +++ b/python/cudf_polars/tests/test_dataframescan.py @@ -0,0 +1,43 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +import pytest + +import polars as pl + +from cudf_polars.testing.asserts import assert_gpu_result_equal + + +@pytest.mark.parametrize( + "subset", + [ + None, + ["a", "c"], + ["b", "c", "d"], + ["b", "d"], + ["b", "c"], + ["c", "e"], + ["d", "e"], + pl.selectors.string(), + pl.selectors.integer(), + ], +) +@pytest.mark.parametrize("predicate_pushdown", [False, True]) +def test_scan_drop_nulls(subset, predicate_pushdown): + df = pl.LazyFrame( + { + "a": [1, 2, 3, 4], + "b": [None, 4, 5, None], + "c": [6, 7, None, None], + "d": [8, None, 9, 10], + "e": [None, None, "A", None], + } + ) + # Drop nulls are pushed into filters + q = df.drop_nulls(subset) + + assert_gpu_result_equal( + q, collect_kwargs={"predicate_pushdown": predicate_pushdown} + ) diff --git a/python/cudf_polars/tests/test_scan.py b/python/cudf_polars/tests/test_scan.py index b2443e357e2..f129cc7ca32 100644 --- a/python/cudf_polars/tests/test_scan.py +++ b/python/cudf_polars/tests/test_scan.py @@ -6,7 +6,10 @@ import polars as pl -from cudf_polars.testing.asserts import assert_gpu_result_equal +from cudf_polars.testing.asserts import ( + assert_gpu_result_equal, + assert_ir_translation_raises, +) @pytest.fixture( @@ -86,3 +89,11 @@ def test_scan(df, columns, mask): if columns is not None: q = df.select(*columns) assert_gpu_result_equal(q) + + +def test_scan_unsupported_raises(tmp_path): + df = pl.DataFrame({"a": [1, 2, 3]}) + + df.write_ndjson(tmp_path / "df.json") + q = pl.scan_ndjson(tmp_path / "df.json") + assert_ir_translation_raises(q, NotImplementedError) diff --git a/python/cudf_polars/tests/testing/__init__.py b/python/cudf_polars/tests/testing/__init__.py new file mode 100644 index 00000000000..4611d642f14 --- /dev/null +++ b/python/cudf_polars/tests/testing/__init__.py @@ -0,0 +1,6 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +__all__: list[str] = [] diff --git a/python/cudf_polars/tests/testing/test_asserts.py b/python/cudf_polars/tests/testing/test_asserts.py new file mode 100644 index 00000000000..5bc2fe1efb7 --- /dev/null +++ b/python/cudf_polars/tests/testing/test_asserts.py @@ -0,0 +1,35 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +import pytest + +import polars as pl + +from cudf_polars.testing.asserts import ( + assert_gpu_result_equal, + assert_ir_translation_raises, +) + + +def test_translation_assert_raises(): + df = pl.LazyFrame({"a": [1, 2, 3]}) + + # This should succeed + assert_gpu_result_equal(df) + + with pytest.raises(AssertionError): + # This should fail, because we can translate this query. + assert_ir_translation_raises(df, NotImplementedError) + + class E(Exception): + pass + + unsupported = df.group_by("a").agg(pl.col("a").cum_max().alias("b")) + # Unsupported query should raise NotImplementedError + assert_ir_translation_raises(unsupported, NotImplementedError) + + with pytest.raises(AssertionError): + # This should fail, because we can't translate this query, but it doesn't raise E. 
+        assert_ir_translation_raises(unsupported, E)

From ac3c8dddda2fac2cb02c8a8ee58d827c00ddf867 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Mon, 24 Jun 2024 08:09:36 -0400
Subject: [PATCH 132/340] Fix memory size in create_byte_range_infos_consecutive
 (#16012)

Fixes over-allocated memory for the range vector in
`cudf::io::text::create_byte_range_infos_consecutive`

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Muhammad Haseeb (https://github.com/mhaseeb123)
  - Karthikeyan (https://github.com/karthikeyann)

URL: https://github.com/rapidsai/cudf/pull/16012
---
 cpp/src/io/text/byte_range_info.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/src/io/text/byte_range_info.cpp b/cpp/src/io/text/byte_range_info.cpp
index 290e0451839..6a7836ed4e1 100644
--- a/cpp/src/io/text/byte_range_info.cpp
+++ b/cpp/src/io/text/byte_range_info.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -31,7 +31,7 @@ std::vector create_byte_range_infos_consecutive(int64_t total_b
   auto range_size = util::div_rounding_up_safe(total_bytes, range_count);
 
   auto ranges = std::vector();
-  ranges.reserve(range_size);
+  ranges.reserve(range_count);
 
   for (int64_t i = 0; i < range_count; i++) {
     auto offset = i * range_size;

From ed41668eee28350183ceda29daf56c3ac7fa78ed Mon Sep 17 00:00:00 2001
From: Ed Seidl
Date: Mon, 24 Jun 2024 07:57:22 -0700
Subject: [PATCH 133/340] Add test of interoperability of cuDF and arrow
 BYTE_STREAM_SPLIT encoders (#15832)

BYTE_STREAM_SPLIT encoding was recently added to cuDF (#15311). The Parquet
specification was since changed (https://github.com/apache/parquet-format/pull/229)
to extend the data types that can be encoded as BYTE_STREAM_SPLIT, and this
was only recently implemented in arrow (https://github.com/apache/arrow/pull/40094).
This PR adds a check that cuDF and arrow can produce compatible files using
BYTE_STREAM_SPLIT encoding.
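For reference, the encoding itself is simple to model: for values that are K
bytes wide, byte k of every value is gathered into stream k, and the K streams
are concatenated; grouping bytes of equal significance tends to expose runs
that compress well. A small illustrative round trip (a sketch, not cuDF or
arrow code):

```python
import numpy as np

values = np.arange(8, dtype=np.float32)
raw = values.tobytes()
width = values.itemsize  # K = 4 byte streams for float32

# Encode: stream k holds byte k of each value.
encoded = b"".join(raw[k::width] for k in range(width))

# Decode: interleave the streams back into whole values.
n = len(values)
decoded = bytes(encoded[k * n + i] for i in range(n) for k in range(width))
assert np.frombuffer(decoded, dtype=np.float32).tolist() == values.tolist()
```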
Authors:
  - Ed Seidl (https://github.com/etseidl)

Approvers:
  - Lawrence Mitchell (https://github.com/wence-)

URL: https://github.com/rapidsai/cudf/pull/15832
---
 python/cudf/cudf/tests/test_parquet.py | 55 ++++++++++++++++++++++++++
 1 file changed, 55 insertions(+)

diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py
index 2596fe8cd37..af79f361b43 100644
--- a/python/cudf/cudf/tests/test_parquet.py
+++ b/python/cudf/cudf/tests/test_parquet.py
@@ -2947,6 +2947,61 @@ def test_per_column_options_string_col(tmpdir, encoding):
     assert encoding in fmd.row_group(0).column(0).encodings
 
 
+@pytest.mark.parametrize(
+    "num_rows",
+    [200, 10000],
+)
+def test_parquet_bss_round_trip(tmpdir, num_rows):
+    def flba(i):
+        hasher = hashlib.sha256()
+        hasher.update(i.to_bytes(4, "little"))
+        return hasher.digest()
+
+    # use pyarrow to write table of types that support BYTE_STREAM_SPLIT encoding
+    rows_per_rowgroup = 5000
+    fixed_data = pa.array(
+        [flba(i) for i in range(num_rows)], type=pa.binary(32)
+    )
+    i32_data = pa.array(list(range(num_rows)), type=pa.int32())
+    i64_data = pa.array(list(range(num_rows)), type=pa.int64())
+    f32_data = pa.array([float(i) for i in range(num_rows)], type=pa.float32())
+    f64_data = pa.array([float(i) for i in range(num_rows)], type=pa.float64())
+    padf = pa.Table.from_arrays(
+        [fixed_data, i32_data, i64_data, f32_data, f64_data],
+        names=["flba", "i32", "i64", "f32", "f64"],
+    )
+    padf_fname = tmpdir.join("padf.parquet")
+    pq.write_table(
+        padf,
+        padf_fname,
+        column_encoding="BYTE_STREAM_SPLIT",
+        use_dictionary=False,
+        row_group_size=rows_per_rowgroup,
+    )
+
+    # round trip data with cudf
+    cdf = cudf.read_parquet(padf_fname)
+    cdf_fname = tmpdir.join("cdf.parquet")
+    cdf.to_parquet(
+        cdf_fname,
+        column_type_length={"flba": 32},
+        column_encoding={
+            "flba": "BYTE_STREAM_SPLIT",
+            "i32": "BYTE_STREAM_SPLIT",
+            "i64": "BYTE_STREAM_SPLIT",
+            "f32": "BYTE_STREAM_SPLIT",
+            "f64": "BYTE_STREAM_SPLIT",
+        },
+        row_group_size_rows=rows_per_rowgroup,
+    )
+
+    # now read back in with pyarrow to test it was written properly by cudf
+    padf2 = pq.read_table(padf_fname)
+    padf3 = pq.read_table(cdf_fname)
+    assert_eq(padf2, padf3)
+    assert_eq(padf2.schema[0].type, padf3.schema[0].type)
+
+
 def test_parquet_reader_rle_boolean(datadir):
     fname = datadir / "rle_boolean_encoding.parquet"
 
From c33e0a349b2d0c2a626364845e616cfd3d04afc6 Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell
Date: Mon, 24 Jun 2024 17:18:19 +0100
Subject: [PATCH 134/340] Add coverage for both expression and dataframe filter
 (#16002)

Note that an expression filter with a literal mask does not work because
broadcasting is not implemented. For scalar masks, the result could also be
computed without broadcasting, given some data introspection, but we do not
do that here.
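A sketch of the two cases, using the standard polars API (the literal variant
is the one expected to fail on the GPU engine, as the xfail below records):

```python
import polars as pl

ldf = pl.LazyFrame({"a": [1, 2, 3], "b": [0, 3, 1]})

# Column-shaped mask: supported, no broadcasting required.
supported = ldf.select(pl.col("a").filter(pl.col("b") > 2))

# Scalar (literal) mask: would require broadcasting the scalar to a mask.
unsupported = ldf.select(pl.col("a").filter(pl.lit(value=False)))
```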
Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - Thomas Li (https://github.com/lithomas1) URL: https://github.com/rapidsai/cudf/pull/16002 --- .../tests/expressions/test_filter.py | 30 ++++++++++++++----- python/cudf_polars/tests/test_filter.py | 26 ++++++++++++++++ 2 files changed, 49 insertions(+), 7 deletions(-) create mode 100644 python/cudf_polars/tests/test_filter.py diff --git a/python/cudf_polars/tests/expressions/test_filter.py b/python/cudf_polars/tests/expressions/test_filter.py index 783403d764c..1a8e994e3aa 100644 --- a/python/cudf_polars/tests/expressions/test_filter.py +++ b/python/cudf_polars/tests/expressions/test_filter.py @@ -2,19 +2,35 @@ # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations +import pytest + import polars as pl from cudf_polars.testing.asserts import assert_gpu_result_equal -def test_filter(): - ldf = pl.DataFrame( +@pytest.mark.parametrize( + "expr", + [ + pytest.param( + pl.lit(value=False), + marks=pytest.mark.xfail(reason="Expression filter does not handle scalars"), + ), + pl.col("c"), + pl.col("b") > 2, + ], +) +@pytest.mark.parametrize("predicate_pushdown", [False, True]) +def test_filter_expression(expr, predicate_pushdown): + ldf = pl.LazyFrame( { "a": [1, 2, 3, 4, 5, 6, 7], - "b": [1, 1, 1, 1, 1, 1, 1], + "b": [0, 3, 1, 5, 6, 1, 0], + "c": [None, True, False, False, True, True, False], } - ).lazy() + ) - # group-by is just to avoid the filter being pushed into the scan. - query = ldf.group_by(pl.col("a")).agg(pl.col("b").sum()).filter(pl.col("b") < 1) - assert_gpu_result_equal(query) + query = ldf.select(pl.col("a").filter(expr)) + assert_gpu_result_equal( + query, collect_kwargs={"predicate_pushdown": predicate_pushdown} + ) diff --git a/python/cudf_polars/tests/test_filter.py b/python/cudf_polars/tests/test_filter.py new file mode 100644 index 00000000000..f39b348144b --- /dev/null +++ b/python/cudf_polars/tests/test_filter.py @@ -0,0 +1,26 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +import pytest + +import polars as pl + +from cudf_polars.testing.asserts import assert_gpu_result_equal + + +@pytest.mark.parametrize("expr", [pl.col("c"), pl.col("b") < 1, pl.lit(value=True)]) +@pytest.mark.parametrize("predicate_pushdown", [False, True]) +def test_filter(expr, predicate_pushdown): + ldf = pl.DataFrame( + { + "a": [1, 2, 3, 4, 5, 6, 7], + "b": [1, 1, 1, 1, 1, 1, 1], + "c": [True, False, False, True, True, True, None], + } + ).lazy() + + query = ldf.filter(expr) + assert_gpu_result_equal( + query, collect_kwargs={"predicate_pushdown": predicate_pushdown} + ) From f3183c11a71f90cd1096d95f6ded5ecf38b49a55 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Mon, 24 Jun 2024 17:24:24 +0100 Subject: [PATCH 135/340] Add full coverage for whole-frame Agg expressions (#15997) Also add more expansive comments on the unreachable paths. 
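For context, the nan-handling distinction the new tests exercise, sketched
with standard polars expressions (illustration only): plain `min`/`max` are
meant to mask NaNs out before reducing, while `nan_min`/`nan_max` propagate
any NaN to the result.

```python
import polars as pl

df = pl.LazyFrame({"a": [1.0, 2.0, float("nan")]})

masking = df.select(pl.col("a").min())          # NaNs masked before reducing
propagating = df.select(pl.col("a").nan_max())  # a single NaN poisons the result
```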
Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - Thomas Li (https://github.com/lithomas1) URL: https://github.com/rapidsai/cudf/pull/15997 --- python/cudf_polars/cudf_polars/dsl/expr.py | 58 ++++++++----------- .../cudf_polars/tests/expressions/test_agg.py | 14 +++++ 2 files changed, 38 insertions(+), 34 deletions(-) diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index c92e0714d54..73f3c1ce289 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -952,7 +952,9 @@ def __init__( self.options = options self.children = (value,) if name not in Agg._SUPPORTED: - raise NotImplementedError(f"Unsupported aggregation {name=}") + raise NotImplementedError( + f"Unsupported aggregation {name=}" + ) # pragma: no cover; all valid aggs are supported # TODO: nan handling in groupby case if name == "min": req = plc.aggregation.min() @@ -978,7 +980,9 @@ def __init__( elif name == "count": req = plc.aggregation.count(null_handling=plc.types.NullPolicy.EXCLUDE) else: - raise NotImplementedError + raise NotImplementedError( + f"Unreachable, {name=} is incorrectly listed in _SUPPORTED" + ) # pragma: no cover self.request = req op = getattr(self, f"_{name}", None) if op is None: @@ -988,7 +992,9 @@ def __init__( elif name in {"count", "first", "last"}: pass else: - raise AssertionError + raise NotImplementedError( + f"Unreachable, supported agg {name=} has no implementation" + ) # pragma: no cover self.op = op _SUPPORTED: ClassVar[frozenset[str]] = frozenset( @@ -1010,11 +1016,15 @@ def __init__( def collect_agg(self, *, depth: int) -> AggInfo: """Collect information about aggregations in groupbys.""" if depth >= 1: - raise NotImplementedError("Nested aggregations in groupby") + raise NotImplementedError( + "Nested aggregations in groupby" + ) # pragma: no cover; check_agg trips first (child,) = self.children ((expr, _, _),) = child.collect_agg(depth=depth + 1).requests if self.request is None: - raise NotImplementedError(f"Aggregation {self.name} in groupby") + raise NotImplementedError( + f"Aggregation {self.name} in groupby" + ) # pragma: no cover; __init__ trips first return AggInfo([(expr, self.request, self)]) def _reduce( @@ -1024,10 +1034,7 @@ def _reduce( plc.Column.from_scalar( plc.reduce.reduce(column.obj, request, self.dtype), 1, - ), - is_sorted=plc.types.Sorted.YES, - order=plc.types.Order.ASCENDING, - null_order=plc.types.NullOrder.BEFORE, + ) ) def _count(self, column: Column) -> Column: @@ -1040,10 +1047,7 @@ def _count(self, column: Column) -> Column: ), ), 1, - ), - is_sorted=plc.types.Sorted.YES, - order=plc.types.Order.ASCENDING, - null_order=plc.types.NullOrder.BEFORE, + ) ) def _min(self, column: Column, *, propagate_nans: bool) -> Column: @@ -1054,10 +1058,7 @@ def _min(self, column: Column, *, propagate_nans: bool) -> Column: pa.scalar(float("nan"), type=plc.interop.to_arrow(self.dtype)) ), 1, - ), - is_sorted=plc.types.Sorted.YES, - order=plc.types.Order.ASCENDING, - null_order=plc.types.NullOrder.BEFORE, + ) ) if column.nan_count > 0: column = column.mask_nans() @@ -1071,31 +1072,18 @@ def _max(self, column: Column, *, propagate_nans: bool) -> Column: pa.scalar(float("nan"), type=plc.interop.to_arrow(self.dtype)) ), 1, - ), - is_sorted=plc.types.Sorted.YES, - order=plc.types.Order.ASCENDING, - null_order=plc.types.NullOrder.BEFORE, + ) ) if column.nan_count > 0: column = column.mask_nans() return self._reduce(column, request=plc.aggregation.max()) def 
_first(self, column: Column) -> Column: - return Column( - plc.copying.slice(column.obj, [0, 1])[0], - is_sorted=plc.types.Sorted.YES, - order=plc.types.Order.ASCENDING, - null_order=plc.types.NullOrder.BEFORE, - ) + return Column(plc.copying.slice(column.obj, [0, 1])[0]) def _last(self, column: Column) -> Column: n = column.obj.size() - return Column( - plc.copying.slice(column.obj, [n - 1, n])[0], - is_sorted=plc.types.Sorted.YES, - order=plc.types.Order.ASCENDING, - null_order=plc.types.NullOrder.BEFORE, - ) + return Column(plc.copying.slice(column.obj, [n - 1, n])[0]) def do_evaluate( self, @@ -1106,7 +1094,9 @@ def do_evaluate( ) -> Column: """Evaluate this expression given a dataframe for context.""" if context is not ExecutionContext.FRAME: - raise NotImplementedError(f"Agg in context {context}") + raise NotImplementedError( + f"Agg in context {context}" + ) # pragma: no cover; unreachable (child,) = self.children return self.op(child.evaluate(df, context=context, mapping=mapping)) diff --git a/python/cudf_polars/tests/expressions/test_agg.py b/python/cudf_polars/tests/expressions/test_agg.py index b044bbb2885..2ffa1c4af6d 100644 --- a/python/cudf_polars/tests/expressions/test_agg.py +++ b/python/cudf_polars/tests/expressions/test_agg.py @@ -56,3 +56,17 @@ def test_agg(df, agg): with pytest.raises(AssertionError): assert_gpu_result_equal(q) assert_gpu_result_equal(q, check_dtypes=check_dtypes, check_exact=False) + + +@pytest.mark.parametrize( + "propagate_nans", + [pytest.param(False, marks=pytest.mark.xfail(reason="Need to mask nans")), True], + ids=["mask_nans", "propagate_nans"], +) +@pytest.mark.parametrize("op", ["min", "max"]) +def test_agg_float_with_nans(propagate_nans, op): + df = pl.LazyFrame({"a": [1, 2, float("nan")]}) + op = getattr(pl.Expr, f"nan_{op}" if propagate_nans else op) + q = df.select(op(pl.col("a"))) + + assert_gpu_result_equal(q) From 0c6b828118fa371e3fd333718bc872085373a076 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 24 Jun 2024 07:05:37 -1000 Subject: [PATCH 136/340] Restrict the allowed pandas timezone objects in cudf (#16013) Since cudf's timezone support is based on the OS's tz data and hence `zoneinfo`, cudf cannot naturally support the variety of timezone objects supported by pandas (`pytz`, `dateutil`, etc). Therefore: * In pandas compatible mode, only accept pandas objects with zoneinfo timezones. * Otherwise, try to convert the pandas timezone to an equivalent zoneinfo object e.g. `pytz.timezone("US/Pacific")`-> `zoneinfo.ZoneInfo("US/Pacific")` Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/16013 --- python/cudf/cudf/core/_internals/timezones.py | 33 ++++++++++++++- python/cudf/cudf/core/column/column.py | 16 ++++++++ python/cudf/cudf/core/column/datetime.py | 33 +++++++-------- .../tests/indexes/datetime/test_indexing.py | 12 +++--- .../indexes/datetime/test_time_specific.py | 13 +++--- .../cudf/tests/series/test_datetimelike.py | 40 ++++++++++++++++--- 6 files changed, 108 insertions(+), 39 deletions(-) diff --git a/python/cudf/cudf/core/_internals/timezones.py b/python/cudf/cudf/core/_internals/timezones.py index 269fcf3e37f..29cb9d7bd12 100644 --- a/python/cudf/cudf/core/_internals/timezones.py +++ b/python/cudf/cudf/core/_internals/timezones.py @@ -1,21 +1,50 @@ # Copyright (c) 2023-2024, NVIDIA CORPORATION. 
from __future__ import annotations +import datetime import os import zoneinfo from functools import lru_cache from typing import TYPE_CHECKING, Literal import numpy as np +import pandas as pd +import cudf from cudf._lib.timezone import make_timezone_transition_table -from cudf.core.column.column import as_column if TYPE_CHECKING: from cudf.core.column.datetime import DatetimeColumn from cudf.core.column.timedelta import TimeDeltaColumn +def get_compatible_timezone(dtype: pd.DatetimeTZDtype) -> pd.DatetimeTZDtype: + """Convert dtype.tz object to zoneinfo object if possible.""" + tz = dtype.tz + if isinstance(tz, zoneinfo.ZoneInfo): + return dtype + if cudf.get_option("mode.pandas_compatible"): + raise NotImplementedError( + f"{tz} must be a zoneinfo.ZoneInfo object in pandas_compatible mode." + ) + elif (tzname := getattr(tz, "zone", None)) is not None: + # pytz-like + key = tzname + elif (tz_file := getattr(tz, "_filename", None)) is not None: + # dateutil-like + key = tz_file.split("zoneinfo/")[-1] + elif isinstance(tz, datetime.tzinfo): + # Try to get UTC-like tzinfos + reference = datetime.datetime.now() + key = tz.tzname(reference) + if not (isinstance(key, str) and key.lower() == "utc"): + raise NotImplementedError(f"cudf does not support {tz}") + else: + raise NotImplementedError(f"cudf does not support {tz}") + new_tz = zoneinfo.ZoneInfo(key) + return pd.DatetimeTZDtype(dtype.unit, new_tz) + + @lru_cache(maxsize=20) def get_tz_data(zone_name: str) -> tuple[DatetimeColumn, TimeDeltaColumn]: """ @@ -87,6 +116,8 @@ def _read_tzfile_as_columns( ) if not transition_times_and_offsets: + from cudf.core.column.column import as_column + # this happens for UTC-like zones min_date = np.int64(np.iinfo("int64").min + 1).astype("M8[s]") return (as_column([min_date]), as_column([np.timedelta64(0, "s")])) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index c4e715aeb45..586689e2ee3 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -47,6 +47,7 @@ is_string_dtype, ) from cudf.core._compat import PANDAS_GE_210 +from cudf.core._internals.timezones import get_compatible_timezone from cudf.core.abc import Serializable from cudf.core.buffer import ( Buffer, @@ -1854,6 +1855,21 @@ def as_column( arbitrary.dtype, (pd.CategoricalDtype, pd.IntervalDtype, pd.DatetimeTZDtype), ): + if isinstance(arbitrary.dtype, pd.DatetimeTZDtype): + new_tz = get_compatible_timezone(arbitrary.dtype) + arbitrary = arbitrary.astype(new_tz) + if isinstance(arbitrary.dtype, pd.CategoricalDtype) and isinstance( + arbitrary.dtype.categories.dtype, pd.DatetimeTZDtype + ): + new_tz = get_compatible_timezone( + arbitrary.dtype.categories.dtype + ) + new_cats = arbitrary.dtype.categories.astype(new_tz) + new_dtype = pd.CategoricalDtype( + categories=new_cats, ordered=arbitrary.dtype.ordered + ) + arbitrary = arbitrary.astype(new_dtype) + return as_column( pa.array(arbitrary, from_pandas=True), nan_as_null=nan_as_null, diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 9ac761b6be1..d88553361dd 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -21,6 +21,11 @@ from cudf._lib.search import search_sorted from cudf.api.types import is_datetime64_dtype, is_scalar, is_timedelta64_dtype from cudf.core._compat import PANDAS_GE_220 +from cudf.core._internals.timezones import ( + check_ambiguous_and_nonexistent, + get_compatible_timezone, + 
get_tz_data, +) from cudf.core.column import ColumnBase, as_column, column, string from cudf.core.column.timedelta import _unit_to_nanoseconds_conversion from cudf.utils.dtypes import _get_base_dtype @@ -282,8 +287,6 @@ def __contains__(self, item: ScalarLike) -> bool: @functools.cached_property def time_unit(self) -> str: - if isinstance(self.dtype, pd.DatetimeTZDtype): - return self.dtype.unit return np.datetime_data(self.dtype)[0] @property @@ -725,8 +728,6 @@ def _find_ambiguous_and_nonexistent( transitions occur in the time zone database for the given timezone. If no transitions occur, the tuple `(False, False)` is returned. """ - from cudf.core._internals.timezones import get_tz_data - transition_times, offsets = get_tz_data(zone_name) offsets = offsets.astype(f"timedelta64[{self.time_unit}]") # type: ignore[assignment] @@ -785,26 +786,22 @@ def tz_localize( ambiguous: Literal["NaT"] = "NaT", nonexistent: Literal["NaT"] = "NaT", ): - from cudf.core._internals.timezones import ( - check_ambiguous_and_nonexistent, - get_tz_data, - ) - if tz is None: return self.copy() ambiguous, nonexistent = check_ambiguous_and_nonexistent( ambiguous, nonexistent ) - dtype = pd.DatetimeTZDtype(self.time_unit, tz) + dtype = get_compatible_timezone(pd.DatetimeTZDtype(self.time_unit, tz)) + tzname = dtype.tz.key ambiguous_col, nonexistent_col = self._find_ambiguous_and_nonexistent( - tz + tzname ) localized = self._scatter_by_column( self.isnull() | (ambiguous_col | nonexistent_col), cudf.Scalar(cudf.NaT, dtype=self.dtype), ) - transition_times, offsets = get_tz_data(tz) + transition_times, offsets = get_tz_data(tzname) transition_times_local = (transition_times + offsets).astype( localized.dtype ) @@ -845,7 +842,7 @@ def __init__( offset=offset, null_count=null_count, ) - self._dtype = dtype + self._dtype = get_compatible_timezone(dtype) def to_pandas( self, @@ -865,6 +862,10 @@ def to_arrow(self): self._local_time.to_arrow(), str(self.dtype.tz) ) + @functools.cached_property + def time_unit(self) -> str: + return self.dtype.unit + @property def _utc_time(self): """Return UTC time as naive timestamps.""" @@ -880,8 +881,6 @@ def _utc_time(self): @property def _local_time(self): """Return the local time as naive timestamps.""" - from cudf.core._internals.timezones import get_tz_data - transition_times, offsets = get_tz_data(str(self.dtype.tz)) transition_times = transition_times.astype(_get_base_dtype(self.dtype)) indices = search_sorted([transition_times], [self], "right") - 1 @@ -911,10 +910,6 @@ def __repr__(self): ) def tz_localize(self, tz: str | None, ambiguous="NaT", nonexistent="NaT"): - from cudf.core._internals.timezones import ( - check_ambiguous_and_nonexistent, - ) - if tz is None: return self._local_time ambiguous, nonexistent = check_ambiguous_and_nonexistent( diff --git a/python/cudf/cudf/tests/indexes/datetime/test_indexing.py b/python/cudf/cudf/tests/indexes/datetime/test_indexing.py index f2c2d9a263b..ee4d0f7e816 100644 --- a/python/cudf/cudf/tests/indexes/datetime/test_indexing.py +++ b/python/cudf/cudf/tests/indexes/datetime/test_indexing.py @@ -1,4 +1,5 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. 
+import zoneinfo import pandas as pd @@ -7,13 +8,10 @@ def test_slice_datetimetz_index(): + tz = zoneinfo.ZoneInfo("US/Eastern") data = ["2001-01-01", "2001-01-02", None, None, "2001-01-03"] - pidx = pd.DatetimeIndex(data, dtype="datetime64[ns]").tz_localize( - "US/Eastern" - ) - idx = cudf.DatetimeIndex(data, dtype="datetime64[ns]").tz_localize( - "US/Eastern" - ) + pidx = pd.DatetimeIndex(data, dtype="datetime64[ns]").tz_localize(tz) + idx = cudf.DatetimeIndex(data, dtype="datetime64[ns]").tz_localize(tz) expected = pidx[1:4] got = idx[1:4] assert_eq(expected, got) diff --git a/python/cudf/cudf/tests/indexes/datetime/test_time_specific.py b/python/cudf/cudf/tests/indexes/datetime/test_time_specific.py index b28ef131025..77b32b8ce89 100644 --- a/python/cudf/cudf/tests/indexes/datetime/test_time_specific.py +++ b/python/cudf/cudf/tests/indexes/datetime/test_time_specific.py @@ -1,4 +1,6 @@ # Copyright (c) 2022-2024, NVIDIA CORPORATION. +import zoneinfo + import pandas as pd import cudf @@ -6,24 +8,21 @@ def test_tz_localize(): + tz = zoneinfo.ZoneInfo("America/New_York") pidx = pd.date_range("2001-01-01", "2001-01-02", freq="1s") pidx = pidx.astype(" Date: Mon, 24 Jun 2024 18:25:10 +0100 Subject: [PATCH 137/340] Add tests of expression-based sort and sort-by (#16008) We only need stable vs unstable variants for the sort-by case, since when sorting a single column by itself there is no distinction between stable and unstable. Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/16008 --- .../tests/expressions/test_sort.py | 53 +++++++++++++++++++ 1 file changed, 53 insertions(+) create mode 100644 python/cudf_polars/tests/expressions/test_sort.py diff --git a/python/cudf_polars/tests/expressions/test_sort.py b/python/cudf_polars/tests/expressions/test_sort.py new file mode 100644 index 00000000000..0195266f5c6 --- /dev/null +++ b/python/cudf_polars/tests/expressions/test_sort.py @@ -0,0 +1,53 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +import itertools + +import pytest + +import polars as pl + +from cudf_polars.testing.asserts import assert_gpu_result_equal + + +@pytest.mark.parametrize("descending", [False, True]) +@pytest.mark.parametrize("nulls_last", [False, True]) +def test_sort_expression(descending, nulls_last): + ldf = pl.LazyFrame( + { + "a": [5, -1, 3, 4, None, 8, 6, 7, None], + } + ) + + query = ldf.select(pl.col("a").sort(descending=descending, nulls_last=nulls_last)) + assert_gpu_result_equal(query) + + +@pytest.mark.parametrize( + "descending", itertools.combinations_with_replacement([False, True], 3) +) +@pytest.mark.parametrize( + "nulls_last", itertools.combinations_with_replacement([False, True], 3) +) +@pytest.mark.parametrize("maintain_order", [False, True], ids=["unstable", "stable"]) +def test_sort_by_expression(descending, nulls_last, maintain_order): + ldf = pl.LazyFrame( + { + "a": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], + "b": [1, 2, 2, 3, 9, 5, -1, 2, -2, 16], + "c": ["a", "A", "b", "b", "c", "d", "A", "Z", "ä", "̈Ä"], + } + ) + + query = ldf.select( + pl.col("a").sort_by( + pl.col("b"), + pl.col("c"), + pl.col("b") + pl.col("a"), + descending=descending, + nulls_last=nulls_last, + maintain_order=maintain_order, + ) + ) + assert_gpu_result_equal(query, check_row_order=maintain_order) From 4d4cdce2128398444a15f705d05ca062a6f0300f Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Mon, 24 Jun 2024 18:51:51 +0100 Subject: [PATCH 138/340] Add full coverage of utility functions (#15995) The datetime conversion tests just test that we can round-trip correctly for now. Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - Thomas Li (https://github.com/lithomas1) - Kyle Edwards (https://github.com/KyleFromNVIDIA) URL: https://github.com/rapidsai/cudf/pull/15995 --- .../cudf_polars/cudf_polars/utils/dtypes.py | 4 +-- .../cudf_polars/cudf_polars/utils/sorting.py | 4 +-- python/cudf_polars/pyproject.toml | 7 ++++ .../tests/expressions/test_datetime_basic.py | 34 +++++++++++++++++++ python/cudf_polars/tests/utils/test_dtypes.py | 31 +++++++++++++++++ .../cudf_polars/tests/utils/test_sorting.py | 21 ++++++++++++ 6 files changed, 97 insertions(+), 4 deletions(-) create mode 100644 python/cudf_polars/tests/expressions/test_datetime_basic.py create mode 100644 python/cudf_polars/tests/utils/test_dtypes.py create mode 100644 python/cudf_polars/tests/utils/test_sorting.py diff --git a/python/cudf_polars/cudf_polars/utils/dtypes.py b/python/cudf_polars/cudf_polars/utils/dtypes.py index 7b0049daf11..3d4a643e1fc 100644 --- a/python/cudf_polars/cudf_polars/utils/dtypes.py +++ b/python/cudf_polars/cudf_polars/utils/dtypes.py @@ -70,7 +70,7 @@ def from_polars(dtype: pl.DataType) -> plc.DataType: return plc.DataType(plc.TypeId.TIMESTAMP_MICROSECONDS) elif dtype.time_unit == "ns": return plc.DataType(plc.TypeId.TIMESTAMP_NANOSECONDS) - assert dtype.time_unit is not None + assert dtype.time_unit is not None # pragma: no cover assert_never(dtype.time_unit) elif isinstance(dtype, pl.Duration): if dtype.time_unit == "ms": @@ -79,7 +79,7 @@ def from_polars(dtype: pl.DataType) -> plc.DataType: return plc.DataType(plc.TypeId.DURATION_MICROSECONDS) elif dtype.time_unit == "ns": return plc.DataType(plc.TypeId.DURATION_NANOSECONDS) - assert dtype.time_unit is not None + assert dtype.time_unit is not None # pragma: no cover assert_never(dtype.time_unit) elif isinstance(dtype, pl.String): return plc.DataType(plc.TypeId.STRING) diff --git 
a/python/cudf_polars/cudf_polars/utils/sorting.py b/python/cudf_polars/cudf_polars/utils/sorting.py index 24fd449dd88..57f94c4ec4c 100644 --- a/python/cudf_polars/cudf_polars/utils/sorting.py +++ b/python/cudf_polars/cudf_polars/utils/sorting.py @@ -43,8 +43,8 @@ def sort_order( for d in descending ] null_precedence = [] - # TODO: use strict=True when we drop py39 - assert len(descending) == len(nulls_last) + if len(descending) != len(nulls_last) or len(descending) != num_keys: + raise ValueError("Mismatching length of arguments in sort_order") for asc, null_last in zip(column_order, nulls_last): if (asc == plc.types.Order.ASCENDING) ^ (not null_last): null_precedence.append(plc.types.NullOrder.AFTER) diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml index face04b9bd8..effa4861e0c 100644 --- a/python/cudf_polars/pyproject.toml +++ b/python/cudf_polars/pyproject.toml @@ -52,6 +52,13 @@ version = {file = "cudf_polars/VERSION"} [tool.pytest.ini_options] xfail_strict = true +[tool.coverage.report] +exclude_also = [ + "if TYPE_CHECKING:", + "class .*\\bProtocol\\):", + "assert_never\\(" +] + [tool.ruff] line-length = 88 indent-width = 4 diff --git a/python/cudf_polars/tests/expressions/test_datetime_basic.py b/python/cudf_polars/tests/expressions/test_datetime_basic.py new file mode 100644 index 00000000000..6ba2a1dce1e --- /dev/null +++ b/python/cudf_polars/tests/expressions/test_datetime_basic.py @@ -0,0 +1,34 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +import pytest + +import polars as pl + +from cudf_polars.testing.asserts import assert_gpu_result_equal + + +@pytest.mark.parametrize( + "dtype", + [ + pl.Date(), + pl.Datetime("ms"), + pl.Datetime("us"), + pl.Datetime("ns"), + pl.Duration("ms"), + pl.Duration("us"), + pl.Duration("ns"), + ], + ids=repr, +) +def test_datetime_dataframe_scan(dtype): + ldf = pl.DataFrame( + { + "a": pl.Series([1, 2, 3, 4, 5, 6, 7], dtype=dtype), + "b": pl.Series([3, 4, 5, 6, 7, 8, 9], dtype=pl.UInt16), + } + ).lazy() + + query = ldf.select(pl.col("b"), pl.col("a")) + assert_gpu_result_equal(query) diff --git a/python/cudf_polars/tests/utils/test_dtypes.py b/python/cudf_polars/tests/utils/test_dtypes.py new file mode 100644 index 00000000000..535fdd846a0 --- /dev/null +++ b/python/cudf_polars/tests/utils/test_dtypes.py @@ -0,0 +1,31 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +import pytest + +import polars as pl + +from cudf_polars.utils.dtypes import from_polars + + +@pytest.mark.parametrize( + "pltype", + [ + pl.Time(), + pl.Struct({"a": pl.Int8, "b": pl.Float32}), + pl.Datetime("ms", time_zone="US/Pacific"), + pl.Array(pl.Int8, 2), + pl.Binary(), + pl.Categorical(), + pl.Enum(["a", "b"]), + pl.Field("a", pl.Int8), + pl.Object(), + pl.Unknown(), + ], + ids=repr, +) +def test_unhandled_dtype_conversion_raises(pltype): + with pytest.raises(NotImplementedError): + _ = from_polars(pltype) diff --git a/python/cudf_polars/tests/utils/test_sorting.py b/python/cudf_polars/tests/utils/test_sorting.py new file mode 100644 index 00000000000..4e98a3a7ce7 --- /dev/null +++ b/python/cudf_polars/tests/utils/test_sorting.py @@ -0,0 +1,21 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0
+
+from __future__ import annotations
+
+import pytest
+
+from cudf_polars.utils.sorting import sort_order
+
+
+@pytest.mark.parametrize(
+    "descending,nulls_last,num_keys",
+    [
+        ([True], [False, True], 3),
+        ([True, True], [False, True, False], 3),
+        ([False, True], [True], 3),
+    ],
+)
+def test_sort_order_raises_mismatch(descending, nulls_last, num_keys):
+    with pytest.raises(ValueError):
+        _ = sort_order(descending, nulls_last=nulls_last, num_keys=num_keys)

From 9987410c4baa275c9ae46801112bc4b6d8d6b057 Mon Sep 17 00:00:00 2001
From: Ed Seidl
Date: Mon, 24 Jun 2024 11:16:56 -0700
Subject: [PATCH 139/340] Account for FIXED_LEN_BYTE_ARRAY when calculating
 fragment sizes in Parquet writer (#16064)

The number of rows per fragment will be off by a factor of 4 for
FIXED_LEN_BYTE_ARRAY columns. This results in many more fragments than are
necessary to achieve user-requested page size limits. This PR shifts the
determination of whether a column has fixed-width data to a location where
knowledge of the schema can be used.

Authors:
  - Ed Seidl (https://github.com/etseidl)

Approvers:
  - Vukasin Milovanovic (https://github.com/vuule)
  - Muhammad Haseeb (https://github.com/mhaseeb123)

URL: https://github.com/rapidsai/cudf/pull/16064
---
 cpp/src/io/parquet/writer_impl.cu | 28 ++++++++++++----------------
 1 file changed, 12 insertions(+), 16 deletions(-)

diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu
index ca15b532d07..bed4dbc5a66 100644
--- a/cpp/src/io/parquet/writer_impl.cu
+++ b/cpp/src/io/parquet/writer_impl.cu
@@ -296,19 +296,6 @@ size_t column_size(column_view const& column, rmm::cuda_stream_view stream)
   CUDF_FAIL("Unexpected compound type");
 }
 
-// checks to see if the given column has a fixed size. This doesn't
-// check every row, so assumes string and list columns are not fixed, even
-// if each row is the same width.
-// TODO: update this if FIXED_LEN_BYTE_ARRAY is ever supported for writes.
-bool is_col_fixed_width(column_view const& column)
-{
-  if (column.type().id() == type_id::STRUCT) {
-    return std::all_of(column.child_begin(), column.child_end(), is_col_fixed_width);
-  }
-
-  return is_fixed_width(column.type());
-}
-
 /**
  * @brief Extends SchemaElement to add members required in constructing parquet_column_view
  *
@@ -946,6 +933,15 @@ struct parquet_column_view {
     return schema_node.converted_type.value_or(UNKNOWN);
   }
 
+  // Checks to see if the given column has a fixed-width data type. This doesn't
+  // check every value, so it assumes string and list columns are not fixed-width, even
+  // if each value has the same size.
+  [[nodiscard]] bool is_fixed_width() const
+  {
+    // lists and strings are not fixed width
+    return max_rep_level() == 0 and physical_type() != Type::BYTE_ARRAY;
+  }
+
   std::vector const& get_path_in_schema() { return path_in_schema; }
 
   // LIST related member functions
@@ -1764,7 +1760,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta,
   // unbalanced in final page sizes, so using 4 which seems to be a good
   // compromise at smoothing things out without getting fragment sizes too small.
   auto frag_size_fn = [&](auto const& col, size_t col_size) {
-    int const target_frags_per_page = is_col_fixed_width(col) ? 1 : 4;
+    int const target_frags_per_page = col.is_fixed_width() ? 1 : 4;
     auto const avg_len =
       target_frags_per_page * util::div_rounding_up_safe(col_size, input.num_rows());
     if (avg_len > 0) {
@@ -1775,8 +1771,8 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta,
     }
   };
 
-  std::transform(single_streams_table.begin(),
-                 single_streams_table.end(),
+  std::transform(parquet_columns.begin(),
+                 parquet_columns.end(),
                  column_sizes.begin(),
                  column_frag_size.begin(),
                  frag_size_fn);

From f583879e2fb90c104dee259b676e836ed6e60ca0 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com>
Date: Mon, 24 Jun 2024 13:40:08 -0500
Subject: [PATCH 140/340] More safely parse CUDA versions when subprocess
 output is contaminated (#16067)

In some user environments, calling a subprocess may produce output that
confuses the version parsing machinery inside `_ptxcompiler`. Since the
affected functions are vendored from the real `ptxcompiler` package for the
purposes of using them with CUDA 12, this fix only applies to these
situations for CUDA 12+.

Closes https://github.com/rapidsai/cudf/issues/16016.

Authors:
  - https://github.com/brandon-b-miller

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/16067
---
 python/cudf/cudf/utils/_ptxcompiler.py | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/python/cudf/cudf/utils/_ptxcompiler.py b/python/cudf/cudf/utils/_ptxcompiler.py
index 54f5ea08ee1..9d7071d55a5 100644
--- a/python/cudf/cudf/utils/_ptxcompiler.py
+++ b/python/cudf/cudf/utils/_ptxcompiler.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -14,11 +14,14 @@
 
 import math
 import os
+import re
 import subprocess
 import sys
 import warnings
 
 NO_DRIVER = (math.inf, math.inf)
+START_TAG = "_VER_START"
+END_TAG = "_VER_END"
 
 NUMBA_CHECK_VERSION_CMD = """\
 from ctypes import c_int, byref
@@ -28,7 +31,7 @@
 drv_major = dv.value // 1000
 drv_minor = (dv.value - (drv_major * 1000)) // 10
 run_major, run_minor = cuda.runtime.get_version()
-print(f'{drv_major} {drv_minor} {run_major} {run_minor}')
+print(f'_VER_START{drv_major} {drv_minor} {run_major} {run_minor}_VER_END')
 """
 
 
@@ -61,7 +64,11 @@ def get_versions():
         warnings.warn(msg, UserWarning)
         return NO_DRIVER
 
-    versions = [int(s) for s in cp.stdout.strip().split()]
+    pattern = r"_VER_START(.*?)_VER_END"
+
+    ver_str = re.search(pattern, cp.stdout.decode()).group(1)
+
+    versions = [int(s) for s in ver_str.strip().split()]
     driver_version = tuple(versions[:2])
     runtime_version = tuple(versions[2:])
 
From bd76bf6b293b7f17a846df8392c18d92ced2b40f Mon Sep 17 00:00:00 2001
From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com>
Date: Mon, 24 Jun 2024 13:43:33 -0500
Subject: [PATCH 141/340] cuDF/libcudf exponentially weighted moving averages
 (#9027)

Adds an exponentially weighted moving average aggregation to `cudf::scan`
and plumbs it up through `cudf.Series.ewm`, similar to `pandas.Series.ewm`.
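For intuition, a pure-Python sketch of the two formulations (one reading of
the mapping: `ewm_history::FINITE` behaves like pandas `adjust=True` and
`ewm_history::INFINITE` like `adjust=False`; the special null handling
described in the C++ docs is ignored here):

```python
def ewma(values, com, adjust=True):
    alpha = 1.0 / (1.0 + com)
    out = []
    if adjust:  # finite history: renormalize weights over the values seen so far
        num = den = 0.0
        for x in values:
            num = (1.0 - alpha) * num + x
            den = (1.0 - alpha) * den + 1.0
            out.append(num / den)
    else:  # infinite history: y_i = (1 - alpha) * y_{i-1} + alpha * x_i
        prev = None
        for x in values:
            prev = x if prev is None else (1.0 - alpha) * prev + alpha * x
            out.append(prev)
    return out
```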
partially resolves https://github.com/rapidsai/cudf/issues/1263 Authors: - https://github.com/brandon-b-miller - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/9027 --- cpp/CMakeLists.txt | 1 + cpp/include/cudf/aggregation.hpp | 41 ++- .../cudf/detail/aggregation/aggregation.hpp | 44 +++ cpp/src/aggregation/aggregation.cpp | 22 ++ cpp/src/reductions/scan/ewm.cu | 330 ++++++++++++++++++ cpp/src/reductions/scan/scan.cuh | 7 + cpp/src/reductions/scan/scan_inclusive.cu | 3 +- cpp/tests/CMakeLists.txt | 1 + cpp/tests/reductions/ewm_tests.cpp | 101 ++++++ .../source/user_guide/api_docs/dataframe.rst | 1 + .../source/user_guide/api_docs/series.rst | 1 + python/cudf/cudf/_lib/aggregation.pyx | 8 + .../cudf/cudf/_lib/pylibcudf/aggregation.pxd | 3 + .../cudf/cudf/_lib/pylibcudf/aggregation.pyx | 26 ++ .../_lib/pylibcudf/libcudf/aggregation.pxd | 8 + python/cudf/cudf/core/indexed_frame.py | 28 +- python/cudf/cudf/core/window/__init__.py | 4 +- python/cudf/cudf/core/window/ewm.py | 200 +++++++++++ python/cudf/cudf/core/window/rolling.py | 22 +- python/cudf/cudf/pandas/_wrappers/pandas.py | 2 +- python/cudf/cudf/tests/test_ewm.py | 46 +++ 21 files changed, 892 insertions(+), 7 deletions(-) create mode 100644 cpp/src/reductions/scan/ewm.cu create mode 100644 cpp/tests/reductions/ewm_tests.cpp create mode 100644 python/cudf/cudf/core/window/ewm.py create mode 100644 python/cudf/cudf/tests/test_ewm.py diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index aab0a9b2d49..5fd68bfb26c 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -502,6 +502,7 @@ add_library( src/reductions/product.cu src/reductions/reductions.cpp src/reductions/scan/rank_scan.cu + src/reductions/scan/ewm.cu src/reductions/scan/scan.cpp src/reductions/scan/scan_exclusive.cu src/reductions/scan/scan_inclusive.cu diff --git a/cpp/include/cudf/aggregation.hpp b/cpp/include/cudf/aggregation.hpp index d458c831f19..3c1023017be 100644 --- a/cpp/include/cudf/aggregation.hpp +++ b/cpp/include/cudf/aggregation.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -103,6 +103,7 @@ class aggregation { NUNIQUE, ///< count number of unique elements NTH_ELEMENT, ///< get the nth element ROW_NUMBER, ///< get row-number of current index (relative to rolling window) + EWMA, ///< get exponential weighted moving average at current index RANK, ///< get rank of current index COLLECT_LIST, ///< collect values into a list COLLECT_SET, ///< collect values into a list without duplicate entries @@ -250,6 +251,8 @@ class segmented_reduce_aggregation : public virtual aggregation { enum class udf_type : bool { CUDA, PTX }; /// Type of correlation method. 
 enum class correlation_type : int32_t { PEARSON, KENDALL, SPEARMAN };
+/// Type of treatment of EWM input values' first value
+enum class ewm_history : int32_t { INFINITE, FINITE };
 
 /// Factory to create a SUM aggregation
 /// @return A SUM aggregation object
@@ -411,6 +414,42 @@ std::unique_ptr<Base> make_nth_element_aggregation(
 template <typename Base = rolling_aggregation>
 std::unique_ptr<Base> make_row_number_aggregation();
 
+/**
+ * @brief Factory to create an EWMA aggregation
+ *
+ * `EWMA` returns a non-nullable column with the same type as the input,
+ * whose values are the exponentially weighted moving average of the input
+ * sequence. Let these values be known as the y_i.
+ *
+ * EWMA aggregations are parameterized by a center of mass (`com`) which
+ * affects the contribution of the previous values (y_0 ... y_{i-1}) in
+ * computing the y_i.
+ *
+ * EWMA aggregations are also parameterized by a history `cudf::ewm_history`.
+ * Special considerations have to be given to the mathematical treatment of
+ * the first value of the input sequence. There are two approaches to this,
+ * one which considers the first value of the sequence to be the exponentially
+ * weighted moving average of some infinite history of data, and one which
+ * takes the first value to be the only datapoint known. These assumptions
+ * lead to two different formulas for the y_i. `ewm_history` selects which.
+ *
+ * EWMA aggregations have special null handling. Nulls have two effects. The
+ * first is to propagate forward the last valid value as far as it has been
+ * computed. This could be thought of as the nulls not affecting the average
+ * in any way. The second effect changes the way the y_i are computed. Since
+ * a moving average is conceptually designed to weight contributing values by
+ * their recency, nulls ought to count as valid periods even though they do
+ * not change the average. For example, if the input sequence is {1, NULL, 3}
+ * then when computing y_2 one should weigh y_0 as if it occurs two periods
+ * before y_2 rather than just one.
+ *
+ * @param center_of_mass the center of mass
+ * @param history which assumption to make about the first value
+ * @return An EWMA aggregation object
+ */
+template <typename Base = aggregation>
+std::unique_ptr<Base> make_ewma_aggregation(double const center_of_mass, ewm_history history);
+
 /**
  * @brief Factory to create a RANK aggregation
  *
diff --git a/cpp/include/cudf/detail/aggregation/aggregation.hpp b/cpp/include/cudf/detail/aggregation/aggregation.hpp
index edee83783b8..843414817e3 100644
--- a/cpp/include/cudf/detail/aggregation/aggregation.hpp
+++ b/cpp/include/cudf/detail/aggregation/aggregation.hpp
@@ -76,6 +76,8 @@ class simple_aggregations_collector {  // Declares the interface for the simple
                                                           class nth_element_aggregation const& agg);
   virtual std::vector<std::unique_ptr<aggregation>> visit(data_type col_type,
                                                           class row_number_aggregation const& agg);
+  virtual std::vector<std::unique_ptr<aggregation>> visit(data_type col_type,
+                                                          class ewma_aggregation const& agg);
   virtual std::vector<std::unique_ptr<aggregation>> visit(data_type col_type,
                                                           class rank_aggregation const& agg);
   virtual std::vector<std::unique_ptr<aggregation>> visit(
@@ -141,6 +143,7 @@ class aggregation_finalizer {  // Declares the interface for the finalizer
   virtual void visit(class correlation_aggregation const& agg);
   virtual void visit(class tdigest_aggregation const& agg);
   virtual void visit(class merge_tdigest_aggregation const& agg);
+  virtual void visit(class ewma_aggregation const& agg);
 };
 
 /**
@@ -667,6 +670,40 @@ class row_number_aggregation final : public rolling_aggregation {
   void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); }
 };
 
+/**
+ * @brief Derived class for specifying an ewma aggregation
+ */
+class ewma_aggregation final : public scan_aggregation {
+ public:
+  double const center_of_mass;
+  cudf::ewm_history history;
+
+  ewma_aggregation(double const center_of_mass, cudf::ewm_history history)
+    : aggregation{EWMA}, center_of_mass{center_of_mass}, history{history}
+  {
+  }
+
+  std::unique_ptr<aggregation> clone() const override
+  {
+    return std::make_unique<ewma_aggregation>(*this);
+  }
+
+  std::vector<std::unique_ptr<aggregation>> get_simple_aggregations(
+    data_type col_type, simple_aggregations_collector& collector) const override
+  {
+    return collector.visit(col_type, *this);
+  }
+
+  bool is_equal(aggregation const& _other) const override
+  {
+    if (!this->aggregation::is_equal(_other)) { return false; }
+    auto const& other = dynamic_cast<ewma_aggregation const&>(_other);
+    return this->center_of_mass == other.center_of_mass and this->history == other.history;
+  }
+
+  void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); }
+};
+
 /**
  * @brief Derived class for specifying a rank aggregation
  */
@@ -1336,6 +1373,11 @@ struct target_type_impl<SourceType, aggregation::ROW_NUMBER> {
   using type = size_type;
 };
 
+template <typename SourceType>
+struct target_type_impl<SourceType, aggregation::EWMA> {
+  using type = double;
+};
+
 // Always use size_type accumulator for RANK
 template <typename SourceType>
 struct target_type_impl<SourceType, aggregation::RANK> {
@@ -1536,6 +1578,8 @@ CUDF_HOST_DEVICE inline decltype(auto) aggregation_dispatcher(aggregation::Kind
       return f.template operator()<aggregation::TDIGEST>(std::forward<Ts>(args)...);
     case aggregation::MERGE_TDIGEST:
       return f.template operator()<aggregation::MERGE_TDIGEST>(std::forward<Ts>(args)...);
+    case aggregation::EWMA:
+      return f.template operator()<aggregation::EWMA>(std::forward<Ts>(args)...);
     default: {
 #ifndef __CUDA_ARCH__
       CUDF_FAIL("Unsupported aggregation.");
diff --git a/cpp/src/aggregation/aggregation.cpp b/cpp/src/aggregation/aggregation.cpp
index adee9147740..5422304c5cb 100644
--- a/cpp/src/aggregation/aggregation.cpp
+++ b/cpp/src/aggregation/aggregation.cpp
@@ -154,6 +154,12 @@ std::vector<std::unique_ptr<aggregation>> simple_aggregations_collector::visit(
   return visit(col_type, static_cast<aggregation const&>(agg));
 }
 
+std::vector<std::unique_ptr<aggregation>> simple_aggregations_collector::visit(
+  data_type col_type, ewma_aggregation const& agg)
+{
+  return visit(col_type, static_cast<aggregation const&>(agg));
+}
+
 std::vector<std::unique_ptr<aggregation>> simple_aggregations_collector::visit(
   data_type col_type, rank_aggregation const& agg)
 {
@@ -333,6 +339,11 @@ void aggregation_finalizer::visit(row_number_aggregation const& agg)
   visit(static_cast<aggregation const&>(agg));
 }
 
+void aggregation_finalizer::visit(ewma_aggregation const& agg)
+{
+  visit(static_cast<aggregation const&>(agg));
+}
+
 void aggregation_finalizer::visit(rank_aggregation const& agg)
 {
   visit(static_cast<aggregation const&>(agg));
 }
@@ -665,6 +676,17 @@ std::unique_ptr<Base> make_row_number_aggregation()
 template std::unique_ptr<aggregation> make_row_number_aggregation<aggregation>();
 template std::unique_ptr<rolling_aggregation> make_row_number_aggregation<rolling_aggregation>();
 
+/// Factory to create an EWMA aggregation
+template <typename Base>
+std::unique_ptr<Base> make_ewma_aggregation(double const com, cudf::ewm_history history)
+{
+  return std::make_unique<detail::ewma_aggregation>(com, history);
+}
+template std::unique_ptr<aggregation> make_ewma_aggregation<aggregation>(double const com,
+                                                                         cudf::ewm_history history);
+template std::unique_ptr<scan_aggregation> make_ewma_aggregation<scan_aggregation>(
+  double const com, cudf::ewm_history history);
+
 /// Factory to create a RANK aggregation
 template <typename Base>
 std::unique_ptr<Base> make_rank_aggregation(rank_method method,
diff --git a/cpp/src/reductions/scan/ewm.cu b/cpp/src/reductions/scan/ewm.cu
new file mode 100644
index 00000000000..3fa2de450ad
--- /dev/null
+++ b/cpp/src/reductions/scan/ewm.cu
@@ -0,0 +1,330 @@
+/*
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "scan.cuh"
+
+#include <cudf/column/column.hpp>
+#include <cudf/column/column_device_view.cuh>
+#include <cudf/detail/aggregation/aggregation.hpp>
+#include <cudf/detail/iterator.cuh>
+#include <cudf/utilities/traits.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/device_uvector.hpp>
+#include <rmm/exec_policy.hpp>
+
+#include <cuda/functional>
+#include <thrust/iterator/zip_iterator.h>
+#include <thrust/scan.h>
+
+namespace cudf {
+namespace detail {
+
+template <typename T>
+using pair_type = thrust::pair<T, T>;
+
+/**
+ * @brief functor to be summed over in a prefix sum such that
+ * the recurrence in question is solved. See
+ * G. E. Blelloch. Prefix sums and their applications. Technical Report
+ * CMU-CS-90-190, Nov. 1990. S. 1.4
+ * for details
+ */
+template <typename T>
+class recurrence_functor {
+ public:
+  __device__ pair_type<T> operator()(pair_type<T> ci, pair_type<T> cj)
+  {
+    return {ci.first * cj.first, ci.second * cj.first + cj.second};
+  }
+};
+
+template <typename T>
+struct ewma_functor_base {
+  T beta;
+  const pair_type<T> IDENTITY{1.0, 0.0};
+};
+
+template <typename T, bool is_numerator>
+struct ewma_adjust_nulls_functor : public ewma_functor_base<T> {
+  __device__ pair_type<T> operator()(thrust::tuple<bool, size_type, T> const data)
+  {
+    // Not const to allow for updating the input value
+    auto [valid, exp, input] = data;
+    if (!valid) { return this->IDENTITY; }
+    if constexpr (not is_numerator) { input = 1; }
+
+    // The value is non-null, but nulls preceding it
+    // must adjust the second element of the pair
+    T const beta = this->beta;
+    return {beta * ((exp != 0) ? pow(beta, exp) : 1), input};
+  }
+};
+
+template <typename T, bool is_numerator>
+struct ewma_adjust_no_nulls_functor : public ewma_functor_base<T> {
+  __device__ pair_type<T> operator()(T const data)
+  {
+    T const beta = this->beta;
+    if constexpr (is_numerator) {
+      return {beta, data};
+    } else {
+      return {beta, 1.0};
+    }
+  }
+};
+
+template <typename T>
+struct ewma_noadjust_nulls_functor : public ewma_functor_base<T> {
+  /*
+  In the null case, a denominator actually has to be computed. The formula is
+  y_i = (1 - alpha) y_{i-1} + alpha x_i, but really there is a "denominator"
+  which is the sum of the weights: alpha + (1 - alpha) == 1. If a null is
+  encountered, that means that the "previous" value is downweighted by a
+  factor (for each missing value). For example with a single null:
+  data = {x_0, NULL, x_2},
+  y_2 = ((1 - alpha)**2 x_0 + alpha x_2) / (alpha + (1 - alpha)**2)
+
+  As such, the pairs must be updated before summing like the adjusted case to
+  properly downweight the previous values. But now we also need to compute
+  the normalization factors and divide the results into them at the end.
+  */
+  __device__ pair_type<T> operator()(thrust::tuple<T, size_type, bool, size_type> const data)
+  {
+    T const beta                            = this->beta;
+    auto const [input, index, valid, nullcnt] = data;
+    if (index == 0) {
+      return {beta, input};
+    } else {
+      if (!valid) { return this->IDENTITY; }
+      // preceding value is valid, return normal pair
+      if (nullcnt == 0) { return {beta, (1.0 - beta) * input}; }
+      // one or more preceding values is null, adjust by how many
+      T const factor = (1.0 - beta) + pow(beta, nullcnt + 1);
+      return {(beta * (pow(beta, nullcnt)) / factor), ((1.0 - beta) * input) / factor};
+    }
+  }
+};
+
+template <typename T>
+struct ewma_noadjust_no_nulls_functor : public ewma_functor_base<T> {
+  __device__ pair_type<T> operator()(thrust::tuple<T, size_type> const data)
+  {
+    T const beta              = this->beta;
+    auto const [input, index] = data;
+    if (index == 0) {
+      return {beta, input};
+    } else {
+      return {beta, (1.0 - beta) * input};
+    }
+  }
+};
+
+/**
+ * @brief Return an array whose values y_i are the number of null entries
+ * in between the last valid entry of the input and the current index.
+ *
+ * Example: {1, NULL, 3, 4, NULL, NULL, 7}
+ *       -> {0, 0, 1, 0, 0, 1, 2}
+ */
+rmm::device_uvector<size_type> null_roll_up(column_view const& input,
+                                            rmm::cuda_stream_view stream)
+{
+  rmm::device_uvector<size_type> output(input.size(), stream);
+
+  auto device_view = column_device_view::create(input);
+  auto invalid_it  = thrust::make_transform_iterator(
+    cudf::detail::make_validity_iterator(*device_view),
+    cuda::proclaim_return_type<int>([] __device__(int valid) -> int { return 1 - valid; }));
+
+  // valid mask {1, 0, 1, 0, 0, 1} leads to output array {0, 0, 1, 0, 1, 2}
+  thrust::inclusive_scan_by_key(rmm::exec_policy(stream),
+                                invalid_it,
+                                invalid_it + input.size() - 1,
+                                invalid_it,
+                                std::next(output.begin()));
+  return output;
+}
+
+template <typename T>
+rmm::device_uvector<T> compute_ewma_adjust(column_view const& input,
+                                           T const beta,
+                                           rmm::cuda_stream_view stream,
+                                           rmm::device_async_resource_ref mr)
+{
+  rmm::device_uvector<T> output(input.size(), stream);
+  rmm::device_uvector<pair_type<T>> pairs(input.size(), stream);
+
+  if (input.has_nulls()) {
+    rmm::device_uvector<size_type> nullcnt = null_roll_up(input, stream);
+    auto device_view = column_device_view::create(input);
+    auto valid_it    = cudf::detail::make_validity_iterator(*device_view);
+    auto data        = thrust::make_zip_iterator(
+      thrust::make_tuple(valid_it, nullcnt.begin(), input.begin<T>()));
+
+    thrust::transform_inclusive_scan(rmm::exec_policy(stream),
+                                     data,
+                                     data + input.size(),
+                                     pairs.begin(),
+                                     ewma_adjust_nulls_functor<T, true>{beta},
+                                     recurrence_functor<T>{});
+    thrust::transform(rmm::exec_policy(stream),
+                      pairs.begin(),
+                      pairs.end(),
+                      output.begin(),
+                      [] __device__(pair_type<T> pair) -> T { return pair.second; });
+
+    thrust::transform_inclusive_scan(rmm::exec_policy(stream),
+                                     data,
+                                     data + input.size(),
+                                     pairs.begin(),
+                                     ewma_adjust_nulls_functor<T, false>{beta},
+                                     recurrence_functor<T>{});
+
+  } else {
+    thrust::transform_inclusive_scan(rmm::exec_policy(stream),
+                                     input.begin<T>(),
+                                     input.end<T>(),
+                                     pairs.begin(),
+                                     ewma_adjust_no_nulls_functor<T, true>{beta},
+                                     recurrence_functor<T>{});
+    thrust::transform(rmm::exec_policy(stream),
+                      pairs.begin(),
+                      pairs.end(),
+                      output.begin(),
+                      [] __device__(pair_type<T> pair) -> T { return pair.second; });
+    auto itr = thrust::make_counting_iterator<size_type>(0);
+
+    thrust::transform_inclusive_scan(rmm::exec_policy(stream),
+                                     itr,
+                                     itr + input.size(),
+                                     pairs.begin(),
+                                     ewma_adjust_no_nulls_functor<T, false>{beta},
+                                     recurrence_functor<T>{});
+  }
+
+  thrust::transform(
+    rmm::exec_policy(stream),
+    pairs.begin(),
+    pairs.end(),
+    output.begin(),
+    output.begin(),
+    [] __device__(pair_type<T> pair, T numerator) -> T { return numerator / pair.second; });
+
+  return output;
+}
+
+template <typename T>
+rmm::device_uvector<T> compute_ewma_noadjust(column_view const& input,
+                                             T const beta,
+                                             rmm::cuda_stream_view stream,
+                                             rmm::device_async_resource_ref mr)
+{
+  rmm::device_uvector<T> output(input.size(), stream);
+  rmm::device_uvector<pair_type<T>> pairs(input.size(), stream);
+  rmm::device_uvector<size_type> nullcnt =
+    [&input, stream]() -> rmm::device_uvector<size_type> {
+    if (input.has_nulls()) {
+      return null_roll_up(input, stream);
+    } else {
+      return rmm::device_uvector<size_type>(input.size(), stream);
+    }
+  }();
+  // denominators are all 1 and do not need to be computed
+  // pairs are all (beta, (1 - beta) * x_i) except for the first one
+
+  if (!input.has_nulls()) {
+    auto data = thrust::make_zip_iterator(
+      thrust::make_tuple(input.begin<T>(), thrust::make_counting_iterator<size_type>(0)));
+    thrust::transform_inclusive_scan(rmm::exec_policy(stream),
+                                     data,
+                                     data + input.size(),
+                                     pairs.begin(),
+                                     ewma_noadjust_no_nulls_functor<T>{beta},
+                                     recurrence_functor<T>{});
+
+  } else {
+    auto device_view = column_device_view::create(input);
+    auto valid_it    = detail::make_validity_iterator(*device_view);
+
+    auto data = thrust::make_zip_iterator(thrust::make_tuple(
+      input.begin<T>(), thrust::make_counting_iterator<size_type>(0), valid_it, nullcnt.begin()));
+
+    thrust::transform_inclusive_scan(rmm::exec_policy(stream),
+                                     data,
+                                     data + input.size(),
+                                     pairs.begin(),
+                                     ewma_noadjust_nulls_functor<T>{beta},
+                                     recurrence_functor<T>());
+  }
+
+  // copy the second elements to the output for now
+  thrust::transform(rmm::exec_policy(stream),
+                    pairs.begin(),
+                    pairs.end(),
+                    output.begin(),
+                    [] __device__(pair_type<T> pair) -> T { return pair.second; });
+  return output;
+}
+
+struct ewma_functor {
+  template <typename T, CUDF_ENABLE_IF(!std::is_floating_point<T>::value)>
+  std::unique_ptr<column> operator()(scan_aggregation const& agg,
+                                     column_view const& input,
+                                     rmm::cuda_stream_view stream,
+                                     rmm::device_async_resource_ref mr)
+  {
+    CUDF_FAIL("Unsupported type for EWMA.");
+  }
+
+  template <typename T, CUDF_ENABLE_IF(std::is_floating_point<T>::value)>
+  std::unique_ptr<column> operator()(scan_aggregation const& agg,
+                                     column_view const& input,
+                                     rmm::cuda_stream_view stream,
+                                     rmm::device_async_resource_ref mr)
+  {
+    auto const ewma_agg       = dynamic_cast<ewma_aggregation const*>(&agg);
+    auto const history        = ewma_agg->history;
+    auto const center_of_mass = ewma_agg->center_of_mass;
+
+    // center of mass is easier for the user, but the recurrences are
+    // better expressed in terms of the derived parameter `beta`
+    T const beta = center_of_mass / (center_of_mass + 1.0);
+
+    auto result = [&]() {
+      if (history == cudf::ewm_history::INFINITE) {
+        return compute_ewma_adjust(input, beta, stream, mr);
+      } else {
+        return compute_ewma_noadjust(input, beta, stream, mr);
+      }
+    }();
+    return std::make_unique<column>(cudf::data_type(cudf::type_to_id<T>()),
+                                    input.size(),
+                                    result.release(),
+                                    rmm::device_buffer{},
+                                    0);
+  }
+};
+
+std::unique_ptr<column> exponentially_weighted_moving_average(column_view const& input,
+                                                              scan_aggregation const& agg,
+                                                              rmm::cuda_stream_view stream,
+                                                              rmm::device_async_resource_ref mr)
+{
+  return type_dispatcher(input.type(), ewma_functor{}, agg, input, stream, mr);
+}
+
+}  // namespace detail
+}  // namespace cudf
diff --git a/cpp/src/reductions/scan/scan.cuh b/cpp/src/reductions/scan/scan.cuh
index aeb9e516cd4..6c237741ac3 100644
--- a/cpp/src/reductions/scan/scan.cuh
+++ b/cpp/src/reductions/scan/scan.cuh
@@ -36,6 +36,12 @@ std::pair<rmm::device_buffer, size_type> mask_scan(column_view const& input_view,
                                                    rmm::cuda_stream_view stream,
                                                    rmm::device_async_resource_ref mr);
 
+// exponentially weighted moving average of the input
+std::unique_ptr<column> exponentially_weighted_moving_average(column_view const& input,
+                                                              scan_aggregation const& agg,
+                                                              rmm::cuda_stream_view stream,
+                                                              rmm::device_async_resource_ref mr);
+
 template