diff --git a/.devcontainer/cuda11.8-conda/devcontainer.json b/.devcontainer/cuda11.8-conda/devcontainer.json
index 7a1361e52c5..d86fc0e550a 100644
--- a/.devcontainer/cuda11.8-conda/devcontainer.json
+++ b/.devcontainer/cuda11.8-conda/devcontainer.json
@@ -5,17 +5,17 @@
     "args": {
       "CUDA": "11.8",
       "PYTHON_PACKAGE_MANAGER": "conda",
-      "BASE": "rapidsai/devcontainers:24.10-cpp-cuda11.8-mambaforge-ubuntu22.04"
+      "BASE": "rapidsai/devcontainers:24.12-cpp-cuda11.8-mambaforge-ubuntu22.04"
     }
   },
   "runArgs": [
     "--rm",
     "--name",
-    "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.10-cuda11.8-conda"
+    "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.12-cuda11.8-conda"
   ],
   "hostRequirements": {"gpu": "optional"},
   "features": {
-    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.10": {}
+    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.12": {}
   },
   "overrideFeatureInstallOrder": [
     "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils"
diff --git a/.devcontainer/cuda11.8-pip/devcontainer.json b/.devcontainer/cuda11.8-pip/devcontainer.json
index 64d7cd54130..66a3b22df11 100644
--- a/.devcontainer/cuda11.8-pip/devcontainer.json
+++ b/.devcontainer/cuda11.8-pip/devcontainer.json
@@ -5,17 +5,17 @@
     "args": {
       "CUDA": "11.8",
       "PYTHON_PACKAGE_MANAGER": "pip",
-      "BASE": "rapidsai/devcontainers:24.10-cpp-cuda11.8-ubuntu22.04"
+      "BASE": "rapidsai/devcontainers:24.12-cpp-cuda11.8-ubuntu22.04"
     }
   },
   "runArgs": [
     "--rm",
     "--name",
-    "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.10-cuda11.8-pip"
+    "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.12-cuda11.8-pip"
   ],
   "hostRequirements": {"gpu": "optional"},
   "features": {
-    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.10": {}
+    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.12": {}
   },
   "overrideFeatureInstallOrder": [
     "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils"
diff --git a/.devcontainer/cuda12.5-conda/devcontainer.json b/.devcontainer/cuda12.5-conda/devcontainer.json
index c1924243506..2a195c6c81d 100644
--- a/.devcontainer/cuda12.5-conda/devcontainer.json
+++ b/.devcontainer/cuda12.5-conda/devcontainer.json
@@ -5,17 +5,17 @@
     "args": {
       "CUDA": "12.5",
       "PYTHON_PACKAGE_MANAGER": "conda",
-      "BASE": "rapidsai/devcontainers:24.10-cpp-mambaforge-ubuntu22.04"
+      "BASE": "rapidsai/devcontainers:24.12-cpp-mambaforge-ubuntu22.04"
     }
   },
   "runArgs": [
     "--rm",
     "--name",
-    "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.10-cuda12.5-conda"
+    "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.12-cuda12.5-conda"
   ],
   "hostRequirements": {"gpu": "optional"},
   "features": {
-    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.10": {}
+    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.12": {}
   },
   "overrideFeatureInstallOrder": [
     "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils"
diff --git a/.devcontainer/cuda12.5-pip/devcontainer.json b/.devcontainer/cuda12.5-pip/devcontainer.json
index beab2940176..125c85cefa9 100644
--- a/.devcontainer/cuda12.5-pip/devcontainer.json
+++ b/.devcontainer/cuda12.5-pip/devcontainer.json
@@ -5,17 +5,17 @@
     "args": {
       "CUDA": "12.5",
       "PYTHON_PACKAGE_MANAGER": "pip",
-      "BASE": "rapidsai/devcontainers:24.10-cpp-cuda12.5-ubuntu22.04"
+      "BASE": "rapidsai/devcontainers:24.12-cpp-cuda12.5-ubuntu22.04"
     }
   },
   "runArgs": [
     "--rm",
     "--name",
-    "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.10-cuda12.5-pip"
+    "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.12-cuda12.5-pip"
   ],
   "hostRequirements": {"gpu": "optional"},
   "features": {
-    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.10": {}
+    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.12": {}
   },
   "overrideFeatureInstallOrder": [
     "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils"
diff --git a/.github/labeler.yml b/.github/labeler.yml
index 90cdda4d3ca..8506d38a048 100644
--- a/.github/labeler.yml
+++ b/.github/labeler.yml
@@ -12,7 +12,7 @@ cudf.polars:
   - 'python/cudf_polars/**'
 
 pylibcudf:
-  - 'python/cudf/pylibcudf/**'
+  - 'python/pylibcudf/**'
 
 libcudf:
   - 'cpp/**'
diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index d6d3e3fdd33..c034752d373 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -28,7 +28,7 @@ concurrency:
 jobs:
   cpp-build:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.12
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -37,7 +37,7 @@ jobs:
   python-build:
     needs: [cpp-build]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.12
    with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -46,7 +46,7 @@ jobs:
   upload-conda:
     needs: [cpp-build, python-build]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.12
    with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -57,19 +57,19 @@ jobs:
     if: github.ref_type == 'branch'
     needs: python-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.12
     with:
       arch: "amd64"
       branch: ${{ inputs.branch }}
       build_type: ${{ inputs.build_type || 'branch' }}
-      container_image: "rapidsai/ci-conda:latest"
+      container_image: "rapidsai/ci-conda:cuda12.5.1-ubuntu22.04-py3.11"
       date: ${{ inputs.date }}
       node_type: "gpu-v100-latest-1"
       run_script: "ci/build_docs.sh"
       sha: ${{ inputs.sha }}
   wheel-build-libcudf:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.12
     with:
       # build for every combination of arch and CUDA version, but only for the latest Python
       matrix_filter: group_by([.ARCH, (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by(.PY_VER|split(".")|map(tonumber)))
@@ -81,7 +81,7 @@ jobs:
   wheel-publish-libcudf:
     needs: wheel-build-libcudf
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.12
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -92,7 +92,7 @@ jobs:
   wheel-build-pylibcudf:
     needs: [wheel-publish-libcudf]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.12
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -102,7 +102,7 @@ jobs:
   wheel-publish-pylibcudf:
     needs: wheel-build-pylibcudf
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.12
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -113,7 +113,7 @@ jobs:
   wheel-build-cudf:
     needs: wheel-publish-pylibcudf
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.12
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -123,7 +123,7 @@ jobs:
   wheel-publish-cudf:
     needs: wheel-build-cudf
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.12
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -134,7 +134,7 @@ jobs:
   wheel-build-dask-cudf:
     needs: wheel-publish-cudf
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.12
     with:
       # This selects "ARCH=amd64 + the latest supported Python + CUDA".
       matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
@@ -146,7 +146,7 @@ jobs:
   wheel-publish-dask-cudf:
     needs: wheel-build-dask-cudf
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.12
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -157,7 +157,7 @@ jobs:
   wheel-build-cudf-polars:
     needs: wheel-publish-pylibcudf
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.12
     with:
       # This selects "ARCH=amd64 + the latest supported Python + CUDA".
       matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
@@ -169,7 +169,7 @@ jobs:
   wheel-publish-cudf-polars:
     needs: wheel-build-cudf-polars
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.12
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
diff --git a/.github/workflows/pandas-tests.yaml b/.github/workflows/pandas-tests.yaml
index d670132cca9..c676032779f 100644
--- a/.github/workflows/pandas-tests.yaml
+++ b/.github/workflows/pandas-tests.yaml
@@ -17,7 +17,7 @@ jobs:
   pandas-tests:
     # run the Pandas unit tests
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.12
     with:
       # This selects "ARCH=amd64 + the latest supported Python + CUDA".
       matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index a4a8f036174..7ec48eb7817 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -30,6 +30,7 @@ jobs:
       - wheel-tests-cudf
       - wheel-build-cudf-polars
       - wheel-tests-cudf-polars
+      - cudf-polars-polars-tests
       - wheel-build-dask-cudf
       - wheel-tests-dask-cudf
       - devcontainer
@@ -37,7 +38,7 @@ jobs:
       - pandas-tests
       - pandas-tests-diff
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.12
     if: always()
     with:
       needs: ${{ toJSON(needs) }}
@@ -52,7 +53,7 @@ jobs:
     steps:
       - name: Get PR info
         id: get-pr-info
-        uses: rapidsai/shared-actions/get-pr-info@main
+        uses: nv-gha-runners/get-pr-info@main
       - name: Checkout code repo
         uses: actions/checkout@v4
         with:
@@ -104,39 +105,39 @@ jobs:
           - '!notebooks/**'
   checks:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.12
     with:
       enable_check_generated_files: false
   conda-cpp-build:
     needs: checks
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.12
     with:
       build_type: pull-request
   conda-cpp-checks:
     needs: conda-cpp-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.12
     with:
       build_type: pull-request
       enable_check_symbols: true
   conda-cpp-tests:
     needs: [conda-cpp-build, changed-files]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.12
     if: needs.changed-files.outputs.test_cpp == 'true'
     with:
       build_type: pull-request
   conda-python-build:
     needs: conda-cpp-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.12
     with:
       build_type: pull-request
   conda-python-cudf-tests:
     needs: [conda-python-build, changed-files]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.12
     if: needs.changed-files.outputs.test_python == 'true'
     with:
       build_type: pull-request
@@ -145,7 +146,7 @@ jobs:
     # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism
     needs: [conda-python-build, changed-files]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.12
     if: needs.changed-files.outputs.test_python == 'true'
     with:
       build_type: pull-request
@@ -153,18 +154,18 @@ jobs:
   conda-java-tests:
     needs: [conda-cpp-build, changed-files]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.12
     if: needs.changed-files.outputs.test_java == 'true'
     with:
       build_type: pull-request
       node_type: "gpu-v100-latest-1"
       arch: "amd64"
-      container_image: "rapidsai/ci-conda:latest"
+      container_image: "rapidsai/ci-conda:cuda12.5.1-ubuntu22.04-py3.11"
       run_script: "ci/test_java.sh"
   static-configure:
     needs: checks
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.12
     with:
       build_type: pull-request
       # Use the wheel container so we can skip conda solves and since our
@@ -174,28 +175,28 @@ jobs:
   conda-notebook-tests:
     needs: [conda-python-build, changed-files]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.12
     if: needs.changed-files.outputs.test_notebooks == 'true'
     with:
       build_type: pull-request
       node_type: "gpu-v100-latest-1"
       arch: "amd64"
-      container_image: "rapidsai/ci-conda:latest"
+      container_image: "rapidsai/ci-conda:cuda12.5.1-ubuntu22.04-py3.11"
       run_script: "ci/test_notebooks.sh"
   docs-build:
     needs: conda-python-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.12
     with:
       build_type: pull-request
       node_type: "gpu-v100-latest-1"
       arch: "amd64"
-      container_image: "rapidsai/ci-conda:latest"
+      container_image: "rapidsai/ci-conda:cuda12.5.1-ubuntu22.04-py3.11"
       run_script: "ci/build_docs.sh"
   wheel-build-libcudf:
     needs: checks
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.12
     with:
       # build for every combination of arch and CUDA version, but only for the latest Python
       matrix_filter: group_by([.ARCH, (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by(.PY_VER|split(".")|map(tonumber)))
@@ -204,21 +205,21 @@ jobs:
   wheel-build-pylibcudf:
     needs: [checks, wheel-build-libcudf]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.12
     with:
       build_type: pull-request
       script: "ci/build_wheel_pylibcudf.sh"
   wheel-build-cudf:
     needs: wheel-build-pylibcudf
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.12
     with:
       build_type: pull-request
       script: "ci/build_wheel_cudf.sh"
   wheel-tests-cudf:
     needs: [wheel-build-cudf, changed-files]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.12
     if: needs.changed-files.outputs.test_python == 'true'
     with:
       build_type: pull-request
@@ -226,7 +227,7 @@ jobs:
   wheel-build-cudf-polars:
     needs: wheel-build-pylibcudf
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.12
     with:
       # This selects "ARCH=amd64 + the latest supported Python + CUDA".
       matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
@@ -235,7 +236,7 @@ jobs:
   wheel-tests-cudf-polars:
     needs: [wheel-build-cudf-polars, changed-files]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.12
     if: needs.changed-files.outputs.test_python == 'true'
     with:
       # This selects "ARCH=amd64 + the latest supported Python + CUDA".
@@ -244,10 +245,21 @@ jobs:
       # This always runs, but only fails if this PR touches code in
       # pylibcudf or cudf_polars
       script: "ci/test_wheel_cudf_polars.sh"
+  cudf-polars-polars-tests:
+    needs: wheel-build-cudf-polars
+    secrets: inherit
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.12
+    with:
+      # This selects "ARCH=amd64 + the latest supported Python + CUDA".
+      matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
+      build_type: pull-request
+      # This always runs, but only fails if this PR touches code in
+      # pylibcudf or cudf_polars
+      script: "ci/test_cudf_polars_polars_tests.sh"
   wheel-build-dask-cudf:
     needs: wheel-build-cudf
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.12
     with:
       # This selects "ARCH=amd64 + the latest supported Python + CUDA".
       matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
@@ -256,7 +268,7 @@ jobs:
   wheel-tests-dask-cudf:
     needs: [wheel-build-dask-cudf, changed-files]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.12
     if: needs.changed-files.outputs.test_python == 'true'
     with:
       # This selects "ARCH=amd64 + the latest supported Python + CUDA".
@@ -265,7 +277,7 @@ jobs:
       script: ci/test_wheel_dask_cudf.sh
   devcontainer:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.12
     with:
       arch: '["amd64"]'
       cuda: '["12.5"]'
@@ -276,7 +288,7 @@ jobs:
   unit-tests-cudf-pandas:
     needs: [wheel-build-cudf, changed-files]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.12
     if: needs.changed-files.outputs.test_python == 'true'
     with:
       # This selects "ARCH=amd64 + the latest supported Python + CUDA".
@@ -287,7 +299,7 @@ jobs:
     # run the Pandas unit tests using PR branch
     needs: [wheel-build-cudf, changed-files]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.12
     if: needs.changed-files.outputs.test_python == 'true'
     with:
       # This selects "ARCH=amd64 + the latest supported Python + CUDA".
@@ -299,7 +311,7 @@ jobs:
   pandas-tests-diff:
     # diff the results of running the Pandas unit tests and publish a job summary
     needs: pandas-tests
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.12
     with:
       node_type: cpu4
       build_type: pull-request
diff --git a/.github/workflows/pr_issue_status_automation.yml b/.github/workflows/pr_issue_status_automation.yml
index fe77ad4b6b2..af8d1289ea1 100644
--- a/.github/workflows/pr_issue_status_automation.yml
+++ b/.github/workflows/pr_issue_status_automation.yml
@@ -23,7 +23,7 @@ on:
 jobs:
   get-project-id:
-    uses: rapidsai/shared-workflows/.github/workflows/project-get-item-id.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/project-get-item-id.yaml@branch-24.12
     if: github.event.pull_request.state == 'open'
     secrets: inherit
     permissions:
@@ -34,7 +34,7 @@ jobs:
   update-status:
     # This job sets the PR and its linked issues to "In Progress" status
-    uses: rapidsai/shared-workflows/.github/workflows/project-get-set-single-select-field.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/project-get-set-single-select-field.yaml@branch-24.12
     if: ${{ github.event.pull_request.state == 'open' && needs.get-project-id.outputs.ITEM_PROJECT_ID != '' }}
     needs: get-project-id
     with:
@@ -50,7 +50,7 @@ jobs:
   update-sprint:
     # This job sets the PR and its linked issues to the current "Weekly Sprint"
-    uses: rapidsai/shared-workflows/.github/workflows/project-get-set-iteration-field.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/project-get-set-iteration-field.yaml@branch-24.12
     if: ${{ github.event.pull_request.state == 'open' && needs.get-project-id.outputs.ITEM_PROJECT_ID != '' }}
     needs: get-project-id
     with:
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index 4af6a0d690d..a22d3c5b9cc 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -16,7 +16,7 @@ on:
 jobs:
   conda-cpp-checks:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.12
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -25,7 +25,7 @@ jobs:
       enable_check_symbols: true
   conda-cpp-tests:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.12
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -33,7 +33,7 @@ jobs:
       sha: ${{ inputs.sha }}
   conda-cpp-memcheck-tests:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.12
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -41,11 +41,11 @@ jobs:
       sha: ${{ inputs.sha }}
       node_type: "gpu-v100-latest-1"
       arch: "amd64"
-      container_image: "rapidsai/ci-conda:latest"
+      container_image: "rapidsai/ci-conda:cuda12.5.1-ubuntu22.04-py3.11"
       run_script: "ci/test_cpp_memcheck.sh"
   static-configure:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.12
     with:
       build_type: pull-request
       # Use the wheel container so we can skip conda solves and since our
@@ -54,7 +54,7 @@ jobs:
       run_script: "ci/configure_cpp_static.sh"
   conda-python-cudf-tests:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.12
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -64,7 +64,7 @@ jobs:
   conda-python-other-tests:
     # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.12
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -73,7 +73,7 @@ jobs:
       script: "ci/test_python_other.sh"
   conda-java-tests:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.12
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -81,11 +81,11 @@ jobs:
       sha: ${{ inputs.sha }}
       node_type: "gpu-v100-latest-1"
       arch: "amd64"
-      container_image: "rapidsai/ci-conda:latest"
+      container_image: "rapidsai/ci-conda:cuda12.5.1-ubuntu22.04-py3.11"
       run_script: "ci/test_java.sh"
   conda-notebook-tests:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.12
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -93,11 +93,11 @@ jobs:
       sha: ${{ inputs.sha }}
       node_type: "gpu-v100-latest-1"
       arch: "amd64"
-      container_image: "rapidsai/ci-conda:latest"
+      container_image: "rapidsai/ci-conda:cuda12.5.1-ubuntu22.04-py3.11"
       run_script: "ci/test_notebooks.sh"
   wheel-tests-cudf:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.12
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -106,7 +106,7 @@ jobs:
       script: ci/test_wheel_cudf.sh
   wheel-tests-dask-cudf:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.12
     with:
       # This selects "ARCH=amd64 + the latest supported Python + CUDA".
       matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
@@ -117,7 +117,7 @@ jobs:
       script: ci/test_wheel_dask_cudf.sh
   unit-tests-cudf-pandas:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.12
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -126,7 +126,7 @@ jobs:
       script: ci/cudf_pandas_scripts/run_tests.sh
   third-party-integration-tests-cudf-pandas:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.12
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
diff --git a/README.md b/README.md
index 8f8c2adac2f..169d2e4eded 100644
--- a/README.md
+++ b/README.md
@@ -83,7 +83,7 @@ cuDF can be installed with conda (via [miniforge](https://github.com/conda-forge
 
 ```bash
 conda install -c rapidsai -c conda-forge -c nvidia \
-    cudf=24.10 python=3.12 cuda-version=12.5
+    cudf=24.12 python=3.12 cuda-version=12.5
 ```
 
 We also provide [nightly Conda packages](https://anaconda.org/rapidsai-nightly) built from the HEAD
diff --git a/VERSION b/VERSION
index 7c7ba04436f..af28c42b528 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-24.10.00
+24.12.00
diff --git a/build.sh b/build.sh
index 211e1db9fbf..69d6481af42 100755
--- a/build.sh
+++ b/build.sh
@@ -239,11 +239,6 @@ if hasArg --pydevelop; then
     PYTHON_ARGS_FOR_INSTALL="${PYTHON_ARGS_FOR_INSTALL} -e"
 fi
 
-# Append `-DFIND_CUDF_CPP=ON` to EXTRA_CMAKE_ARGS unless a user specified the option.
-if [[ "${EXTRA_CMAKE_ARGS}" != *"DFIND_CUDF_CPP"* ]]; then
-    EXTRA_CMAKE_ARGS="${EXTRA_CMAKE_ARGS} -DFIND_CUDF_CPP=ON"
-fi
-
 if hasArg --disable_large_strings; then
     BUILD_DISABLE_LARGE_STRINGS="ON"
 fi
diff --git a/ci/cudf_pandas_scripts/pandas-tests/job-summary.py b/ci/cudf_pandas_scripts/pandas-tests/job-summary.py
index 93a815838b7..7a12db927e5 100644
--- a/ci/cudf_pandas_scripts/pandas-tests/job-summary.py
+++ b/ci/cudf_pandas_scripts/pandas-tests/job-summary.py
@@ -68,8 +68,18 @@ def emoji_failed(x):
 pr_df = pd.DataFrame.from_dict(pr_results, orient="index").sort_index()
 main_df = pd.DataFrame.from_dict(main_results, orient="index").sort_index()
 diff_df = pr_df - main_df
+total_usage = pr_df['_slow_function_call'] + pr_df['_fast_function_call']
+pr_df['CPU Usage'] = ((pr_df['_slow_function_call']/total_usage)*100.0).round(1)
+pr_df['GPU Usage'] = ((pr_df['_fast_function_call']/total_usage)*100.0).round(1)
 
-pr_df = pr_df[["total", "passed", "failed", "skipped"]]
+cpu_usage_mean = pr_df['CPU Usage'].mean().round(2)
+gpu_usage_mean = pr_df['GPU Usage'].mean().round(2)
+
+# Add '%' suffix to 'CPU Usage' and 'GPU Usage' columns
+pr_df['CPU Usage'] = pr_df['CPU Usage'].fillna(0).astype(str) + '%'
+pr_df['GPU Usage'] = pr_df['GPU Usage'].fillna(0).astype(str) + '%'
+
+pr_df = pr_df[["total", "passed", "failed", "skipped", 'CPU Usage', 'GPU Usage']]
 diff_df = diff_df[["total", "passed", "failed", "skipped"]]
 diff_df.columns = diff_df.columns + "_diff"
 diff_df["passed_diff"] = diff_df["passed_diff"].map(emoji_passed)
@@ -95,6 +105,8 @@ def emoji_failed(x):
 print(comment)
 print()
+print(f"Average CPU and GPU usage for the tests: {cpu_usage_mean}% and {gpu_usage_mean}%")
+print()
 print("Here are the results of running the Pandas tests against this PR:")
 print()
 print(df.to_markdown())
diff --git a/ci/cudf_pandas_scripts/run_tests.sh b/ci/cudf_pandas_scripts/run_tests.sh
index c6228a4ef33..f6bdc6f9484 100755
--- a/ci/cudf_pandas_scripts/run_tests.sh
+++ b/ci/cudf_pandas_scripts/run_tests.sh
@@ -56,10 +56,10 @@ else
     echo "" > ./constraints.txt
     if [[ $RAPIDS_DEPENDENCIES == "oldest" ]]; then
-        # `test_python` constraints are for `[test]` not `[cudf-pandas-tests]`
+        # `test_python_cudf_pandas` constraints are for `[test]` not `[cudf-pandas-tests]`
         rapids-dependency-file-generator \
           --output requirements \
-          --file-key test_python \
+          --file-key test_python_cudf_pandas \
           --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION};dependencies=${RAPIDS_DEPENDENCIES}" \
           | tee ./constraints.txt
     fi
diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh
index be55b49870f..f73e88bc0c8 100755
--- a/ci/release/update-version.sh
+++ b/ci/release/update-version.sh
@@ -25,9 +25,9 @@ NEXT_PATCH=$(echo $NEXT_FULL_TAG | awk '{split($0, a, "."); print a[3]}')
 NEXT_SHORT_TAG=${NEXT_MAJOR}.${NEXT_MINOR}
 
 # Need to distutils-normalize the versions for some use cases
-CURRENT_SHORT_TAG_PEP440=$(python -c "from setuptools.extern import packaging; print(packaging.version.Version('${CURRENT_SHORT_TAG}'))")
-NEXT_SHORT_TAG_PEP440=$(python -c "from setuptools.extern import packaging; print(packaging.version.Version('${NEXT_SHORT_TAG}'))")
-PATCH_PEP440=$(python -c "from setuptools.extern import packaging; print(packaging.version.Version('${NEXT_PATCH}'))")
+CURRENT_SHORT_TAG_PEP440=$(python -c "from packaging.version import Version; print(Version('${CURRENT_SHORT_TAG}'))")
+NEXT_SHORT_TAG_PEP440=$(python -c "from packaging.version import Version; print(Version('${NEXT_SHORT_TAG}'))")
+PATCH_PEP440=$(python -c "from packaging.version import Version; print(Version('${NEXT_PATCH}'))")
 
 echo "Preparing release $CURRENT_TAG => $NEXT_FULL_TAG"
 
@@ -45,6 +45,8 @@ sed_runner "s/branch-.*/branch-${NEXT_SHORT_TAG}/g" ci/test_wheel_dask_cudf.sh
 DEPENDENCIES=(
   cudf
   cudf_kafka
+  cugraph
+  cuml
   custreamz
   dask-cuda
   dask-cudf
@@ -57,7 +59,7 @@ DEPENDENCIES=(
   rmm
 )
 for DEP in "${DEPENDENCIES[@]}"; do
-  for FILE in dependencies.yaml conda/environments/*.yaml; do
+  for FILE in dependencies.yaml conda/environments/*.yaml python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml; do
     sed_runner "/-.* ${DEP}\(-cu[[:digit:]]\{2\}\)\{0,1\}==/ s/==.*/==${NEXT_SHORT_TAG_PEP440}.*,>=0.0.0a0/g" "${FILE}"
   done
   for FILE in python/*/pyproject.toml; do
diff --git a/ci/run_cudf_polars_polars_tests.sh b/ci/run_cudf_polars_polars_tests.sh
new file mode 100755
index 00000000000..52a827af94c
--- /dev/null
+++ b/ci/run_cudf_polars_polars_tests.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+set -euo pipefail
+
+# Support invoking run_cudf_polars_polars_tests.sh outside the script directory
+# Assumption: polars has been cloned in the root of the repo.
+cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../polars/ + +DESELECTED_TESTS=( + "tests/unit/test_polars_import.py::test_polars_import" # relies on a polars built in place + "tests/unit/streaming/test_streaming_sort.py::test_streaming_sort[True]" # relies on polars built in debug mode + "tests/unit/test_cpu_check.py::test_check_cpu_flags_skipped_no_flags" # Mock library error + "tests/docs/test_user_guide.py" # No dot binary in CI image +) + +DESELECTED_TESTS=$(printf -- " --deselect %s" "${DESELECTED_TESTS[@]}") +python -m pytest \ + --import-mode=importlib \ + --cache-clear \ + -m "" \ + -p cudf_polars.testing.plugin \ + -v \ + --tb=short \ + ${DESELECTED_TESTS} \ + "$@" \ + py-polars/tests diff --git a/ci/test_cudf_polars_polars_tests.sh b/ci/test_cudf_polars_polars_tests.sh new file mode 100755 index 00000000000..6c728a9537f --- /dev/null +++ b/ci/test_cudf_polars_polars_tests.sh @@ -0,0 +1,69 @@ +#!/bin/bash +# Copyright (c) 2024, NVIDIA CORPORATION. + +set -eou pipefail + +# We will only fail these tests if the PR touches code in pylibcudf +# or cudf_polars itself. +# Note, the three dots mean we are doing diff between the merge-base +# of upstream and HEAD. So this is asking, "does _this branch_ touch +# files in cudf_polars/pylibcudf", rather than "are there changes +# between upstream and this branch which touch cudf_polars/pylibcudf" +# TODO: is the target branch exposed anywhere in an environment variable? +if [ -n "$(git diff --name-only origin/branch-24.10...HEAD -- python/cudf_polars/ python/cudf/cudf/_lib/pylibcudf/)" ]; +then + HAS_CHANGES=1 + rapids-logger "PR has changes in cudf-polars/pylibcudf, test fails treated as failure" +else + HAS_CHANGES=0 + rapids-logger "PR does not have changes in cudf-polars/pylibcudf, test fails NOT treated as failure" +fi + +rapids-logger "Download wheels" + +RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" +RAPIDS_PY_WHEEL_NAME="cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-download-wheels-from-s3 ./dist + +# Download the pylibcudf built in the previous step +RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-pylibcudf-dep + +rapids-logger "Install pylibcudf" +python -m pip install ./local-pylibcudf-dep/pylibcudf*.whl + +rapids-logger "Install cudf_polars" +python -m pip install $(echo ./dist/cudf_polars*.whl) + +# TAG=$(python -c 'import polars; print(f"py-{polars.__version__}")') +TAG="py-1.7.0" +rapids-logger "Clone polars to ${TAG}" +git clone https://github.com/pola-rs/polars.git --branch ${TAG} --depth 1 + +# Install requirements for running polars tests +rapids-logger "Install polars test requirements" +python -m pip install -r polars/py-polars/requirements-dev.txt -r polars/py-polars/requirements-ci.txt + +function set_exitcode() +{ + EXITCODE=$? 
+}
+EXITCODE=0
+trap set_exitcode ERR
+set +e
+
+rapids-logger "Run polars tests"
+./ci/run_cudf_polars_polars_tests.sh
+
+trap ERR
+set -e
+
+if [ ${EXITCODE} != 0 ]; then
+  rapids-logger "Running polars test suite FAILED: exitcode ${EXITCODE}"
+else
+  rapids-logger "Running polars test suite PASSED"
+fi
+
+if [ ${HAS_CHANGES} == 1 ]; then
+  exit ${EXITCODE}
+else
+  exit 0
+fi
diff --git a/ci/test_python_common.sh b/ci/test_python_common.sh
index d0675b0431a..dc70661a17a 100755
--- a/ci/test_python_common.sh
+++ b/ci/test_python_common.sh
@@ -10,10 +10,10 @@ set -euo pipefail
 rapids-logger "Generate Python testing dependencies"
 
 ENV_YAML_DIR="$(mktemp -d)"
-
+FILE_KEY=$1
 rapids-dependency-file-generator \
   --output conda \
-  --file-key test_python \
+  --file-key ${FILE_KEY} \
   --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION};dependencies=${RAPIDS_DEPENDENCIES}" \
   | tee "${ENV_YAML_DIR}/env.yaml"
diff --git a/ci/test_python_cudf.sh b/ci/test_python_cudf.sh
index ae34047e87f..2386414b32e 100755
--- a/ci/test_python_cudf.sh
+++ b/ci/test_python_cudf.sh
@@ -5,7 +5,7 @@
 cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../;
 
 # Common setup steps shared by Python test jobs
-source ./ci/test_python_common.sh
+source ./ci/test_python_common.sh test_python_cudf
 
 rapids-logger "Check GPU usage"
 nvidia-smi
diff --git a/ci/test_python_other.sh b/ci/test_python_other.sh
index 06a24773cae..67c97ad29a5 100755
--- a/ci/test_python_other.sh
+++ b/ci/test_python_other.sh
@@ -5,7 +5,7 @@
 cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../
 
 # Common setup steps shared by Python test jobs
-source ./ci/test_python_common.sh
+source ./ci/test_python_common.sh test_python_other
 
 rapids-mamba-retry install \
   --channel "${CPP_CHANNEL}" \
diff --git a/ci/test_wheel_cudf.sh b/ci/test_wheel_cudf.sh
index 28ded2f8e0f..a701bfe15e0 100755
--- a/ci/test_wheel_cudf.sh
+++ b/ci/test_wheel_cudf.sh
@@ -39,6 +39,7 @@ rapids-logger "pytest pylibcudf"
 pushd python/pylibcudf/pylibcudf/tests
 python -m pytest \
   --cache-clear \
+  --numprocesses=8 \
   --dist=worksteal \
   .
 popd
diff --git a/ci/test_wheel_cudf_polars.sh b/ci/test_wheel_cudf_polars.sh
index 9844090258a..a36e8734adc 100755
--- a/ci/test_wheel_cudf_polars.sh
+++ b/ci/test_wheel_cudf_polars.sh
@@ -10,13 +10,17 @@ set -eou pipefail
 # files in cudf_polars/pylibcudf", rather than "are there changes
 # between upstream and this branch which touch cudf_polars/pylibcudf"
 # TODO: is the target branch exposed anywhere in an environment variable?
-if [ -n "$(git diff --name-only origin/branch-24.10...HEAD -- python/cudf_polars/ python/pylibcudf/)" ]; +if [ -n "$(git diff --name-only origin/branch-24.12...HEAD -- python/cudf_polars/ python/pylibcudf/)" ]; then HAS_CHANGES=1 + rapids-logger "PR has changes in cudf-polars/pylibcudf, test fails treated as failure" else HAS_CHANGES=0 + rapids-logger "PR does not have changes in cudf-polars/pylibcudf, test fails NOT treated as failure" fi +rapids-logger "Download wheels" + RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" RAPIDS_PY_WHEEL_NAME="cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-download-wheels-from-s3 python ./dist @@ -43,6 +47,9 @@ python -m pip install \ "$(echo ./dist/libcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" \ "$(echo ./dist/pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" +rapids-logger "Pin to 1.7.0 Temporarily" +python -m pip install polars==1.7.0 + rapids-logger "Run cudf_polars tests" function set_exitcode() diff --git a/ci/test_wheel_dask_cudf.sh b/ci/test_wheel_dask_cudf.sh index 0d39807d56c..361a42ccda9 100755 --- a/ci/test_wheel_dask_cudf.sh +++ b/ci/test_wheel_dask_cudf.sh @@ -41,6 +41,7 @@ pushd python/dask_cudf/dask_cudf DASK_DATAFRAME__QUERY_PLANNING=True python -m pytest \ --junitxml="${RAPIDS_TESTS_DIR}/junit-dask-cudf.xml" \ --numprocesses=8 \ + --dist=worksteal \ . popd @@ -50,5 +51,6 @@ pushd python/dask_cudf/dask_cudf DASK_DATAFRAME__QUERY_PLANNING=False python -m pytest \ --junitxml="${RAPIDS_TESTS_DIR}/junit-dask-cudf-legacy.xml" \ --numprocesses=8 \ + --dist=worksteal \ . popd diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index c96e8706d27..f91bf1e7046 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -26,12 +26,12 @@ dependencies: - cupy>=12.0.0 - cxx-compiler - cython>=3.0.3 -- dask-cuda==24.10.*,>=0.0.0a0 +- dask-cuda==24.12.*,>=0.0.0a0 - dlpack>=0.8,<1.0 - doxygen=1.9.1 - fastavro>=0.22.9 - flatbuffers==24.3.25 -- fmt>=10.1.1,<11 +- fmt>=11.0.2,<12 - fsspec>=0.6.0 - gcc_linux-64=11.* - hypothesis @@ -42,9 +42,9 @@ dependencies: - libcufile=1.4.0.31 - libcurand-dev=10.3.0.86 - libcurand=10.3.0.86 -- libkvikio==24.10.*,>=0.0.0a0 +- libkvikio==24.12.*,>=0.0.0a0 - librdkafka>=2.5.0,<2.6.0a0 -- librmm==24.10.*,>=0.0.0a0 +- librmm==24.12.*,>=0.0.0a0 - make - moto>=4.0.8 - msgpack-python @@ -78,13 +78,13 @@ dependencies: - python>=3.10,<3.13 - pytorch>=2.1.0 - rapids-build-backend>=0.3.0,<0.4.0.dev0 -- rapids-dask-dependency==24.10.*,>=0.0.0a0 +- rapids-dask-dependency==24.12.*,>=0.0.0a0 - rich -- rmm==24.10.*,>=0.0.0a0 +- rmm==24.12.*,>=0.0.0a0 - s3fs>=2022.3.0 - scikit-build-core>=0.10.0 - scipy -- spdlog>=1.12.0,<1.13 +- spdlog>=1.14.1,<1.15 - sphinx - sphinx-autobuild - sphinx-copybutton diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index e54a44d9f6e..f4ec6bd5407 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -27,12 +27,12 @@ dependencies: - cupy>=12.0.0 - cxx-compiler - cython>=3.0.3 -- dask-cuda==24.10.*,>=0.0.0a0 +- dask-cuda==24.12.*,>=0.0.0a0 - dlpack>=0.8,<1.0 - doxygen=1.9.1 - fastavro>=0.22.9 - flatbuffers==24.3.25 -- fmt>=10.1.1,<11 +- fmt>=11.0.2,<12 - fsspec>=0.6.0 - gcc_linux-64=11.* - hypothesis @@ -41,9 +41,9 @@ dependencies: - jupyter_client - libcufile-dev - libcurand-dev -- libkvikio==24.10.*,>=0.0.0a0 +- 
libkvikio==24.12.*,>=0.0.0a0 - librdkafka>=2.5.0,<2.6.0a0 -- librmm==24.10.*,>=0.0.0a0 +- librmm==24.12.*,>=0.0.0a0 - make - moto>=4.0.8 - msgpack-python @@ -76,13 +76,13 @@ dependencies: - python>=3.10,<3.13 - pytorch>=2.1.0 - rapids-build-backend>=0.3.0,<0.4.0.dev0 -- rapids-dask-dependency==24.10.*,>=0.0.0a0 +- rapids-dask-dependency==24.12.*,>=0.0.0a0 - rich -- rmm==24.10.*,>=0.0.0a0 +- rmm==24.12.*,>=0.0.0a0 - s3fs>=2022.3.0 - scikit-build-core>=0.10.0 - scipy -- spdlog>=1.12.0,<1.13 +- spdlog>=1.14.1,<1.15 - sphinx - sphinx-autobuild - sphinx-copybutton diff --git a/conda/recipes/libcudf/conda_build_config.yaml b/conda/recipes/libcudf/conda_build_config.yaml index 33fa4b4eccf..dc75eb4b252 100644 --- a/conda/recipes/libcudf/conda_build_config.yaml +++ b/conda/recipes/libcudf/conda_build_config.yaml @@ -26,13 +26,13 @@ librdkafka_version: - ">=2.5.0,<2.6.0a0" fmt_version: - - ">=10.1.1,<11" + - ">=11.0.2,<12" flatbuffers_version: - "=24.3.25" spdlog_version: - - ">=1.12.0,<1.13" + - ">=1.14.1,<1.15" nvcomp_version: - "=4.0.1" diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 7bc01e64441..84b462bb884 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -378,6 +378,7 @@ add_library( src/io/csv/reader_impl.cu src/io/csv/writer_impl.cu src/io/functions.cpp + src/io/json/host_tree_algorithms.cu src/io/json/json_column.cu src/io/json/json_normalization.cu src/io/json/json_tree.cu @@ -797,7 +798,7 @@ add_dependencies(cudf jitify_preprocess_run) # Specify the target module library dependencies target_link_libraries( cudf - PUBLIC CCCL::CCCL rmm::rmm $ + PUBLIC CCCL::CCCL rmm::rmm $ spdlog::spdlog_header_only PRIVATE $ cuco::cuco ZLIB::ZLIB nvcomp::nvcomp kvikio::kvikio $ nanoarrow ) diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 3bf9d02b384..abc6f74fccf 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -177,11 +177,11 @@ ConfigureBench(TRANSPOSE_BENCH transpose/transpose.cpp) # ################################################################################################## # * nds-h benchmark -------------------------------------------------------------------------------- -ConfigureNVBench(NDSH_Q1 ndsh/q01.cpp ndsh/utilities.cpp) -ConfigureNVBench(NDSH_Q5 ndsh/q05.cpp ndsh/utilities.cpp) -ConfigureNVBench(NDSH_Q6 ndsh/q06.cpp ndsh/utilities.cpp) -ConfigureNVBench(NDSH_Q9 ndsh/q09.cpp ndsh/utilities.cpp) -ConfigureNVBench(NDSH_Q10 ndsh/q10.cpp ndsh/utilities.cpp) +ConfigureNVBench(NDSH_Q01_NVBENCH ndsh/q01.cpp ndsh/utilities.cpp) +ConfigureNVBench(NDSH_Q05_NVBENCH ndsh/q05.cpp ndsh/utilities.cpp) +ConfigureNVBench(NDSH_Q06_NVBENCH ndsh/q06.cpp ndsh/utilities.cpp) +ConfigureNVBench(NDSH_Q09_NVBENCH ndsh/q09.cpp ndsh/utilities.cpp) +ConfigureNVBench(NDSH_Q10_NVBENCH ndsh/q10.cpp ndsh/utilities.cpp) # ################################################################################################## # * stream_compaction benchmark ------------------------------------------------------------------- @@ -337,7 +337,7 @@ ConfigureBench(TEXT_BENCH text/ngrams.cpp text/subword.cpp) ConfigureNVBench( TEXT_NVBENCH text/edit_distance.cpp text/hash_ngrams.cpp text/jaccard.cpp text/minhash.cpp - text/normalize.cpp text/replace.cpp text/tokenize.cpp text/vocab.cpp + text/normalize.cpp text/replace.cpp text/tokenize.cpp text/vocab.cpp text/word_minhash.cpp ) # ################################################################################################## diff --git a/cpp/benchmarks/common/ndsh_data_generator/table_helpers.cpp 
b/cpp/benchmarks/common/ndsh_data_generator/table_helpers.cpp index d4368906702..54d177df401 100644 --- a/cpp/benchmarks/common/ndsh_data_generator/table_helpers.cpp +++ b/cpp/benchmarks/common/ndsh_data_generator/table_helpers.cpp @@ -85,7 +85,7 @@ std::unique_ptr perform_left_join(cudf::table_view const& left_inpu auto const left_selected = left_input.select(left_on); auto const right_selected = right_input.select(right_on); auto const [left_join_indices, right_join_indices] = - cudf::left_join(left_selected, right_selected, cudf::null_equality::EQUAL, mr); + cudf::left_join(left_selected, right_selected, cudf::null_equality::EQUAL, stream, mr); auto const left_indices_span = cudf::device_span{*left_join_indices}; auto const right_indices_span = cudf::device_span{*right_join_indices}; diff --git a/cpp/benchmarks/io/parquet/parquet_reader_input.cpp b/cpp/benchmarks/io/parquet/parquet_reader_input.cpp index 7563c823454..ce115fd7723 100644 --- a/cpp/benchmarks/io/parquet/parquet_reader_input.cpp +++ b/cpp/benchmarks/io/parquet/parquet_reader_input.cpp @@ -32,7 +32,8 @@ constexpr cudf::size_type num_cols = 64; void parquet_read_common(cudf::size_type num_rows_to_read, cudf::size_type num_cols_to_read, cuio_source_sink_pair& source_sink, - nvbench::state& state) + nvbench::state& state, + size_t table_data_size = data_size) { cudf::io::parquet_reader_options read_opts = cudf::io::parquet_reader_options::builder(source_sink.make_source_info()); @@ -52,7 +53,7 @@ void parquet_read_common(cudf::size_type num_rows_to_read, }); auto const time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value"); - state.add_element_count(static_cast(data_size) / time, "bytes_per_second"); + state.add_element_count(static_cast(table_data_size) / time, "bytes_per_second"); state.add_buffer_size( mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage"); state.add_buffer_size(source_sink.size(), "encoded_file_size", "encoded_file_size"); @@ -231,6 +232,70 @@ void BM_parquet_read_chunks(nvbench::state& state, nvbench::type_list +void BM_parquet_read_wide_tables(nvbench::state& state, + nvbench::type_list> type_list) +{ + auto const d_type = get_type_or_group(static_cast(DataType)); + + auto const n_col = static_cast(state.get_int64("num_cols")); + auto const data_size_bytes = static_cast(state.get_int64("data_size_mb") << 20); + auto const cardinality = static_cast(state.get_int64("cardinality")); + auto const run_length = static_cast(state.get_int64("run_length")); + auto const source_type = io_type::DEVICE_BUFFER; + cuio_source_sink_pair source_sink(source_type); + + auto const num_rows_written = [&]() { + auto const tbl = create_random_table( + cycle_dtypes(d_type, n_col), + table_size_bytes{data_size_bytes}, + data_profile_builder().cardinality(cardinality).avg_run_length(run_length)); + auto const view = tbl->view(); + + cudf::io::parquet_writer_options write_opts = + cudf::io::parquet_writer_options::builder(source_sink.make_sink_info(), view) + .compression(cudf::io::compression_type::NONE); + cudf::io::write_parquet(write_opts); + return view.num_rows(); + }(); + + parquet_read_common(num_rows_written, n_col, source_sink, state, data_size_bytes); +} + +void BM_parquet_read_wide_tables_mixed(nvbench::state& state) +{ + auto const d_type = []() { + auto d_type1 = get_type_or_group(static_cast(data_type::INTEGRAL)); + auto d_type2 = get_type_or_group(static_cast(data_type::FLOAT)); + d_type1.reserve(d_type1.size() + d_type2.size()); + std::move(d_type2.begin(), d_type2.end(), 
std::back_inserter(d_type1)); + return d_type1; + }(); + + auto const n_col = static_cast(state.get_int64("num_cols")); + auto const data_size_bytes = static_cast(state.get_int64("data_size_mb") << 20); + auto const cardinality = static_cast(state.get_int64("cardinality")); + auto const run_length = static_cast(state.get_int64("run_length")); + auto const source_type = io_type::DEVICE_BUFFER; + cuio_source_sink_pair source_sink(source_type); + + auto const num_rows_written = [&]() { + auto const tbl = create_random_table( + cycle_dtypes(d_type, n_col), + table_size_bytes{data_size_bytes}, + data_profile_builder().cardinality(cardinality).avg_run_length(run_length)); + auto const view = tbl->view(); + + cudf::io::parquet_writer_options write_opts = + cudf::io::parquet_writer_options::builder(source_sink.make_sink_info(), view) + .compression(cudf::io::compression_type::NONE); + cudf::io::write_parquet(write_opts); + return view.num_rows(); + }(); + + parquet_read_common(num_rows_written, n_col, source_sink, state, data_size_bytes); +} + using d_type_list = nvbench::enum_type_list; +NVBENCH_BENCH_TYPES(BM_parquet_read_wide_tables, NVBENCH_TYPE_AXES(d_type_list_wide_table)) + .set_name("parquet_read_wide_tables") + .set_min_samples(4) + .set_type_axes_names({"data_type"}) + .add_int64_axis("data_size_mb", {1024, 2048, 4096}) + .add_int64_axis("num_cols", {256, 512, 1024}) + .add_int64_axis("cardinality", {0, 1000}) + .add_int64_axis("run_length", {1, 32}); + +NVBENCH_BENCH(BM_parquet_read_wide_tables_mixed) + .set_name("parquet_read_wide_tables_mixed") + .set_min_samples(4) + .add_int64_axis("data_size_mb", {1024, 2048, 4096}) + .add_int64_axis("num_cols", {256, 512, 1024}) + .add_int64_axis("cardinality", {0, 1000}) + .add_int64_axis("run_length", {1, 32}); + // a benchmark for structs that only contain fixed-width types using d_type_list_struct_only = nvbench::enum_type_list; NVBENCH_BENCH_TYPES(BM_parquet_read_fixed_width_struct, NVBENCH_TYPE_AXES(d_type_list_struct_only)) diff --git a/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp b/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp index 3abd4280081..7121cb9f034 100644 --- a/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp +++ b/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp @@ -50,7 +50,7 @@ std::string get_label(std::string const& test_name, nvbench::state const& state) } std::tuple, size_t, size_t> write_file_data( - nvbench::state& state, std::vector const& d_types) + nvbench::state& state, std::vector const& d_types, io_type io_source_type) { cudf::size_type const cardinality = state.get_int64("cardinality"); cudf::size_type const run_length = state.get_int64("run_length"); @@ -63,7 +63,7 @@ std::tuple, size_t, size_t> write_file_data( size_t total_file_size = 0; for (size_t i = 0; i < num_files; ++i) { - cuio_source_sink_pair source_sink{io_type::HOST_BUFFER}; + cuio_source_sink_pair source_sink{io_source_type}; auto const tbl = create_random_table( cycle_dtypes(d_types, num_cols), @@ -92,11 +92,13 @@ void BM_parquet_multithreaded_read_common(nvbench::state& state, { size_t const data_size = state.get_int64("total_data_size"); auto const num_threads = state.get_int64("num_threads"); + auto const source_type = retrieve_io_type_enum(state.get_string("io_type")); auto streams = cudf::detail::fork_streams(cudf::get_default_stream(), num_threads); BS::thread_pool threads(num_threads); - auto [source_sink_vector, total_file_size, num_files] = write_file_data(state, d_types); + auto [source_sink_vector, 
total_file_size, num_files] = + write_file_data(state, d_types, source_type); std::vector source_info_vector; std::transform(source_sink_vector.begin(), source_sink_vector.end(), @@ -173,10 +175,12 @@ void BM_parquet_multithreaded_read_chunked_common(nvbench::state& state, auto const num_threads = state.get_int64("num_threads"); size_t const input_limit = state.get_int64("input_limit"); size_t const output_limit = state.get_int64("output_limit"); + auto const source_type = retrieve_io_type_enum(state.get_string("io_type")); auto streams = cudf::detail::fork_streams(cudf::get_default_stream(), num_threads); BS::thread_pool threads(num_threads); - auto [source_sink_vector, total_file_size, num_files] = write_file_data(state, d_types); + auto [source_sink_vector, total_file_size, num_files] = + write_file_data(state, d_types, source_type); std::vector source_info_vector; std::transform(source_sink_vector.begin(), source_sink_vector.end(), @@ -264,7 +268,8 @@ NVBENCH_BENCH(BM_parquet_multithreaded_read_mixed) .add_int64_axis("total_data_size", {512 * 1024 * 1024, 1024 * 1024 * 1024}) .add_int64_axis("num_threads", {1, 2, 4, 8}) .add_int64_axis("num_cols", {4}) - .add_int64_axis("run_length", {8}); + .add_int64_axis("run_length", {8}) + .add_string_axis("io_type", {"PINNED_BUFFER"}); NVBENCH_BENCH(BM_parquet_multithreaded_read_fixed_width) .set_name("parquet_multithreaded_read_decode_fixed_width") @@ -273,7 +278,8 @@ NVBENCH_BENCH(BM_parquet_multithreaded_read_fixed_width) .add_int64_axis("total_data_size", {512 * 1024 * 1024, 1024 * 1024 * 1024}) .add_int64_axis("num_threads", {1, 2, 4, 8}) .add_int64_axis("num_cols", {4}) - .add_int64_axis("run_length", {8}); + .add_int64_axis("run_length", {8}) + .add_string_axis("io_type", {"PINNED_BUFFER"}); NVBENCH_BENCH(BM_parquet_multithreaded_read_string) .set_name("parquet_multithreaded_read_decode_string") @@ -282,7 +288,8 @@ NVBENCH_BENCH(BM_parquet_multithreaded_read_string) .add_int64_axis("total_data_size", {512 * 1024 * 1024, 1024 * 1024 * 1024}) .add_int64_axis("num_threads", {1, 2, 4, 8}) .add_int64_axis("num_cols", {4}) - .add_int64_axis("run_length", {8}); + .add_int64_axis("run_length", {8}) + .add_string_axis("io_type", {"PINNED_BUFFER"}); NVBENCH_BENCH(BM_parquet_multithreaded_read_list) .set_name("parquet_multithreaded_read_decode_list") @@ -291,7 +298,8 @@ NVBENCH_BENCH(BM_parquet_multithreaded_read_list) .add_int64_axis("total_data_size", {512 * 1024 * 1024, 1024 * 1024 * 1024}) .add_int64_axis("num_threads", {1, 2, 4, 8}) .add_int64_axis("num_cols", {4}) - .add_int64_axis("run_length", {8}); + .add_int64_axis("run_length", {8}) + .add_string_axis("io_type", {"PINNED_BUFFER"}); // mixed data types: fixed width, strings NVBENCH_BENCH(BM_parquet_multithreaded_read_chunked_mixed) @@ -303,7 +311,8 @@ NVBENCH_BENCH(BM_parquet_multithreaded_read_chunked_mixed) .add_int64_axis("num_cols", {4}) .add_int64_axis("run_length", {8}) .add_int64_axis("input_limit", {640 * 1024 * 1024}) - .add_int64_axis("output_limit", {640 * 1024 * 1024}); + .add_int64_axis("output_limit", {640 * 1024 * 1024}) + .add_string_axis("io_type", {"PINNED_BUFFER"}); NVBENCH_BENCH(BM_parquet_multithreaded_read_chunked_fixed_width) .set_name("parquet_multithreaded_read_decode_chunked_fixed_width") @@ -314,7 +323,8 @@ NVBENCH_BENCH(BM_parquet_multithreaded_read_chunked_fixed_width) .add_int64_axis("num_cols", {4}) .add_int64_axis("run_length", {8}) .add_int64_axis("input_limit", {640 * 1024 * 1024}) - .add_int64_axis("output_limit", {640 * 1024 * 1024}); + 
.add_int64_axis("output_limit", {640 * 1024 * 1024}) + .add_string_axis("io_type", {"PINNED_BUFFER"}); NVBENCH_BENCH(BM_parquet_multithreaded_read_chunked_string) .set_name("parquet_multithreaded_read_decode_chunked_string") @@ -325,7 +335,8 @@ NVBENCH_BENCH(BM_parquet_multithreaded_read_chunked_string) .add_int64_axis("num_cols", {4}) .add_int64_axis("run_length", {8}) .add_int64_axis("input_limit", {640 * 1024 * 1024}) - .add_int64_axis("output_limit", {640 * 1024 * 1024}); + .add_int64_axis("output_limit", {640 * 1024 * 1024}) + .add_string_axis("io_type", {"PINNED_BUFFER"}); NVBENCH_BENCH(BM_parquet_multithreaded_read_chunked_list) .set_name("parquet_multithreaded_read_decode_chunked_list") @@ -336,4 +347,5 @@ NVBENCH_BENCH(BM_parquet_multithreaded_read_chunked_list) .add_int64_axis("num_cols", {4}) .add_int64_axis("run_length", {8}) .add_int64_axis("input_limit", {640 * 1024 * 1024}) - .add_int64_axis("output_limit", {640 * 1024 * 1024}); + .add_int64_axis("output_limit", {640 * 1024 * 1024}) + .add_string_axis("io_type", {"PINNED_BUFFER"}); diff --git a/cpp/benchmarks/ndsh/utilities.cpp b/cpp/benchmarks/ndsh/utilities.cpp index 2d514764fc2..62116ddf661 100644 --- a/cpp/benchmarks/ndsh/utilities.cpp +++ b/cpp/benchmarks/ndsh/utilities.cpp @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -146,11 +147,15 @@ std::unique_ptr join_and_gather(cudf::table_view const& left_input, cudf::null_equality compare_nulls) { CUDF_FUNC_RANGE(); - constexpr auto oob_policy = cudf::out_of_bounds_policy::DONT_CHECK; - auto const left_selected = left_input.select(left_on); - auto const right_selected = right_input.select(right_on); - auto const [left_join_indices, right_join_indices] = cudf::inner_join( - left_selected, right_selected, compare_nulls, cudf::get_current_device_resource_ref()); + constexpr auto oob_policy = cudf::out_of_bounds_policy::DONT_CHECK; + auto const left_selected = left_input.select(left_on); + auto const right_selected = right_input.select(right_on); + auto const [left_join_indices, right_join_indices] = + cudf::inner_join(left_selected, + right_selected, + compare_nulls, + cudf::get_default_stream(), + cudf::get_current_device_resource_ref()); auto const left_indices_span = cudf::device_span{*left_join_indices}; auto const right_indices_span = cudf::device_span{*right_join_indices}; diff --git a/cpp/benchmarks/text/word_minhash.cpp b/cpp/benchmarks/text/word_minhash.cpp new file mode 100644 index 00000000000..adc3dddc59c --- /dev/null +++ b/cpp/benchmarks/text/word_minhash.cpp @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include +#include +#include +#include + +#include + +#include + +#include + +static void bench_word_minhash(nvbench::state& state) +{ + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const row_width = static_cast(state.get_int64("row_width")); + auto const seed_count = static_cast(state.get_int64("seed_count")); + auto const base64 = state.get_int64("hash_type") == 64; + + data_profile const strings_profile = + data_profile_builder().distribution(cudf::type_id::STRING, distribution_id::NORMAL, 0, 5); + auto strings_table = + create_random_table({cudf::type_id::STRING}, row_count{num_rows}, strings_profile); + + auto const num_offsets = (num_rows / row_width) + 1; + auto offsets = cudf::sequence(num_offsets, + cudf::numeric_scalar(0), + cudf::numeric_scalar(row_width)); + + auto source = cudf::make_lists_column(num_offsets - 1, + std::move(offsets), + std::move(strings_table->release().front()), + 0, + rmm::device_buffer{}); + + data_profile const seeds_profile = data_profile_builder().no_validity().distribution( + cudf::type_to_id(), distribution_id::NORMAL, 0, 256); + auto const seed_type = base64 ? cudf::type_id::UINT64 : cudf::type_id::UINT32; + auto const seeds_table = create_random_table({seed_type}, row_count{seed_count}, seeds_profile); + auto seeds = seeds_table->get_column(0); + + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); + + cudf::strings_column_view input(cudf::lists_column_view(source->view()).child()); + auto chars_size = input.chars_size(cudf::get_default_stream()); + state.add_global_memory_reads(chars_size); + state.add_global_memory_writes(num_rows); // outputs are hashes + + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + auto result = base64 ? nvtext::word_minhash64(source->view(), seeds.view()) + : nvtext::word_minhash(source->view(), seeds.view()); + }); +} + +NVBENCH_BENCH(bench_word_minhash) + .set_name("word_minhash") + .add_int64_axis("num_rows", {131072, 262144, 524288, 1048576, 2097152}) + .add_int64_axis("row_width", {10, 100, 1000}) + .add_int64_axis("seed_count", {2, 25}) + .add_int64_axis("hash_type", {32, 64}); diff --git a/cpp/cmake/thirdparty/get_spdlog.cmake b/cpp/cmake/thirdparty/get_spdlog.cmake index c0e07d02d94..90b0f4d8a8e 100644 --- a/cpp/cmake/thirdparty/get_spdlog.cmake +++ b/cpp/cmake/thirdparty/get_spdlog.cmake @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License.
You may obtain a copy of the License at @@ -16,21 +16,12 @@ function(find_and_configure_spdlog) include(${rapids-cmake-dir}/cpm/spdlog.cmake) - rapids_cpm_spdlog(FMT_OPTION "EXTERNAL_FMT_HO" INSTALL_EXPORT_SET cudf-exports) - rapids_export_package(BUILD spdlog cudf-exports) + rapids_cpm_spdlog( + FMT_OPTION "EXTERNAL_FMT_HO" + INSTALL_EXPORT_SET cudf-exports + BUILD_EXPORT_SET cudf-exports + ) - if(spdlog_ADDED) - rapids_export( - BUILD spdlog - EXPORT_SET spdlog - GLOBAL_TARGETS spdlog spdlog_header_only - NAMESPACE spdlog:: - ) - include("${rapids-cmake-dir}/export/find_package_root.cmake") - rapids_export_find_package_root( - BUILD spdlog [=[${CMAKE_CURRENT_LIST_DIR}]=] EXPORT_SET cudf-exports - ) - endif() endfunction() find_and_configure_spdlog() diff --git a/cpp/doxygen/regex.md b/cpp/doxygen/regex.md index 8d206f245dc..6d1c91a5752 100644 --- a/cpp/doxygen/regex.md +++ b/cpp/doxygen/regex.md @@ -17,6 +17,12 @@ The details are based on features documented at https://www.regular-expressions. **Note:** The alternation character is the pipe character `|` and not the character included in the tables on this page. There is an issue including the pipe character inside the table markdown that is rendered by doxygen. +By default, only the `\n` character is recognized as a line break. The [cudf::strings::regex_flags::EXT_NEWLINE](@ref cudf::strings::regex_flags) flag expands the set of recognized line break characters to include: +- Paragraph separator (Unicode: `2029`, UTF-8: `E280A9`) +- Line separator (Unicode: `2028`, UTF-8: `E280A8`) +- Next line (Unicode: `0085`, UTF-8: `C285`) +- Carriage return (Unicode: `000D`, UTF-8: `0D`) + **Invalid regex patterns will result in undefined behavior**. This includes but is not limited to the following: - Unescaped special characters (listed in the third row of the Characters table below) when they are intended to match as literals. - Unmatched paired special characters like `()`, `[]`, and `{}`. diff --git a/cpp/examples/parquet_io/parquet_io.cpp b/cpp/examples/parquet_io/parquet_io.cpp index 442731694fa..9cda22d0695 100644 --- a/cpp/examples/parquet_io/parquet_io.cpp +++ b/cpp/examples/parquet_io/parquet_io.cpp @@ -18,6 +18,8 @@ #include "../utilities/timer.hpp" +#include + /** * @file parquet_io.cpp * @brief Demonstrates usage of the libcudf APIs to read and write @@ -159,8 +161,11 @@ int main(int argc, char const** argv) // Left anti-join the original and transcoded tables // identical tables should not throw an exception and // return an empty indices vector - auto const indices = cudf::left_anti_join( - input->view(), transcoded_input->view(), cudf::null_equality::EQUAL, resource.get()); + auto const indices = cudf::left_anti_join(input->view(), + transcoded_input->view(), + cudf::null_equality::EQUAL, + cudf::get_default_stream(), + resource.get()); // No exception thrown, check indices auto const valid = indices->size() == 0; diff --git a/cpp/examples/versions.cmake b/cpp/examples/versions.cmake index 44493011673..51613090534 100644 --- a/cpp/examples/versions.cmake +++ b/cpp/examples/versions.cmake @@ -12,4 +12,4 @@ # the License.
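A usage sketch for the EXT_NEWLINE flag documented in the regex.md hunk above (illustrative only, not part of the patch; the helper name is made up, while regex_program and contains_re are existing public cudf::strings APIs):

// Illustration only, not part of this patch. With MULTILINE plus the new
// EXT_NEWLINE flag, '$' also anchors before NEL (U+0085), LS (U+2028),
// PS (U+2029), and CR, not just before '\n'.
#include <cudf/column/column.hpp>
#include <cudf/strings/contains.hpp>
#include <cudf/strings/regex/regex_program.hpp>
#include <cudf/strings/strings_column_view.hpp>

#include <memory>

std::unique_ptr<cudf::column> ends_line_with_fee(cudf::strings_column_view const& input)
{
  // regex_flags is an unscoped bitmask enum; combining flags needs a cast back.
  auto const flags = static_cast<cudf::strings::regex_flags>(
    cudf::strings::regex_flags::MULTILINE | cudf::strings::regex_flags::EXT_NEWLINE);
  auto const prog = cudf::strings::regex_program::create("fee$", flags);
  return cudf::strings::contains_re(input, *prog);  // BOOL8 column of matches
}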
# ============================================================================= -set(CUDF_TAG branch-24.10) +set(CUDF_TAG branch-24.12) diff --git a/cpp/include/cudf/datetime.hpp b/cpp/include/cudf/datetime.hpp index c7523c80b2b..7359a0d5fde 100644 --- a/cpp/include/cudf/datetime.hpp +++ b/cpp/include/cudf/datetime.hpp @@ -17,9 +17,12 @@ #pragma once #include +#include #include #include +#include + #include /** @@ -40,6 +43,7 @@ namespace datetime { * cudf::column. * * @param column cudf::column_view of the input datetime values + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * * @returns cudf::column of the extracted int16_t years @@ -47,6 +51,7 @@ namespace datetime { */ std::unique_ptr extract_year( cudf::column_view const& column, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -54,6 +59,7 @@ std::unique_ptr extract_year( * cudf::column. * * @param column cudf::column_view of the input datetime values + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * * @returns cudf::column of the extracted int16_t months @@ -61,6 +67,7 @@ std::unique_ptr extract_year( */ std::unique_ptr extract_month( cudf::column_view const& column, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -68,6 +75,7 @@ std::unique_ptr extract_month( * cudf::column. * * @param column cudf::column_view of the input datetime values + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * * @returns cudf::column of the extracted int16_t days @@ -75,6 +83,7 @@ std::unique_ptr extract_month( */ std::unique_ptr extract_day( cudf::column_view const& column, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -82,6 +91,7 @@ std::unique_ptr extract_day( * cudf::column. * * @param column cudf::column_view of the input datetime values + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * * @returns cudf::column of the extracted int16_t days @@ -89,6 +99,7 @@ std::unique_ptr extract_day( */ std::unique_ptr extract_weekday( cudf::column_view const& column, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -96,6 +107,7 @@ std::unique_ptr extract_weekday( * cudf::column. * * @param column cudf::column_view of the input datetime values + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * * @returns cudf::column of the extracted int16_t hours @@ -103,6 +115,7 @@ std::unique_ptr extract_weekday( */ std::unique_ptr extract_hour( cudf::column_view const& column, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -110,6 +123,7 @@ std::unique_ptr extract_hour( * cudf::column. 
* * @param column cudf::column_view of the input datetime values + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * * @returns cudf::column of the extracted int16_t minutes @@ -117,6 +131,7 @@ std::unique_ptr extract_hour( */ std::unique_ptr extract_minute( cudf::column_view const& column, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -124,6 +139,7 @@ std::unique_ptr extract_minute( * cudf::column. * * @param column cudf::column_view of the input datetime values + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * * @returns cudf::column of the extracted int16_t seconds @@ -131,6 +147,7 @@ std::unique_ptr extract_minute( */ std::unique_ptr extract_second( cudf::column_view const& column, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -141,6 +158,7 @@ std::unique_ptr extract_second( * For example, the millisecond fraction of 1.234567890 seconds is 234. * * @param column cudf::column_view of the input datetime values + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * * @returns cudf::column of the extracted int16_t milliseconds @@ -148,6 +166,7 @@ std::unique_ptr extract_second( */ std::unique_ptr extract_millisecond_fraction( cudf::column_view const& column, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -158,6 +177,7 @@ std::unique_ptr extract_millisecond_fraction( * For example, the microsecond fraction of 1.234567890 seconds is 567. * * @param column cudf::column_view of the input datetime values + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * * @returns cudf::column of the extracted int16_t microseconds @@ -165,6 +185,7 @@ std::unique_ptr extract_millisecond_fraction( */ std::unique_ptr extract_microsecond_fraction( cudf::column_view const& column, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -175,6 +196,7 @@ std::unique_ptr extract_microsecond_fraction( * For example, the nanosecond fraction of 1.234567890 seconds is 890. * * @param column cudf::column_view of the input datetime values + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * * @returns cudf::column of the extracted int16_t nanoseconds @@ -182,6 +204,7 @@ std::unique_ptr extract_microsecond_fraction( */ std::unique_ptr extract_nanosecond_fraction( cudf::column_view const& column, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @} */ // end of group @@ -196,6 +219,7 @@ std::unique_ptr extract_nanosecond_fraction( * cudf::column. 
* * @param column cudf::column_view of the input datetime values + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * * @returns cudf::column containing last day of the month as TIMESTAMP_DAYS @@ -203,6 +227,7 @@ std::unique_ptr extract_nanosecond_fraction( */ std::unique_ptr last_day_of_month( cudf::column_view const& column, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -210,6 +235,7 @@ std::unique_ptr last_day_of_month( * returns an int16_t cudf::column. The value is between [1, {365-366}] * * @param column cudf::column_view of the input datetime values + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * * @returns cudf::column of datatype INT16 containing the day number since the start of the year @@ -217,6 +243,7 @@ std::unique_ptr last_day_of_month( */ std::unique_ptr day_of_year( cudf::column_view const& column, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -245,6 +272,7 @@ std::unique_ptr day_of_year( * * @param timestamps cudf::column_view of timestamp type * @param months cudf::column_view of integer type containing the number of months to add + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * * @returns cudf::column of timestamp type containing the computed timestamps @@ -252,6 +280,7 @@ std::unique_ptr day_of_year( std::unique_ptr add_calendrical_months( cudf::column_view const& timestamps, cudf::column_view const& months, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -280,6 +309,7 @@ std::unique_ptr add_calendrical_months( * * @param timestamps cudf::column_view of timestamp type * @param months cudf::scalar of integer type containing the number of months to add + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * * @return cudf::column of timestamp type containing the computed timestamps @@ -287,6 +317,7 @@ std::unique_ptr add_calendrical_months( std::unique_ptr add_calendrical_months( cudf::column_view const& timestamps, cudf::scalar const& months, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -297,6 +328,7 @@ std::unique_ptr add_calendrical_months( * `output[i] is null` if `column[i]` is null * * @param column cudf::column_view of the input datetime values + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * * @returns cudf::column of datatype BOOL8 truth value of the corresponding date @@ -304,6 +336,7 @@ std::unique_ptr add_calendrical_months( */ std::unique_ptr is_leap_year( cudf::column_view const& column, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -315,11 +348,13 @@ 
std::unique_ptr is_leap_year( * @throw cudf::logic_error if input column datatype is not a TIMESTAMP * * @param column cudf::column_view of the input datetime values + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * @return cudf::column of datatype INT16 of days in month of the corresponding date */ std::unique_ptr days_in_month( cudf::column_view const& column, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -331,11 +366,13 @@ std::unique_ptr days_in_month( * @throw cudf::logic_error if input column datatype is not a TIMESTAMP * * @param column The input column containing datetime values + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * @return A column of INT16 type indicating which quarter the date is in */ std::unique_ptr extract_quarter( cudf::column_view const& column, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -357,6 +394,7 @@ enum class rounding_frequency : int32_t { * * @param column cudf::column_view of the input datetime values * @param freq rounding_frequency indicating the frequency to round up to + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * * @throw cudf::logic_error if input column datatype is not TIMESTAMP. @@ -365,6 +403,7 @@ enum class rounding_frequency : int32_t { std::unique_ptr ceil_datetimes( cudf::column_view const& column, rounding_frequency freq, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -372,6 +411,7 @@ std::unique_ptr ceil_datetimes( * * @param column cudf::column_view of the input datetime values * @param freq rounding_frequency indicating the frequency to round down to + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * * @throw cudf::logic_error if input column datatype is not TIMESTAMP. @@ -380,6 +420,7 @@ std::unique_ptr ceil_datetimes( std::unique_ptr floor_datetimes( cudf::column_view const& column, rounding_frequency freq, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -387,6 +428,7 @@ std::unique_ptr floor_datetimes( * * @param column cudf::column_view of the input datetime values * @param freq rounding_frequency indicating the frequency to round to + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * * @throw cudf::logic_error if input column datatype is not TIMESTAMP. 
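A usage sketch for the stream parameter these datetime declarations gain (illustrative only, not part of the patch; the wrapper name is made up):

// Illustrative wrapper, not part of this patch. The new stream parameter sits
// before mr and defaults to cudf::get_default_stream(), so existing call sites
// keep compiling while new code can supply its own stream.
#include <cudf/column/column.hpp>
#include <cudf/column/column_view.hpp>
#include <cudf/datetime.hpp>

#include <rmm/cuda_stream_view.hpp>

#include <memory>

std::unique_ptr<cudf::column> years_on_stream(cudf::column_view const& timestamps,
                                              rmm::cuda_stream_view stream)
{
  return cudf::datetime::extract_year(timestamps, stream);  // mr stays defaulted
}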
@@ -395,6 +437,7 @@ std::unique_ptr floor_datetimes( std::unique_ptr round_datetimes( cudf::column_view const& column, rounding_frequency freq, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @} */ // end of group diff --git a/cpp/include/cudf/detail/aggregation/aggregation.hpp b/cpp/include/cudf/detail/aggregation/aggregation.hpp index b257eef1e9e..4255faea702 100644 --- a/cpp/include/cudf/detail/aggregation/aggregation.hpp +++ b/cpp/include/cudf/detail/aggregation/aggregation.hpp @@ -1497,8 +1497,7 @@ AGG_KIND_MAPPING(aggregation::VARIANCE, var_aggregation); * * @tparam F Type of callable * @param k The `aggregation::Kind` value to dispatch - * aram f The callable that accepts an `aggregation::Kind` non-type template - * argument. + * @param f The callable that accepts an `aggregation::Kind` non-type template argument. * @param args Parameter pack forwarded to the `operator()` invocation * @return Forwards the return value of the callable. */ @@ -1626,6 +1625,7 @@ struct dispatch_source { * parameter of the callable `F` * @param k The `aggregation::Kind` used to dispatch an `aggregation::Kind` * non-type template parameter for the second template parameter of the callable + * @param f The callable that accepts `data_type` and `aggregation::Kind` template arguments. * @param args Parameter pack forwarded to the `operator()` invocation * `F`. */ @@ -1644,8 +1644,8 @@ CUDF_HOST_DEVICE inline constexpr decltype(auto) dispatch_type_and_aggregation(d * @brief Returns the target `data_type` for the specified aggregation k * performed on elements of type source_type. * - * aram source_type The element type to be aggregated - * aram k The aggregation + * @param source_type The element type to be aggregated + * @param k The aggregation kind * @return data_type The target_type of k performed on source_type * elements */ diff --git a/cpp/include/cudf/detail/datetime.hpp b/cpp/include/cudf/detail/datetime.hpp index 31782cbaf8a..9db7e48498f 100644 --- a/cpp/include/cudf/detail/datetime.hpp +++ b/cpp/include/cudf/detail/datetime.hpp @@ -26,111 +26,108 @@ namespace CUDF_EXPORT cudf { namespace datetime { namespace detail { /** - * @copydoc cudf::extract_year(cudf::column_view const&, rmm::device_async_resource_ref) + * @copydoc cudf::extract_year(cudf::column_view const&, rmm::cuda_stream_view, + * rmm::device_async_resource_ref) * - * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr extract_year(cudf::column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); /** - * @copydoc cudf::extract_month(cudf::column_view const&, rmm::device_async_resource_ref) + * @copydoc cudf::extract_month(cudf::column_view const&, rmm::cuda_stream_view, + * rmm::device_async_resource_ref) * - * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr extract_month(cudf::column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); /** - * @copydoc cudf::extract_day(cudf::column_view const&, rmm::device_async_resource_ref) + * @copydoc cudf::extract_day(cudf::column_view const&, rmm::cuda_stream_view, + * rmm::device_async_resource_ref) * - * @param stream CUDA stream used for device memory operations and kernel launches.
*/ std::unique_ptr extract_day(cudf::column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); /** - * @copydoc cudf::extract_weekday(cudf::column_view const&, rmm::device_async_resource_ref) + * @copydoc cudf::extract_weekday(cudf::column_view const&, rmm::cuda_stream_view, + * rmm::device_async_resource_ref) * - * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr extract_weekday(cudf::column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); /** - * @copydoc cudf::extract_hour(cudf::column_view const&, rmm::device_async_resource_ref) + * @copydoc cudf::extract_hour(cudf::column_view const&, rmm::cuda_stream_view, + * rmm::device_async_resource_ref) * - * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr extract_hour(cudf::column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); /** - * @copydoc cudf::extract_minute(cudf::column_view const&, rmm::device_async_resource_ref) + * @copydoc cudf::extract_minute(cudf::column_view const&, rmm::cuda_stream_view, + * rmm::device_async_resource_ref) * - * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr extract_minute(cudf::column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); /** - * @copydoc cudf::extract_second(cudf::column_view const&, rmm::device_async_resource_ref) + * @copydoc cudf::extract_second(cudf::column_view const&, rmm::cuda_stream_view, + * rmm::device_async_resource_ref) * - * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr extract_second(cudf::column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); /** - * @copydoc cudf::extract_millisecond_fraction(cudf::column_view const&, + * @copydoc cudf::extract_millisecond_fraction(cudf::column_view const&, rmm::cuda_stream_view, * rmm::device_async_resource_ref) * - * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr extract_millisecond_fraction(cudf::column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); /** - * @copydoc cudf::extract_microsecond_fraction(cudf::column_view const&, + * @copydoc cudf::extract_microsecond_fraction(cudf::column_view const&, rmm::cuda_stream_view, * rmm::device_async_resource_ref) * - * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr extract_microsecond_fraction(cudf::column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); /** - * @copydoc cudf::extract_nanosecond_fraction(cudf::column_view const&, + * @copydoc cudf::extract_nanosecond_fraction(cudf::column_view const&, rmm::cuda_stream_view, * rmm::device_async_resource_ref) * - * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr extract_nanosecond_fraction(cudf::column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); /** - * @copydoc cudf::last_day_of_month(cudf::column_view const&, rmm::device_async_resource_ref) + * @copydoc cudf::last_day_of_month(cudf::column_view const&, rmm::cuda_stream_view, + * rmm::device_async_resource_ref) * - * @param stream CUDA stream used for device memory operations and kernel launches. 
*/ std::unique_ptr last_day_of_month(cudf::column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); /** - * @copydoc cudf::day_of_year(cudf::column_view const&, rmm::device_async_resource_ref) + * @copydoc cudf::day_of_year(cudf::column_view const&, rmm::cuda_stream_view, + * rmm::device_async_resource_ref) * - * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr day_of_year(cudf::column_view const& column, rmm::cuda_stream_view stream, @@ -138,9 +135,8 @@ std::unique_ptr day_of_year(cudf::column_view const& column, /** * @copydoc cudf::add_calendrical_months(cudf::column_view const&, cudf::column_view const&, - * rmm::device_async_resource_ref) + * rmm::cuda_stream_view, rmm::device_async_resource_ref) * - * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr add_calendrical_months(cudf::column_view const& timestamps, cudf::column_view const& months, @@ -149,9 +145,8 @@ std::unique_ptr add_calendrical_months(cudf::column_view const& ti /** * @copydoc cudf::add_calendrical_months(cudf::column_view const&, cudf::scalar const&, - * rmm::device_async_resource_ref) + * rmm::cuda_stream_view, rmm::device_async_resource_ref) * - * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr add_calendrical_months(cudf::column_view const& timestamps, cudf::scalar const& months, @@ -159,9 +154,9 @@ std::unique_ptr add_calendrical_months(cudf::column_view const& ti rmm::device_async_resource_ref mr); /** - * @copydoc cudf::is_leap_year(cudf::column_view const&, rmm::device_async_resource_ref) + * @copydoc cudf::is_leap_year(cudf::column_view const&, rmm::cuda_stream_view, + * rmm::device_async_resource_ref) * - * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr is_leap_year(cudf::column_view const& column, rmm::cuda_stream_view stream, diff --git a/cpp/include/cudf/detail/timezone.hpp b/cpp/include/cudf/detail/timezone.hpp index 5738f9ec8e9..f51d1ba42b2 100644 --- a/cpp/include/cudf/detail/timezone.hpp +++ b/cpp/include/cudf/detail/timezone.hpp @@ -16,6 +16,7 @@ #pragma once #include +#include #include #include @@ -26,14 +27,13 @@ namespace detail { /** * @copydoc cudf::make_timezone_transition_table(std::optional, std::string_view, - * rmm::device_async_resource_ref) + * rmm::cuda_stream_view, rmm::device_async_resource_ref) * - * @param stream CUDA stream used for device memory operations and kernel launches. 
*/ std::unique_ptr make_timezone_transition_table( std::optional tzif_dir, std::string_view timezone_name, - rmm::cuda_stream_view stream, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); } // namespace detail diff --git a/cpp/include/cudf/io/detail/json.hpp b/cpp/include/cudf/io/detail/json.hpp index 73ff17b2b93..940d03cdb41 100644 --- a/cpp/include/cudf/io/detail/json.hpp +++ b/cpp/include/cudf/io/detail/json.hpp @@ -69,11 +69,21 @@ void normalize_single_quotes(datasource::owning_buffer& inda * @brief Normalize unquoted whitespace (space and tab characters) using FST * * @param indata Input device buffer + * @param col_offsets Offsets to column contents in input buffer + * @param col_lengths Length of contents of each row in column * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource to use for device memory allocation + * + * @returns Tuple of the normalized column, offsets to each row in column, and lengths of contents + * of each row */ -void normalize_whitespace(datasource::owning_buffer& indata, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr); +std:: + tuple, rmm::device_uvector, rmm::device_uvector> + normalize_whitespace(device_span d_input, + device_span col_offsets, + device_span col_lengths, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); + } // namespace io::json::detail } // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp index ed7b2ac0850..ee03a382bec 100644 --- a/cpp/include/cudf/io/parquet.hpp +++ b/cpp/include/cudf/io/parquet.hpp @@ -39,8 +39,9 @@ namespace io { * @file */ -constexpr size_t default_row_group_size_bytes = 128 * 1024 * 1024; ///< 128MB per row group -constexpr size_type default_row_group_size_rows = 1000000; ///< 1 million rows per row group +constexpr size_t default_row_group_size_bytes = + std::numeric_limits::max(); ///< Infinite bytes per row group +constexpr size_type default_row_group_size_rows = 1'000'000; ///< 1 million rows per row group constexpr size_t default_max_page_size_bytes = 512 * 1024; ///< 512KB per page constexpr size_type default_max_page_size_rows = 20000; ///< 20k rows per page constexpr int32_t default_column_index_truncate_length = 64; ///< truncate to 64 bytes diff --git a/cpp/include/cudf/join.hpp b/cpp/include/cudf/join.hpp index cc8912cb022..a590eb27511 100644 --- a/cpp/include/cudf/join.hpp +++ b/cpp/include/cudf/join.hpp @@ -97,6 +97,7 @@ class distinct_hash_join; * @param[in] right_keys The right table * @param[in] compare_nulls controls whether null join-key values * should match or not. + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct @@ -108,6 +109,7 @@ std::pair>, inner_join(cudf::table_view const& left_keys, cudf::table_view const& right_keys, null_equality compare_nulls = null_equality::EQUAL, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -137,6 +139,7 @@ inner_join(cudf::table_view const& left_keys, * @param[in] right_keys The right table * @param[in] compare_nulls controls whether null join-key values * should match or not. 
+ * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct @@ -148,6 +151,7 @@ std::pair>, left_join(cudf::table_view const& left_keys, cudf::table_view const& right_keys, null_equality compare_nulls = null_equality::EQUAL, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -176,6 +180,7 @@ left_join(cudf::table_view const& left_keys, * @param[in] right_keys The right table * @param[in] compare_nulls controls whether null join-key values * should match or not. + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct @@ -187,6 +192,7 @@ std::pair>, full_join(cudf::table_view const& left_keys, cudf::table_view const& right_keys, null_equality compare_nulls = null_equality::EQUAL, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -205,6 +211,7 @@ full_join(cudf::table_view const& left_keys, * @param left_keys The left table * @param right_keys The right table * @param compare_nulls Controls whether null join-key values should match or not + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return A vector `left_indices` that can be used to construct @@ -215,6 +222,7 @@ std::unique_ptr> left_semi_join( cudf::table_view const& left_keys, cudf::table_view const& right_keys, null_equality compare_nulls = null_equality::EQUAL, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -236,6 +244,7 @@ std::unique_ptr> left_semi_join( * @param[in] right_keys The right table * @param[in] compare_nulls controls whether null join-key values * should match or not. 
+ * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return A column `left_indices` that can be used to construct @@ -246,6 +255,7 @@ std::unique_ptr> left_anti_join( cudf::table_view const& left_keys, cudf::table_view const& right_keys, null_equality compare_nulls = null_equality::EQUAL, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -266,6 +276,7 @@ std::unique_ptr> left_anti_join( * * @param left The left table * @param right The right table + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table's device memory * * @return Result of cross joining `left` and `right` tables @@ -273,6 +284,7 @@ std::unique_ptr> left_anti_join( std::unique_ptr cross_join( cudf::table_view const& left, cudf::table_view const& right, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -567,6 +579,7 @@ class distinct_hash_join { * @param right The right table * @param binary_predicate The condition on which to join * @param output_size Optional value which allows users to specify the exact output size + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct @@ -578,6 +591,7 @@ conditional_inner_join(table_view const& left, table_view const& right, ast::expression const& binary_predicate, std::optional output_size = {}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -612,6 +626,7 @@ conditional_inner_join(table_view const& left, * @param right The right table * @param binary_predicate The condition on which to join * @param output_size Optional value which allows users to specify the exact output size + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct @@ -623,6 +638,7 @@ conditional_left_join(table_view const& left, table_view const& right, ast::expression const& binary_predicate, std::optional output_size = {}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -655,6 +671,7 @@ conditional_left_join(table_view const& left, * @param left The left table * @param right The right table * @param binary_predicate The condition on which to join + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct @@ -665,6 +682,7 @@ std::pair>, conditional_full_join(table_view const& left, table_view const& right, ast::expression const& binary_predicate, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = 
cudf::get_current_device_resource_ref()); /** @@ -693,6 +711,7 @@ conditional_full_join(table_view const& left, * @param right The right table * @param binary_predicate The condition on which to join * @param output_size Optional value which allows users to specify the exact output size + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return A vector `left_indices` that can be used to construct the result of @@ -704,6 +723,7 @@ std::unique_ptr> conditional_left_semi_join( table_view const& right, ast::expression const& binary_predicate, std::optional output_size = {}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -732,6 +752,7 @@ std::unique_ptr> conditional_left_semi_join( * @param right The right table * @param binary_predicate The condition on which to join * @param output_size Optional value which allows users to specify the exact output size + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return A vector `left_indices` that can be used to construct the result of @@ -743,6 +764,7 @@ std::unique_ptr> conditional_left_anti_join( table_view const& right, ast::expression const& binary_predicate, std::optional output_size = {}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -786,6 +808,7 @@ std::unique_ptr> conditional_left_anti_join( * @param output_size_data An optional pair of values indicating the exact output size and the * number of matches for each row in the larger of the two input tables, left or right (may be * precomputed using the corresponding mixed_inner_join_size API). + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct @@ -801,6 +824,7 @@ mixed_inner_join( ast::expression const& binary_predicate, null_equality compare_nulls = null_equality::EQUAL, std::optional>> output_size_data = {}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -846,6 +870,7 @@ mixed_inner_join( * @param output_size_data An optional pair of values indicating the exact output size and the * number of matches for each row in the larger of the two input tables, left or right (may be * precomputed using the corresponding mixed_left_join_size API). 
+ * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct @@ -861,6 +886,7 @@ mixed_left_join( ast::expression const& binary_predicate, null_equality compare_nulls = null_equality::EQUAL, std::optional>> output_size_data = {}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -906,6 +932,7 @@ mixed_left_join( * @param output_size_data An optional pair of values indicating the exact output size and the * number of matches for each row in the larger of the two input tables, left or right (may be * precomputed using the corresponding mixed_full_join_size API). + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct @@ -921,6 +948,7 @@ mixed_full_join( ast::expression const& binary_predicate, null_equality compare_nulls = null_equality::EQUAL, std::optional>> output_size_data = {}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -956,6 +984,7 @@ mixed_full_join( * @param right_conditional The right table used for the conditional join * @param binary_predicate The condition on which to join * @param compare_nulls Whether or not null values join to each other or not + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct @@ -968,6 +997,7 @@ std::unique_ptr> mixed_left_semi_join( table_view const& right_conditional, ast::expression const& binary_predicate, null_equality compare_nulls = null_equality::EQUAL, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -1004,6 +1034,7 @@ std::unique_ptr> mixed_left_semi_join( * @param right_conditional The right table used for the conditional join * @param binary_predicate The condition on which to join * @param compare_nulls Whether or not null values join to each other or not + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct @@ -1016,6 +1047,7 @@ std::unique_ptr> mixed_left_anti_join( table_view const& right_conditional, ast::expression const& binary_predicate, null_equality compare_nulls = null_equality::EQUAL, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -1041,6 +1073,7 @@ std::unique_ptr> mixed_left_anti_join( * @param right_conditional The right table used for the conditional join * @param binary_predicate The condition on which to join * @param compare_nulls Whether or not null values join to each other or not + * @param stream CUDA stream used for device memory operations and kernel launches * @param 
mr Device memory resource used to allocate the returned table and columns' device memory * * @return A pair containing the size that would result from performing the @@ -1056,6 +1089,7 @@ std::pair>> mixed_in table_view const& right_conditional, ast::expression const& binary_predicate, null_equality compare_nulls = null_equality::EQUAL, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -1081,6 +1115,7 @@ std::pair>> mixed_in * @param right_conditional The right table used for the conditional join * @param binary_predicate The condition on which to join * @param compare_nulls Whether or not null values join to each other or not + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return A pair containing the size that would result from performing the @@ -1096,6 +1131,7 @@ std::pair>> mixed_le table_view const& right_conditional, ast::expression const& binary_predicate, null_equality compare_nulls = null_equality::EQUAL, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -1111,6 +1147,7 @@ std::pair>> mixed_le * @param left The left table * @param right The right table * @param binary_predicate The condition on which to join + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return The size that would result from performing the requested join @@ -1119,6 +1156,7 @@ std::size_t conditional_inner_join_size( table_view const& left, table_view const& right, ast::expression const& binary_predicate, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -1134,6 +1172,7 @@ std::size_t conditional_inner_join_size( * @param left The left table * @param right The right table * @param binary_predicate The condition on which to join + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return The size that would result from performing the requested join @@ -1142,6 +1181,7 @@ std::size_t conditional_left_join_size( table_view const& left, table_view const& right, ast::expression const& binary_predicate, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -1157,6 +1197,7 @@ std::size_t conditional_left_join_size( * @param left The left table * @param right The right table * @param binary_predicate The condition on which to join + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return The size that would result from performing the requested join @@ -1165,6 +1206,7 @@ std::size_t conditional_left_semi_join_size( table_view const& left, table_view const& right, ast::expression const& binary_predicate, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -1180,6 +1222,7 @@ std::size_t conditional_left_semi_join_size( * 
@param left The left table * @param right The right table * @param binary_predicate The condition on which to join + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return The size that would result from performing the requested join @@ -1188,6 +1231,7 @@ std::size_t conditional_left_anti_join_size( table_view const& left, table_view const& right, ast::expression const& binary_predicate, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @} */ // end of group } // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/strings/regex/flags.hpp b/cpp/include/cudf/strings/regex/flags.hpp index f7108129dee..4f3fc7086f2 100644 --- a/cpp/include/cudf/strings/regex/flags.hpp +++ b/cpp/include/cudf/strings/regex/flags.hpp @@ -35,10 +35,11 @@ namespace strings { * and to match the Python flag values. */ enum regex_flags : uint32_t { - DEFAULT = 0, ///< default - MULTILINE = 8, ///< the '^' and '$' honor new-line characters - DOTALL = 16, ///< the '.' matching includes new-line characters - ASCII = 256 ///< use only ASCII when matching built-in character classes + DEFAULT = 0, ///< default + MULTILINE = 8, ///< the '^' and '$' honor new-line characters + DOTALL = 16, ///< the '.' matching includes new-line characters + ASCII = 256, ///< use only ASCII when matching built-in character classes + EXT_NEWLINE = 512 ///< new-line matches extended characters }; /** @@ -74,6 +75,17 @@ constexpr bool is_ascii(regex_flags const f) return (f & regex_flags::ASCII) == regex_flags::ASCII; } +/** + * @brief Returns true if the given flags contain EXT_NEWLINE + * + * @param f Regex flags to check + * @return true if `f` includes EXT_NEWLINE + */ +constexpr bool is_ext_newline(regex_flags const f) +{ + return (f & regex_flags::EXT_NEWLINE) == regex_flags::EXT_NEWLINE; +} + /** * @brief Capture groups setting * diff --git a/cpp/include/cudf/strings/string_view.cuh b/cpp/include/cudf/strings/string_view.cuh index abb26d7ccb4..14695c3bb27 100644 --- a/cpp/include/cudf/strings/string_view.cuh +++ b/cpp/include/cudf/strings/string_view.cuh @@ -191,9 +191,14 @@ __device__ inline string_view::const_iterator& string_view::const_iterator::oper __device__ inline string_view::const_iterator& string_view::const_iterator::operator--() { - if (byte_pos > 0) - while (strings::detail::bytes_in_utf8_byte(static_cast(p[--byte_pos])) == 0) - ; + if (byte_pos > 0) { + if (byte_pos == char_pos) { + --byte_pos; + } else { + while (strings::detail::bytes_in_utf8_byte(static_cast(p[--byte_pos])) == 0) + ; + } + } --char_pos; return *this; } diff --git a/cpp/include/cudf/timezone.hpp b/cpp/include/cudf/timezone.hpp index aa903770e26..f6de1056c24 100644 --- a/cpp/include/cudf/timezone.hpp +++ b/cpp/include/cudf/timezone.hpp @@ -15,9 +15,12 @@ */ #pragma once +#include #include #include +#include + #include #include #include @@ -43,6 +46,7 @@ static constexpr uint32_t solar_cycle_entry_count = 2 * solar_cycle_years; * * @param tzif_dir The directory where the TZif files are located * @param timezone_name standard timezone name (for example, "America/Los_Angeles") + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table's device memory. 
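Earlier in this patch the join APIs gain the same defaulted stream parameter; a sketch (illustrative only, not part of the patch; the wrapper name is made up):

// Illustrative wrapper, not part of this patch: cudf::inner_join now accepts
// an optional stream ahead of the memory resource.
#include <cudf/join.hpp>
#include <cudf/table/table_view.hpp>

#include <rmm/cuda_stream_view.hpp>

auto inner_join_on_stream(cudf::table_view const& left_keys,
                          cudf::table_view const& right_keys,
                          rmm::cuda_stream_view stream)
{
  // Returns the usual pair of gather-index vectors, computed on `stream`.
  return cudf::inner_join(left_keys, right_keys, cudf::null_equality::EQUAL, stream);
}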
* * @return The transition table for the given timezone @@ -50,6 +54,7 @@ static constexpr uint32_t solar_cycle_entry_count = 2 * solar_cycle_years; std::unique_ptr
make_timezone_transition_table( std::optional tzif_dir, std::string_view timezone_name, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); } // namespace CUDF_EXPORT cudf diff --git a/cpp/include/nvtext/minhash.hpp b/cpp/include/nvtext/minhash.hpp index c83a4260c19..7c909f1a948 100644 --- a/cpp/include/nvtext/minhash.hpp +++ b/cpp/include/nvtext/minhash.hpp @@ -17,6 +17,7 @@ #include #include +#include #include #include #include @@ -72,7 +73,7 @@ std::unique_ptr minhash( * * @throw std::invalid_argument if the width < 2 * @throw std::invalid_argument if seeds is empty - * @throw std::overflow_error if `seeds * input.size()` exceeds the column size limit + * @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit * * @param input Strings column to compute minhash * @param seeds Seed values used for the hash algorithm @@ -133,7 +134,7 @@ std::unique_ptr minhash64( * * @throw std::invalid_argument if the width < 2 * @throw std::invalid_argument if seeds is empty - * @throw std::overflow_error if `seeds * input.size()` exceeds the column size limit + * @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit * * @param input Strings column to compute minhash * @param seeds Seed values used for the hash algorithm @@ -150,5 +151,61 @@ std::unique_ptr minhash64( rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); +/** + * @brief Returns the minhash values for each row of strings per seed + * + * Hash values are computed from each string in each row and the + * minimum hash value is returned for each row for each seed. + * Each row of the output list column contains the seed results for the corresponding + * input row. The order of the elements in each row matches the order of + * the seeds provided in the `seeds` parameter. + * + * This function uses MurmurHash3_x86_32 for the hash algorithm. + * + * Any null row entries result in corresponding null output rows. + * + * @throw std::invalid_argument if seeds is empty + * @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit + * + * @param input Lists column of strings to compute minhash + * @param seeds Seed values used for the hash algorithm + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return List column of minhash values for each string per seed + */ +std::unique_ptr word_minhash( + cudf::lists_column_view const& input, + cudf::device_span seeds, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); + +/** + * @brief Returns the minhash values for each row of strings per seed + * + * Hash values are computed from each string in each row and the + * minimum hash value is returned for each row for each seed. + * Each row of the output list column contains the seed results for the corresponding + * input row. The order of the elements in each row matches the order of + * the seeds provided in the `seeds` parameter. + * + * This function uses MurmurHash3_x64_128 for the hash algorithm though + * only the first 64 bits of the hash are used in computing the output. + * + * Any null row entries result in corresponding null output rows.
+ * + * @throw std::invalid_argument if seeds is empty + * @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit + * + * @param input Lists column of strings to compute minhash + * @param seeds Seed values used for the hash algorithm + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return List column of minhash values for each string per seed + */ +std::unique_ptr word_minhash64( + cudf::lists_column_view const& input, + cudf::device_span seeds, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @} */ // end of group } // namespace CUDF_EXPORT nvtext diff --git a/cpp/src/datetime/datetime_ops.cu b/cpp/src/datetime/datetime_ops.cu index fd9a6b8f5fe..ddb0dbcd96d 100644 --- a/cpp/src/datetime/datetime_ops.cu +++ b/cpp/src/datetime/datetime_ops.cu @@ -580,142 +580,167 @@ std::unique_ptr extract_quarter(column_view const& column, std::unique_ptr ceil_datetimes(column_view const& column, rounding_frequency freq, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::round_general( - detail::rounding_function::CEIL, freq, column, cudf::get_default_stream(), mr); + return detail::round_general(detail::rounding_function::CEIL, freq, column, stream, mr); } std::unique_ptr floor_datetimes(column_view const& column, rounding_frequency freq, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::round_general( - detail::rounding_function::FLOOR, freq, column, cudf::get_default_stream(), mr); + return detail::round_general(detail::rounding_function::FLOOR, freq, column, stream, mr); } std::unique_ptr round_datetimes(column_view const& column, rounding_frequency freq, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::round_general( - detail::rounding_function::ROUND, freq, column, cudf::get_default_stream(), mr); + return detail::round_general(detail::rounding_function::ROUND, freq, column, stream, mr); } -std::unique_ptr extract_year(column_view const& column, rmm::device_async_resource_ref mr) +std::unique_ptr extract_year(column_view const& column, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::extract_year(column, cudf::get_default_stream(), mr); + return detail::extract_year(column, stream, mr); } -std::unique_ptr extract_month(column_view const& column, rmm::device_async_resource_ref mr) +std::unique_ptr extract_month(column_view const& column, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::extract_month(column, cudf::get_default_stream(), mr); + return detail::extract_month(column, stream, mr); } -std::unique_ptr extract_day(column_view const& column, rmm::device_async_resource_ref mr) +std::unique_ptr extract_day(column_view const& column, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::extract_day(column, cudf::get_default_stream(), mr); + return detail::extract_day(column, stream, mr); } std::unique_ptr extract_weekday(column_view const& column, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::extract_weekday(column, cudf::get_default_stream(), mr); + 
return detail::extract_weekday(column, stream, mr); } -std::unique_ptr extract_hour(column_view const& column, rmm::device_async_resource_ref mr) +std::unique_ptr extract_hour(column_view const& column, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::extract_hour(column, cudf::get_default_stream(), mr); + return detail::extract_hour(column, stream, mr); } -std::unique_ptr extract_minute(column_view const& column, rmm::device_async_resource_ref mr) +std::unique_ptr extract_minute(column_view const& column, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::extract_minute(column, cudf::get_default_stream(), mr); + return detail::extract_minute(column, stream, mr); } -std::unique_ptr extract_second(column_view const& column, rmm::device_async_resource_ref mr) +std::unique_ptr extract_second(column_view const& column, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::extract_second(column, cudf::get_default_stream(), mr); + return detail::extract_second(column, stream, mr); } std::unique_ptr extract_millisecond_fraction(column_view const& column, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::extract_millisecond_fraction(column, cudf::get_default_stream(), mr); + return detail::extract_millisecond_fraction(column, stream, mr); } std::unique_ptr extract_microsecond_fraction(column_view const& column, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::extract_microsecond_fraction(column, cudf::get_default_stream(), mr); + return detail::extract_microsecond_fraction(column, stream, mr); } std::unique_ptr extract_nanosecond_fraction(column_view const& column, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::extract_nanosecond_fraction(column, cudf::get_default_stream(), mr); + return detail::extract_nanosecond_fraction(column, stream, mr); } std::unique_ptr last_day_of_month(column_view const& column, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::last_day_of_month(column, cudf::get_default_stream(), mr); + return detail::last_day_of_month(column, stream, mr); } -std::unique_ptr day_of_year(column_view const& column, rmm::device_async_resource_ref mr) +std::unique_ptr day_of_year(column_view const& column, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::day_of_year(column, cudf::get_default_stream(), mr); + return detail::day_of_year(column, stream, mr); } std::unique_ptr add_calendrical_months(cudf::column_view const& timestamp_column, cudf::column_view const& months_column, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::add_calendrical_months( - timestamp_column, months_column, cudf::get_default_stream(), mr); + return detail::add_calendrical_months(timestamp_column, months_column, stream, mr); } std::unique_ptr add_calendrical_months(cudf::column_view const& timestamp_column, cudf::scalar const& months, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::add_calendrical_months(timestamp_column, months, cudf::get_default_stream(), mr); + return detail::add_calendrical_months(timestamp_column, months, stream, mr); } -std::unique_ptr 
is_leap_year(column_view const& column, rmm::device_async_resource_ref mr) +std::unique_ptr is_leap_year(column_view const& column, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::is_leap_year(column, cudf::get_default_stream(), mr); + return detail::is_leap_year(column, stream, mr); } -std::unique_ptr days_in_month(column_view const& column, rmm::device_async_resource_ref mr) +std::unique_ptr days_in_month(column_view const& column, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::days_in_month(column, cudf::get_default_stream(), mr); + return detail::days_in_month(column, stream, mr); } std::unique_ptr extract_quarter(column_view const& column, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::extract_quarter(column, cudf::get_default_stream(), mr); + return detail::extract_quarter(column, stream, mr); } } // namespace datetime diff --git a/cpp/src/datetime/timezone.cpp b/cpp/src/datetime/timezone.cpp index 6498a5e6c55..cf239297255 100644 --- a/cpp/src/datetime/timezone.cpp +++ b/cpp/src/datetime/timezone.cpp @@ -380,11 +380,11 @@ static int64_t get_transition_time(dst_transition_s const& trans, int year) std::unique_ptr
make_timezone_transition_table(std::optional tzif_dir, std::string_view timezone_name, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::make_timezone_transition_table( - tzif_dir, timezone_name, cudf::get_default_stream(), mr); + return detail::make_timezone_transition_table(tzif_dir, timezone_name, stream, mr); } namespace detail { diff --git a/cpp/src/io/json/host_tree_algorithms.cu b/cpp/src/io/json/host_tree_algorithms.cu new file mode 100644 index 00000000000..70d61132b42 --- /dev/null +++ b/cpp/src/io/json/host_tree_algorithms.cu @@ -0,0 +1,808 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "io/utilities/parsing_utils.cuh" +#include "io/utilities/string_parsing.hpp" +#include "nested_json.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace cudf::io::json::detail { + +/** + * @brief Get the column indices for the values column for array of arrays rows + * + * @param row_array_children_level The level of the row array's children + * @param d_tree The tree metadata + * @param col_ids The column ids + * @param num_columns The number of columns + * @param stream The stream to use + * @return The value columns' indices + */ +rmm::device_uvector get_values_column_indices(TreeDepthT const row_array_children_level, + tree_meta_t const& d_tree, + device_span col_ids, + size_type const num_columns, + rmm::cuda_stream_view stream) +{ + CUDF_FUNC_RANGE(); + auto [level2_nodes, level2_indices] = get_array_children_indices( + row_array_children_level, d_tree.node_levels, d_tree.parent_node_ids, stream); + auto col_id_location = thrust::make_permutation_iterator(col_ids.begin(), level2_nodes.begin()); + rmm::device_uvector values_column_indices(num_columns, stream); + thrust::scatter(rmm::exec_policy(stream), + level2_indices.begin(), + level2_indices.end(), + col_id_location, + values_column_indices.begin()); + return values_column_indices; +} + +/** + * @brief Copies strings specified by pairs of begin/end offsets to a host vector of strings.
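Pausing the new file briefly: the timezone and datetime hunks above complete the stream plumbing, so every public entry point now accepts a caller-supplied stream instead of hard-coding cudf::get_default_stream(). A small sketch of what that enables (the timestamps column is hypothetical):

```cpp
// Sketch: datetime extraction on a caller-owned, non-default CUDA stream,
// using the new defaulted stream parameters added above.
#include <cudf/column/column_view.hpp>
#include <cudf/datetime.hpp>
#include <rmm/cuda_stream.hpp>

void extract_parts(cudf::column_view const& timestamps)  // hypothetical input
{
  rmm::cuda_stream stream;  // non-default stream owned by the caller
  auto years  = cudf::datetime::extract_year(timestamps, stream.view());
  auto months = cudf::datetime::extract_month(timestamps, stream.view());
  stream.synchronize();  // results are safe to consume after this point
}
```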
+ * + * @param input String device buffer + * @param node_range_begin Begin offset of the strings + * @param node_range_end End offset of the strings + * @param stream CUDA stream + * @return Vector of strings + */ +std::vector copy_strings_to_host_sync( + device_span input, + device_span node_range_begin, + device_span node_range_end, + rmm::cuda_stream_view stream) +{ + CUDF_FUNC_RANGE(); + auto const num_strings = node_range_begin.size(); + rmm::device_uvector string_offsets(num_strings, stream); + rmm::device_uvector string_lengths(num_strings, stream); + auto d_offset_pairs = thrust::make_zip_iterator(node_range_begin.begin(), node_range_end.begin()); + thrust::transform(rmm::exec_policy(stream), + d_offset_pairs, + d_offset_pairs + num_strings, + thrust::make_zip_iterator(string_offsets.begin(), string_lengths.begin()), + [] __device__(auto const& offsets) { + // Note: first character for non-field columns + return thrust::make_tuple( + static_cast(thrust::get<0>(offsets)), + static_cast(thrust::get<1>(offsets) - thrust::get<0>(offsets))); + }); + + cudf::io::parse_options_view options_view{}; + options_view.quotechar = '\0'; // no quotes + options_view.keepquotes = true; + auto d_offset_length_it = + thrust::make_zip_iterator(string_offsets.begin(), string_lengths.begin()); + auto d_column_names = parse_data(input.data(), + d_offset_length_it, + num_strings, + data_type{type_id::STRING}, + rmm::device_buffer{}, + 0, + options_view, + stream, + cudf::get_current_device_resource_ref()); + auto to_host = [stream](auto const& col) { + if (col.is_empty()) return std::vector{}; + auto const scv = cudf::strings_column_view(col); + auto const h_chars = cudf::detail::make_host_vector_async( + cudf::device_span(scv.chars_begin(stream), scv.chars_size(stream)), stream); + auto const h_offsets = cudf::detail::make_host_vector_async( + cudf::device_span(scv.offsets().data() + scv.offset(), + scv.size() + 1), + stream); + stream.synchronize(); + + // build std::string vector from chars and offsets + std::vector host_data; + host_data.reserve(col.size()); + std::transform( + std::begin(h_offsets), + std::end(h_offsets) - 1, + std::begin(h_offsets) + 1, + std::back_inserter(host_data), + [&](auto start, auto end) { return std::string(h_chars.data() + start, end - start); }); + return host_data; + }; + return to_host(d_column_names->view()); +} + +/** + * @brief Checks whether all strings in each string column in the tree are nulls. + * For non-string columns, the flag is set to true. If any row in a string column is not the null + * literal, the flag is set to false. + * + * @param input Input JSON string device data + * @param d_column_tree Column tree representation of the JSON string + * @param tree Node tree representation of the JSON string + * @param col_ids Column ids of the nodes in the tree + * @param options Parsing options specifying the parsing behaviour + * @param stream CUDA stream used for device memory operations and kernel launches + * @return Array of bytes where each byte indicates whether the corresponding string column is all nulls.
*/ +rmm::device_uvector is_all_nulls_each_column(device_span input, + tree_meta_t const& d_column_tree, + tree_meta_t const& tree, + device_span col_ids, + cudf::io::json_reader_options const& options, + rmm::cuda_stream_view stream) +{ + auto const num_nodes = col_ids.size(); + auto const num_cols = d_column_tree.node_categories.size(); + rmm::device_uvector is_all_nulls(num_cols, stream); + thrust::fill(rmm::exec_policy(stream), is_all_nulls.begin(), is_all_nulls.end(), true); + + auto parse_opt = parsing_options(options, stream); + thrust::for_each_n( + rmm::exec_policy(stream), + thrust::counting_iterator(0), + num_nodes, + [options = parse_opt.view(), + data = input.data(), + column_categories = d_column_tree.node_categories.begin(), + col_ids = col_ids.begin(), + range_begin = tree.node_range_begin.begin(), + range_end = tree.node_range_end.begin(), + is_all_nulls = is_all_nulls.begin()] __device__(size_type i) { + auto const node_category = column_categories[col_ids[i]]; + if (node_category == NC_STR or node_category == NC_VAL) { + auto const is_null_literal = serialized_trie_contains( + options.trie_na, + {data + range_begin[i], static_cast(range_end[i] - range_begin[i])}); + if (!is_null_literal) is_all_nulls[col_ids[i]] = false; + } + }); + return is_all_nulls; +} + +NodeIndexT get_row_array_parent_col_id(device_span col_ids, + bool is_enabled_lines, + rmm::cuda_stream_view stream) +{ + NodeIndexT value = parent_node_sentinel; + if (!col_ids.empty()) { + auto const list_node_index = is_enabled_lines ? 0 : 1; + CUDF_CUDA_TRY(cudaMemcpyAsync(&value, + col_ids.data() + list_node_index, + sizeof(NodeIndexT), + cudaMemcpyDefault, + stream.value())); + stream.synchronize(); + } + return value; +} +/** + * @brief Holds member data pointers of `d_json_column` + * + */ +struct json_column_data { + using row_offset_t = json_column::row_offset_t; + row_offset_t* string_offsets; + row_offset_t* string_lengths; + row_offset_t* child_offsets; + bitmask_type* validity; +}; + +std::pair, + std::unordered_map>> +build_tree(device_json_column& root, + std::vector const& is_str_column_all_nulls, + tree_meta_t& d_column_tree, + device_span d_unique_col_ids, + device_span d_max_row_offsets, + std::vector const& column_names, + NodeIndexT row_array_parent_col_id, + bool is_array_of_arrays, + cudf::io::json_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); +void scatter_offsets( + tree_meta_t& tree, + device_span col_ids, + device_span row_offsets, + device_span node_ids, + device_span sorted_col_ids, // Reuse this for parent_col_ids + tree_meta_t& d_column_tree, + host_span ignore_vals, + std::unordered_map>& columns, + rmm::cuda_stream_view stream); + +/** + * @brief Constructs `d_json_column` from the node tree representation. + * Newly constructed columns are inserted into `root`'s children. + * `root` must be a list type.
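The device loop in is_all_nulls_each_column above reduces per node into a per-column flag; a host-side analogue of the same logic, with is_null_literal standing in for the serialized_trie_contains check, looks roughly like this:

```cpp
// Host analogue (sketch only): a column keeps its "all nulls" flag only
// while every string/value node mapped to it parses as the null literal.
// is_null_literal is a hypothetical stand-in for serialized_trie_contains.
std::vector<bool> is_all_nulls(num_cols, true);
for (std::size_t i = 0; i < num_nodes; ++i) {
  auto const category = column_categories[col_ids[i]];
  if (category == NC_STR or category == NC_VAL) {
    if (not is_null_literal(range_begin[i], range_end[i])) {
      is_all_nulls[col_ids[i]] = false;
    }
  }
}
```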
+ * + * @param input Input JSON string device data + * @param tree Node tree representation of the JSON string + * @param col_ids Column ids of the nodes in the tree + * @param row_offsets Row offsets of the nodes in the tree + * @param root Root node of the `d_json_column` tree + * @param is_array_of_arrays Whether the tree is an array of arrays + * @param options Parsing options specifying the parsing behaviour + * options affecting behaviour are + * is_enabled_lines: Whether the input is line-delimited JSON + * is_enabled_mixed_types_as_string: Whether to enable reading mixed types as string + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the device memory + * of child_offsets and validity members of `d_json_column` + */ +void make_device_json_column(device_span input, + tree_meta_t& tree, + device_span col_ids, + device_span row_offsets, + device_json_column& root, + bool is_array_of_arrays, + cudf::io::json_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + + bool const is_enabled_lines = options.is_enabled_lines(); + bool const is_enabled_mixed_types_as_string = options.is_enabled_mixed_types_as_string(); + auto const num_nodes = col_ids.size(); + rmm::device_uvector sorted_col_ids(col_ids.size(), stream); // make a copy + thrust::copy(rmm::exec_policy(stream), col_ids.begin(), col_ids.end(), sorted_col_ids.begin()); + + // sort by {col_id} on {node_ids} stable + rmm::device_uvector node_ids(col_ids.size(), stream); + thrust::sequence(rmm::exec_policy(stream), node_ids.begin(), node_ids.end()); + thrust::stable_sort_by_key( + rmm::exec_policy(stream), sorted_col_ids.begin(), sorted_col_ids.end(), node_ids.begin()); + + NodeIndexT const row_array_parent_col_id = + get_row_array_parent_col_id(col_ids, is_enabled_lines, stream); + + // 1. gather column information. + auto [d_column_tree, d_unique_col_ids, d_max_row_offsets] = + reduce_to_column_tree(tree, + col_ids, + sorted_col_ids, + node_ids, + row_offsets, + is_array_of_arrays, + row_array_parent_col_id, + stream); + auto num_columns = d_unique_col_ids.size(); + std::vector column_names = copy_strings_to_host_sync( + input, d_column_tree.node_range_begin, d_column_tree.node_range_end, stream); + // array of arrays column names + if (is_array_of_arrays) { + auto const unique_col_ids = cudf::detail::make_host_vector_async(d_unique_col_ids, stream); + auto const column_parent_ids = + cudf::detail::make_host_vector_async(d_column_tree.parent_node_ids, stream); + TreeDepthT const row_array_children_level = is_enabled_lines ? 1 : 2; + auto values_column_indices = + get_values_column_indices(row_array_children_level, tree, col_ids, num_columns, stream); + auto h_values_column_indices = + cudf::detail::make_host_vector_sync(values_column_indices, stream); + std::transform(unique_col_ids.begin(), + unique_col_ids.end(), + column_names.begin(), + column_names.begin(), + [&h_values_column_indices, &column_parent_ids, row_array_parent_col_id]( + auto col_id, auto name) mutable { + return column_parent_ids[col_id] == row_array_parent_col_id + ?
std::to_string(h_values_column_indices[col_id]) + : name; + }); + } + + auto const is_str_column_all_nulls = [&, &column_tree = d_column_tree]() { + if (is_enabled_mixed_types_as_string) { + return cudf::detail::make_std_vector_sync( + is_all_nulls_each_column(input, column_tree, tree, col_ids, options, stream), stream); + } + return std::vector(); + }(); + auto [ignore_vals, columns] = build_tree(root, + is_str_column_all_nulls, + d_column_tree, + d_unique_col_ids, + d_max_row_offsets, + column_names, + row_array_parent_col_id, + is_array_of_arrays, + options, + stream, + mr); + + scatter_offsets(tree, + col_ids, + row_offsets, + node_ids, + sorted_col_ids, + d_column_tree, + ignore_vals, + columns, + stream); +} + +std::pair, + std::unordered_map>> +build_tree(device_json_column& root, + std::vector const& is_str_column_all_nulls, + tree_meta_t& d_column_tree, + device_span d_unique_col_ids, + device_span d_max_row_offsets, + std::vector const& column_names, + NodeIndexT row_array_parent_col_id, + bool is_array_of_arrays, + cudf::io::json_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + bool const is_enabled_mixed_types_as_string = options.is_enabled_mixed_types_as_string(); + auto unique_col_ids = cudf::detail::make_host_vector_async(d_unique_col_ids, stream); + auto column_categories = + cudf::detail::make_host_vector_async(d_column_tree.node_categories, stream); + auto const column_parent_ids = + cudf::detail::make_host_vector_async(d_column_tree.parent_node_ids, stream); + auto column_range_beg = + cudf::detail::make_host_vector_async(d_column_tree.node_range_begin, stream); + auto const max_row_offsets = cudf::detail::make_host_vector_async(d_max_row_offsets, stream); + auto num_columns = d_unique_col_ids.size(); + + auto to_json_col_type = [](auto category) { + switch (category) { + case NC_STRUCT: return json_col_t::StructColumn; + case NC_LIST: return json_col_t::ListColumn; + case NC_STR: [[fallthrough]]; + case NC_VAL: return json_col_t::StringColumn; + default: return json_col_t::Unknown; + } + }; + auto init_to_zero = [stream](auto& v) { + thrust::uninitialized_fill(rmm::exec_policy_nosync(stream), v.begin(), v.end(), 0); + }; + + auto initialize_json_columns = [&](auto i, auto& col, auto column_category) { + if (column_category == NC_ERR || column_category == NC_FN) { + return; + } else if (column_category == NC_VAL || column_category == NC_STR) { + col.string_offsets.resize(max_row_offsets[i] + 1, stream); + col.string_lengths.resize(max_row_offsets[i] + 1, stream); + init_to_zero(col.string_offsets); + init_to_zero(col.string_lengths); + } else if (column_category == NC_LIST) { + col.child_offsets.resize(max_row_offsets[i] + 2, stream); + init_to_zero(col.child_offsets); + } + col.num_rows = max_row_offsets[i] + 1; + col.validity = + cudf::detail::create_null_mask(col.num_rows, cudf::mask_state::ALL_NULL, stream, mr); + col.type = to_json_col_type(column_category); + }; + + auto reinitialize_as_string = [&](auto i, auto& col) { + col.string_offsets.resize(max_row_offsets[i] + 1, stream); + col.string_lengths.resize(max_row_offsets[i] + 1, stream); + init_to_zero(col.string_offsets); + init_to_zero(col.string_lengths); + col.num_rows = max_row_offsets[i] + 1; + col.validity = + cudf::detail::create_null_mask(col.num_rows, cudf::mask_state::ALL_NULL, stream, mr); + col.type = json_col_t::StringColumn; + // destroy references of all child columns after this step, by calling remove_child_columns + }; + + path_from_tree 
tree_path{column_categories, + column_parent_ids, + column_names, + is_array_of_arrays, + row_array_parent_col_id}; + + // 2. generate nested columns tree and its device_memory + // reorder unique_col_ids w.r.t. column_range_begin for order of column to be in field order. + auto h_range_col_id_it = + thrust::make_zip_iterator(column_range_beg.begin(), unique_col_ids.begin()); + std::sort(h_range_col_id_it, h_range_col_id_it + num_columns, [](auto const& a, auto const& b) { + return thrust::get<0>(a) < thrust::get<0>(b); + }); + + // use hash map because we may skip field name's col_ids + std::unordered_map> columns; + // map{parent_col_id, child_col_name}> = child_col_id, used for null value column tracking + std::map, NodeIndexT> mapped_columns; + // find column_ids which are values, but should be ignored in validity + auto ignore_vals = cudf::detail::make_host_vector(num_columns, stream); + std::vector is_mixed_type_column(num_columns, 0); + std::vector is_pruned(num_columns, 0); + // for columns that are not mixed type but have been forced as string + std::vector forced_as_string_column(num_columns); + columns.try_emplace(parent_node_sentinel, std::ref(root)); + + std::function remove_child_columns = + [&](NodeIndexT this_col_id, device_json_column& col) { + for (auto col_name : col.column_order) { + auto child_id = mapped_columns[{this_col_id, col_name}]; + is_mixed_type_column[child_id] = 1; + remove_child_columns(child_id, col.child_columns.at(col_name)); + mapped_columns.erase({this_col_id, col_name}); + columns.erase(child_id); + } + col.child_columns.clear(); // their references are deleted above. + col.column_order.clear(); + }; + + auto name_and_parent_index = [&is_array_of_arrays, + &row_array_parent_col_id, + &column_parent_ids, + &column_categories, + &column_names](auto this_col_id) { + std::string name = ""; + auto parent_col_id = column_parent_ids[this_col_id]; + if (parent_col_id == parent_node_sentinel || column_categories[parent_col_id] == NC_LIST) { + if (is_array_of_arrays && parent_col_id == row_array_parent_col_id) { + name = column_names[this_col_id]; + } else { + name = list_child_name; + } + } else if (column_categories[parent_col_id] == NC_FN) { + auto field_name_col_id = parent_col_id; + parent_col_id = column_parent_ids[parent_col_id]; + name = column_names[field_name_col_id]; + } else { + CUDF_FAIL("Unexpected parent column category"); + } + return std::pair{name, parent_col_id}; + }; + + // Prune columns that are not required to be parsed. + if (options.is_enabled_prune_columns()) { + for (auto const this_col_id : unique_col_ids) { + if (column_categories[this_col_id] == NC_ERR || column_categories[this_col_id] == NC_FN) { + continue; + } + // Struct, List, String, Value + auto [name, parent_col_id] = name_and_parent_index(this_col_id); + // get path of this column, and get its dtype if present in options + auto const nt = tree_path.get_path(this_col_id); + std::optional const user_dtype = get_path_data_type(nt, options); + if (!user_dtype.has_value() and parent_col_id != parent_node_sentinel) { + is_pruned[this_col_id] = 1; + continue; + } else { + // make sure all its parents are not pruned. + while (parent_col_id != parent_node_sentinel and is_pruned[parent_col_id] == 1) { + is_pruned[parent_col_id] = 0; + parent_col_id = column_parent_ids[parent_col_id]; + } + } + } + } + + // Build the column tree, also, handles mixed types. 
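The loop below resolves conflicts when two nodes share the same (parent_col_id, name) key but disagree on type. Roughly, with every helper here a hypothetical stand-in for the inline logic that follows:

```cpp
// Sketch of the conflict-resolution policy implemented inline below.
if (mixed_types_as_string and is_mixed(existing, incoming)) {
  force_to_string(existing);  // a struct/list is reinitialized as a string column
  drop_children(existing);    // its already-inserted children are ignored later
} else if (is_value_like(incoming)) {
  ignore(incoming);           // an existing nested column wins over a value
} else if (is_value_like(existing)) {
  replace_with(existing, incoming);  // a nested column replaces a value column
} else {
  require_same_nesting(existing, incoming);  // mixing list and struct is an error
}
```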
+ for (auto const this_col_id : unique_col_ids) { + if (column_categories[this_col_id] == NC_ERR || column_categories[this_col_id] == NC_FN) { + continue; + } + // Struct, List, String, Value + auto [name, parent_col_id] = name_and_parent_index(this_col_id); + + // if parent is mixed type column or this column is pruned or if parent + // has been forced as string, ignore this column. + if (parent_col_id != parent_node_sentinel && + (is_mixed_type_column[parent_col_id] || is_pruned[this_col_id]) || + forced_as_string_column[parent_col_id]) { + ignore_vals[this_col_id] = 1; + if (is_mixed_type_column[parent_col_id]) { is_mixed_type_column[this_col_id] = 1; } + if (forced_as_string_column[parent_col_id]) { forced_as_string_column[this_col_id] = true; } + continue; + } + + // If the child is already found, + // replace if this column is a nested column and the existing was a value column + // ignore this column if this column is a value column and the existing was a nested column + auto it = columns.find(parent_col_id); + CUDF_EXPECTS(it != columns.end(), "Parent column not found"); + auto& parent_col = it->second.get(); + bool replaced = false; + if (mapped_columns.count({parent_col_id, name}) > 0) { + auto const old_col_id = mapped_columns[{parent_col_id, name}]; + // If mixed type as string is enabled, make both of them strings and merge them. + // All child columns will be ignored when parsing. + if (is_enabled_mixed_types_as_string) { + bool const is_mixed_type = [&]() { + // If new or old is STR and they are all not null, make it mixed type, else ignore. + if (column_categories[this_col_id] == NC_VAL || + column_categories[this_col_id] == NC_STR) { + if (is_str_column_all_nulls[this_col_id]) return false; + } + if (column_categories[old_col_id] == NC_VAL || column_categories[old_col_id] == NC_STR) { + if (is_str_column_all_nulls[old_col_id]) return false; + } + return true; + }(); + if (is_mixed_type) { + is_mixed_type_column[this_col_id] = 1; + is_mixed_type_column[old_col_id] = 1; + // if old col type (not cat) is list or struct, replace with string. + auto& col = columns.at(old_col_id).get(); + if (col.type == json_col_t::ListColumn or col.type == json_col_t::StructColumn) { + reinitialize_as_string(old_col_id, col); + remove_child_columns(old_col_id, col); + // all its children (which are already inserted) are ignored later. 
+ } + col.forced_as_string_column = true; + columns.try_emplace(this_col_id, columns.at(old_col_id)); + continue; + } + } + + if (column_categories[this_col_id] == NC_VAL || column_categories[this_col_id] == NC_STR) { + ignore_vals[this_col_id] = 1; + continue; + } + if (column_categories[old_col_id] == NC_VAL || column_categories[old_col_id] == NC_STR) { + // remap + ignore_vals[old_col_id] = 1; + mapped_columns.erase({parent_col_id, name}); + columns.erase(old_col_id); + parent_col.child_columns.erase(name); + replaced = true; // to skip duplicate name in column_order + } else { + // If this is a nested column but we're trying to insert either (a) a list node into a + // struct column or (b) a struct node into a list column, we fail + CUDF_EXPECTS(not((column_categories[old_col_id] == NC_LIST and + column_categories[this_col_id] == NC_STRUCT) or + (column_categories[old_col_id] == NC_STRUCT and + column_categories[this_col_id] == NC_LIST)), + "A mix of lists and structs within the same column is not supported"); + } + } + + auto this_column_category = column_categories[this_col_id]; + // get path of this column, check if it is a struct/list forced as string, and enforce it + auto const nt = tree_path.get_path(this_col_id); + std::optional const user_dtype = get_path_data_type(nt, options); + if ((column_categories[this_col_id] == NC_STRUCT or + column_categories[this_col_id] == NC_LIST) and + user_dtype.has_value() and user_dtype.value().id() == type_id::STRING) { + this_column_category = NC_STR; + } + + CUDF_EXPECTS(parent_col.child_columns.count(name) == 0, "duplicate column name: " + name); + // move into parent + device_json_column col(stream, mr); + initialize_json_columns(this_col_id, col, this_column_category); + if ((column_categories[this_col_id] == NC_STRUCT or + column_categories[this_col_id] == NC_LIST) and + user_dtype.has_value() and user_dtype.value().id() == type_id::STRING) { + col.forced_as_string_column = true; + forced_as_string_column[this_col_id] = true; + } + + auto inserted = parent_col.child_columns.try_emplace(name, std::move(col)).second; + CUDF_EXPECTS(inserted, "child column insertion failed, duplicate column name in the parent"); + if (not replaced) parent_col.column_order.push_back(name); + columns.try_emplace(this_col_id, std::ref(parent_col.child_columns.at(name))); + mapped_columns.try_emplace(std::make_pair(parent_col_id, name), this_col_id); + } + + if (is_enabled_mixed_types_as_string) { + // ignore all children of mixed type columns + for (auto const this_col_id : unique_col_ids) { + auto parent_col_id = column_parent_ids[this_col_id]; + if (parent_col_id != parent_node_sentinel and is_mixed_type_column[parent_col_id] == 1) { + is_mixed_type_column[this_col_id] = 1; + ignore_vals[this_col_id] = 1; + columns.erase(this_col_id); + } + // Convert only mixed type columns as string (so to copy), but not its children + if (parent_col_id != parent_node_sentinel and is_mixed_type_column[parent_col_id] == 0 and + is_mixed_type_column[this_col_id] == 1) + column_categories[this_col_id] = NC_STR; + } + cudf::detail::cuda_memcpy_async(d_column_tree.node_categories.begin(), + column_categories.data(), + column_categories.size() * sizeof(column_categories[0]), + cudf::detail::host_memory_kind::PAGEABLE, + stream); + } + + // ignore all children of columns forced as string + for (auto const this_col_id : unique_col_ids) { + auto parent_col_id = column_parent_ids[this_col_id]; + if (parent_col_id != parent_node_sentinel and forced_as_string_column[parent_col_id]) { 
+ forced_as_string_column[this_col_id] = true; + ignore_vals[this_col_id] = 1; + } + // Convert only mixed type columns as string (so to copy), but not its children + if (parent_col_id != parent_node_sentinel and not forced_as_string_column[parent_col_id] and + forced_as_string_column[this_col_id]) + column_categories[this_col_id] = NC_STR; + } + cudf::detail::cuda_memcpy_async(d_column_tree.node_categories.begin(), + column_categories.data(), + column_categories.size() * sizeof(column_categories[0]), + cudf::detail::host_memory_kind::PAGEABLE, + stream); + + // restore unique_col_ids order + std::sort(h_range_col_id_it, h_range_col_id_it + num_columns, [](auto const& a, auto const& b) { + return thrust::get<1>(a) < thrust::get<1>(b); + }); + return {ignore_vals, columns}; +} + +void scatter_offsets( + tree_meta_t& tree, + device_span col_ids, + device_span row_offsets, + device_span node_ids, + device_span sorted_col_ids, // Reuse this for parent_col_ids + tree_meta_t& d_column_tree, + host_span ignore_vals, + std::unordered_map>& columns, + rmm::cuda_stream_view stream) +{ + auto const num_nodes = col_ids.size(); + auto const num_columns = d_column_tree.node_categories.size(); + // move columns data to device. + auto columns_data = cudf::detail::make_host_vector(num_columns, stream); + for (auto& [col_id, col_ref] : columns) { + if (col_id == parent_node_sentinel) continue; + auto& col = col_ref.get(); + columns_data[col_id] = json_column_data{col.string_offsets.data(), + col.string_lengths.data(), + col.child_offsets.data(), + static_cast(col.validity.data())}; + } + + auto d_ignore_vals = cudf::detail::make_device_uvector_async( + ignore_vals, stream, cudf::get_current_device_resource_ref()); + auto d_columns_data = cudf::detail::make_device_uvector_async( + columns_data, stream, cudf::get_current_device_resource_ref()); + + // 3. scatter string offsets to respective columns, set validity bits + thrust::for_each_n( + rmm::exec_policy(stream), + thrust::counting_iterator(0), + num_nodes, + [column_categories = d_column_tree.node_categories.begin(), + col_ids = col_ids.begin(), + row_offsets = row_offsets.begin(), + range_begin = tree.node_range_begin.begin(), + range_end = tree.node_range_end.begin(), + d_ignore_vals = d_ignore_vals.begin(), + d_columns_data = d_columns_data.begin()] __device__(size_type i) { + if (d_ignore_vals[col_ids[i]]) return; + auto const node_category = column_categories[col_ids[i]]; + switch (node_category) { + case NC_STRUCT: set_bit(d_columns_data[col_ids[i]].validity, row_offsets[i]); break; + case NC_LIST: set_bit(d_columns_data[col_ids[i]].validity, row_offsets[i]); break; + case NC_STR: [[fallthrough]]; + case NC_VAL: + if (d_ignore_vals[col_ids[i]]) break; + set_bit(d_columns_data[col_ids[i]].validity, row_offsets[i]); + d_columns_data[col_ids[i]].string_offsets[row_offsets[i]] = range_begin[i]; + d_columns_data[col_ids[i]].string_lengths[row_offsets[i]] = range_end[i] - range_begin[i]; + break; + default: break; + } + }); + + // 4. scatter List offset + // copy_if only node's whose parent is list, (node_id, parent_col_id) + // stable_sort by parent_col_id of {node_id}. + // For all unique parent_node_id of (i==0, i-1!=i), write start offset. + // (i==last, i+1!=i), write end offset. 
+ // unique_copy_by_key {parent_node_id} {row_offset} to + // col[parent_col_id].child_offsets[row_offset[parent_node_id]] + + auto& parent_col_ids = sorted_col_ids; // reuse sorted_col_ids + auto parent_col_id = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + cuda::proclaim_return_type( + [col_ids = col_ids.begin(), + parent_node_ids = tree.parent_node_ids.begin()] __device__(size_type node_id) { + return parent_node_ids[node_id] == parent_node_sentinel ? parent_node_sentinel + : col_ids[parent_node_ids[node_id]]; + })); + auto const list_children_end = thrust::copy_if( + rmm::exec_policy(stream), + thrust::make_zip_iterator(thrust::make_counting_iterator(0), parent_col_id), + thrust::make_zip_iterator(thrust::make_counting_iterator(0), parent_col_id) + + num_nodes, + thrust::make_counting_iterator(0), + thrust::make_zip_iterator(node_ids.begin(), parent_col_ids.begin()), + [d_ignore_vals = d_ignore_vals.begin(), + parent_node_ids = tree.parent_node_ids.begin(), + column_categories = d_column_tree.node_categories.begin(), + col_ids = col_ids.begin()] __device__(size_type node_id) { + auto parent_node_id = parent_node_ids[node_id]; + return parent_node_id != parent_node_sentinel and + column_categories[col_ids[parent_node_id]] == NC_LIST and + (!d_ignore_vals[col_ids[parent_node_id]]); + }); + + auto const num_list_children = + list_children_end - thrust::make_zip_iterator(node_ids.begin(), parent_col_ids.begin()); + thrust::stable_sort_by_key(rmm::exec_policy(stream), + parent_col_ids.begin(), + parent_col_ids.begin() + num_list_children, + node_ids.begin()); + thrust::for_each_n( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + num_list_children, + [node_ids = node_ids.begin(), + parent_node_ids = tree.parent_node_ids.begin(), + parent_col_ids = parent_col_ids.begin(), + row_offsets = row_offsets.begin(), + d_columns_data = d_columns_data.begin(), + num_list_children] __device__(size_type i) { + auto const node_id = node_ids[i]; + auto const parent_node_id = parent_node_ids[node_id]; + // scatter to list_offset + if (i == 0 or parent_node_ids[node_ids[i - 1]] != parent_node_id) { + d_columns_data[parent_col_ids[i]].child_offsets[row_offsets[parent_node_id]] = + row_offsets[node_id]; + } + // last value of list child_offset is its size. + if (i == num_list_children - 1 or parent_node_ids[node_ids[i + 1]] != parent_node_id) { + d_columns_data[parent_col_ids[i]].child_offsets[row_offsets[parent_node_id] + 1] = + row_offsets[node_id] + 1; + } + }); + + // 5. scan on offsets. 
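The scan in step 5 uses thrust::maximum rather than addition: rows that were never written (null or ignored entries) still hold zero, and carrying the running maximum forward turns them into zero-length entries with valid offsets. A tiny host sketch of the same idea:

```cpp
// Sketch: an inclusive max-scan densifies sparse offsets.
#include <algorithm>
#include <cstdio>
#include <numeric>
#include <vector>

int main()
{
  std::vector<int> offsets{0, 5, 0, 0, 9, 12};  // zeros mark unwritten rows
  std::inclusive_scan(offsets.begin(), offsets.end(), offsets.begin(),
                      [](int a, int b) { return std::max(a, b); });
  for (int o : offsets) { std::printf("%d ", o); }  // prints: 0 5 5 5 9 12
  return 0;
}
```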
+ for (auto& [id, col_ref] : columns) { + auto& col = col_ref.get(); + if (col.type == json_col_t::StringColumn) { + thrust::inclusive_scan(rmm::exec_policy_nosync(stream), + col.string_offsets.begin(), + col.string_offsets.end(), + col.string_offsets.begin(), + thrust::maximum{}); + } else if (col.type == json_col_t::ListColumn) { + thrust::inclusive_scan(rmm::exec_policy_nosync(stream), + col.child_offsets.begin(), + col.child_offsets.end(), + col.child_offsets.begin(), + thrust::maximum{}); + } + } + stream.synchronize(); +} + +} // namespace cudf::io::json::detail diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index 8890c786287..b08fd139113 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -23,7 +23,7 @@ #include #include #include -#include +#include #include #include #include @@ -35,23 +35,16 @@ #include #include -#include #include #include #include -#include #include #include #include #include -#include -#include #include #include -#include -#include - namespace cudf::io::json::detail { // DEBUG prints @@ -296,651 +289,6 @@ reduce_to_column_tree(tree_meta_t& tree, std::move(max_row_offsets)}; } -/** - * @brief Get the column indices for the values column for array of arrays rows - * - * @param row_array_children_level The level of the row array's children - * @param d_tree The tree metadata - * @param col_ids The column ids - * @param num_columns The number of columns - * @param stream The stream to use - * @return The value columns' indices - */ -rmm::device_uvector get_values_column_indices(TreeDepthT const row_array_children_level, - tree_meta_t const& d_tree, - device_span col_ids, - size_type const num_columns, - rmm::cuda_stream_view stream) -{ - CUDF_FUNC_RANGE(); - auto [level2_nodes, level2_indices] = get_array_children_indices( - row_array_children_level, d_tree.node_levels, d_tree.parent_node_ids, stream); - auto col_id_location = thrust::make_permutation_iterator(col_ids.begin(), level2_nodes.begin()); - rmm::device_uvector values_column_indices(num_columns, stream); - thrust::scatter(rmm::exec_policy(stream), - level2_indices.begin(), - level2_indices.end(), - col_id_location, - values_column_indices.begin()); - return values_column_indices; -} - -/** - * @brief Copies strings specified by pair of begin, end offsets to host vector of strings. 
- * - * @param input String device buffer - * @param node_range_begin Begin offset of the strings - * @param node_range_end End offset of the strings - * @param stream CUDA stream - * @return Vector of strings - */ -std::vector copy_strings_to_host_sync( - device_span input, - device_span node_range_begin, - device_span node_range_end, - rmm::cuda_stream_view stream) -{ - CUDF_FUNC_RANGE(); - auto const num_strings = node_range_begin.size(); - rmm::device_uvector string_offsets(num_strings, stream); - rmm::device_uvector string_lengths(num_strings, stream); - auto d_offset_pairs = thrust::make_zip_iterator(node_range_begin.begin(), node_range_end.begin()); - thrust::transform(rmm::exec_policy(stream), - d_offset_pairs, - d_offset_pairs + num_strings, - thrust::make_zip_iterator(string_offsets.begin(), string_lengths.begin()), - [] __device__(auto const& offsets) { - // Note: first character for non-field columns - return thrust::make_tuple( - static_cast(thrust::get<0>(offsets)), - static_cast(thrust::get<1>(offsets) - thrust::get<0>(offsets))); - }); - - cudf::io::parse_options_view options_view{}; - options_view.quotechar = '\0'; // no quotes - options_view.keepquotes = true; - auto d_offset_length_it = - thrust::make_zip_iterator(string_offsets.begin(), string_lengths.begin()); - auto d_column_names = parse_data(input.data(), - d_offset_length_it, - num_strings, - data_type{type_id::STRING}, - rmm::device_buffer{}, - 0, - options_view, - stream, - cudf::get_current_device_resource_ref()); - auto to_host = [stream](auto const& col) { - if (col.is_empty()) return std::vector{}; - auto const scv = cudf::strings_column_view(col); - auto const h_chars = cudf::detail::make_host_vector_async( - cudf::device_span(scv.chars_begin(stream), scv.chars_size(stream)), stream); - auto const h_offsets = cudf::detail::make_host_vector_async( - cudf::device_span(scv.offsets().data() + scv.offset(), - scv.size() + 1), - stream); - stream.synchronize(); - - // build std::string vector from chars and offsets - std::vector host_data; - host_data.reserve(col.size()); - std::transform( - std::begin(h_offsets), - std::end(h_offsets) - 1, - std::begin(h_offsets) + 1, - std::back_inserter(host_data), - [&](auto start, auto end) { return std::string(h_chars.data() + start, end - start); }); - return host_data; - }; - return to_host(d_column_names->view()); -} - -/** - * @brief Checks if all strings in each string column in the tree are nulls. - * For non-string columns, it's set as true. If any of rows in a string column is false, it's set as - * false. - * - * @param input Input JSON string device data - * @param d_column_tree column tree representation of JSON string - * @param tree Node tree representation of the JSON string - * @param col_ids Column ids of the nodes in the tree - * @param options Parsing options specifying the parsing behaviour - * @param stream CUDA stream used for device memory operations and kernel launches - * @return Array of bytes where each byte indicate if it is all nulls string column. 
- */ -rmm::device_uvector is_all_nulls_each_column(device_span input, - tree_meta_t const& d_column_tree, - tree_meta_t const& tree, - device_span col_ids, - cudf::io::json_reader_options const& options, - rmm::cuda_stream_view stream) -{ - auto const num_nodes = col_ids.size(); - auto const num_cols = d_column_tree.node_categories.size(); - rmm::device_uvector is_all_nulls(num_cols, stream); - thrust::fill(rmm::exec_policy(stream), is_all_nulls.begin(), is_all_nulls.end(), true); - - auto parse_opt = parsing_options(options, stream); - thrust::for_each_n( - rmm::exec_policy(stream), - thrust::counting_iterator(0), - num_nodes, - [options = parse_opt.view(), - data = input.data(), - column_categories = d_column_tree.node_categories.begin(), - col_ids = col_ids.begin(), - range_begin = tree.node_range_begin.begin(), - range_end = tree.node_range_end.begin(), - is_all_nulls = is_all_nulls.begin()] __device__(size_type i) { - auto const node_category = column_categories[col_ids[i]]; - if (node_category == NC_STR or node_category == NC_VAL) { - auto const is_null_literal = serialized_trie_contains( - options.trie_na, - {data + range_begin[i], static_cast(range_end[i] - range_begin[i])}); - if (!is_null_literal) is_all_nulls[col_ids[i]] = false; - } - }); - return is_all_nulls; -} - -/** - * @brief Holds member data pointers of `d_json_column` - * - */ -struct json_column_data { - using row_offset_t = json_column::row_offset_t; - row_offset_t* string_offsets; - row_offset_t* string_lengths; - row_offset_t* child_offsets; - bitmask_type* validity; -}; - -/** - * @brief Constructs `d_json_column` from node tree representation - * Newly constructed columns are insert into `root`'s children. - * `root` must be a list type. - * - * @param input Input JSON string device data - * @param tree Node tree representation of the JSON string - * @param col_ids Column ids of the nodes in the tree - * @param row_offsets Row offsets of the nodes in the tree - * @param root Root node of the `d_json_column` tree - * @param is_array_of_arrays Whether the tree is an array of arrays - * @param options Parsing options specifying the parsing behaviour - * options affecting behaviour are - * is_enabled_lines: Whether the input is a line-delimited JSON - * is_enabled_mixed_types_as_string: Whether to enable reading mixed types as string - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource used to allocate the device memory - * of child_offets and validity members of `d_json_column` - */ -void make_device_json_column(device_span input, - tree_meta_t& tree, - device_span col_ids, - device_span row_offsets, - device_json_column& root, - bool is_array_of_arrays, - cudf::io::json_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - CUDF_FUNC_RANGE(); - - bool const is_enabled_lines = options.is_enabled_lines(); - bool const is_enabled_mixed_types_as_string = options.is_enabled_mixed_types_as_string(); - auto const num_nodes = col_ids.size(); - rmm::device_uvector sorted_col_ids(col_ids.size(), stream); // make a copy - thrust::copy(rmm::exec_policy(stream), col_ids.begin(), col_ids.end(), sorted_col_ids.begin()); - - // sort by {col_id} on {node_ids} stable - rmm::device_uvector node_ids(col_ids.size(), stream); - thrust::sequence(rmm::exec_policy(stream), node_ids.begin(), node_ids.end()); - thrust::stable_sort_by_key( - rmm::exec_policy(stream), sorted_col_ids.begin(), sorted_col_ids.end(), 
node_ids.begin()); - - NodeIndexT const row_array_parent_col_id = [&]() { - NodeIndexT value = parent_node_sentinel; - if (!col_ids.empty()) { - auto const list_node_index = is_enabled_lines ? 0 : 1; - CUDF_CUDA_TRY(cudaMemcpyAsync(&value, - col_ids.data() + list_node_index, - sizeof(NodeIndexT), - cudaMemcpyDefault, - stream.value())); - stream.synchronize(); - } - return value; - }(); - - // 1. gather column information. - auto [d_column_tree, d_unique_col_ids, d_max_row_offsets] = - reduce_to_column_tree(tree, - col_ids, - sorted_col_ids, - node_ids, - row_offsets, - is_array_of_arrays, - row_array_parent_col_id, - stream); - auto num_columns = d_unique_col_ids.size(); - auto unique_col_ids = cudf::detail::make_host_vector_async(d_unique_col_ids, stream); - auto column_categories = - cudf::detail::make_host_vector_async(d_column_tree.node_categories, stream); - auto const column_parent_ids = - cudf::detail::make_host_vector_async(d_column_tree.parent_node_ids, stream); - auto column_range_beg = - cudf::detail::make_host_vector_async(d_column_tree.node_range_begin, stream); - auto const max_row_offsets = cudf::detail::make_host_vector_async(d_max_row_offsets, stream); - std::vector column_names = copy_strings_to_host_sync( - input, d_column_tree.node_range_begin, d_column_tree.node_range_end, stream); - // array of arrays column names - if (is_array_of_arrays) { - TreeDepthT const row_array_children_level = is_enabled_lines ? 1 : 2; - auto values_column_indices = - get_values_column_indices(row_array_children_level, tree, col_ids, num_columns, stream); - auto h_values_column_indices = - cudf::detail::make_host_vector_sync(values_column_indices, stream); - std::transform(unique_col_ids.begin(), - unique_col_ids.end(), - column_names.begin(), - column_names.begin(), - [&h_values_column_indices, &column_parent_ids, row_array_parent_col_id]( - auto col_id, auto name) mutable { - return column_parent_ids[col_id] == row_array_parent_col_id - ? 
std::to_string(h_values_column_indices[col_id]) - : name; - }); - } - - auto to_json_col_type = [](auto category) { - switch (category) { - case NC_STRUCT: return json_col_t::StructColumn; - case NC_LIST: return json_col_t::ListColumn; - case NC_STR: [[fallthrough]]; - case NC_VAL: return json_col_t::StringColumn; - default: return json_col_t::Unknown; - } - }; - auto init_to_zero = [stream](auto& v) { - thrust::uninitialized_fill(rmm::exec_policy_nosync(stream), v.begin(), v.end(), 0); - }; - - auto initialize_json_columns = [&](auto i, auto& col, auto column_category) { - if (column_category == NC_ERR || column_category == NC_FN) { - return; - } else if (column_category == NC_VAL || column_category == NC_STR) { - col.string_offsets.resize(max_row_offsets[i] + 1, stream); - col.string_lengths.resize(max_row_offsets[i] + 1, stream); - init_to_zero(col.string_offsets); - init_to_zero(col.string_lengths); - } else if (column_category == NC_LIST) { - col.child_offsets.resize(max_row_offsets[i] + 2, stream); - init_to_zero(col.child_offsets); - } - col.num_rows = max_row_offsets[i] + 1; - col.validity = - cudf::detail::create_null_mask(col.num_rows, cudf::mask_state::ALL_NULL, stream, mr); - col.type = to_json_col_type(column_category); - }; - - auto reinitialize_as_string = [&](auto i, auto& col) { - col.string_offsets.resize(max_row_offsets[i] + 1, stream); - col.string_lengths.resize(max_row_offsets[i] + 1, stream); - init_to_zero(col.string_offsets); - init_to_zero(col.string_lengths); - col.num_rows = max_row_offsets[i] + 1; - col.validity = - cudf::detail::create_null_mask(col.num_rows, cudf::mask_state::ALL_NULL, stream, mr); - col.type = json_col_t::StringColumn; - // destroy references of all child columns after this step, by calling remove_child_columns - }; - - path_from_tree tree_path{column_categories, - column_parent_ids, - column_names, - is_array_of_arrays, - row_array_parent_col_id}; - - // 2. generate nested columns tree and its device_memory - // reorder unique_col_ids w.r.t. column_range_begin for order of column to be in field order. 
- auto h_range_col_id_it = - thrust::make_zip_iterator(column_range_beg.begin(), unique_col_ids.begin()); - std::sort(h_range_col_id_it, h_range_col_id_it + num_columns, [](auto const& a, auto const& b) { - return thrust::get<0>(a) < thrust::get<0>(b); - }); - - auto const is_str_column_all_nulls = [&, &column_tree = d_column_tree]() { - if (is_enabled_mixed_types_as_string) { - return cudf::detail::make_host_vector_sync( - is_all_nulls_each_column(input, column_tree, tree, col_ids, options, stream), stream); - } - return cudf::detail::make_empty_host_vector(0, stream); - }(); - - // use hash map because we may skip field name's col_ids - std::unordered_map> columns; - // map{parent_col_id, child_col_name}> = child_col_id, used for null value column tracking - std::map, NodeIndexT> mapped_columns; - // find column_ids which are values, but should be ignored in validity - auto ignore_vals = cudf::detail::make_host_vector(num_columns, stream); - std::vector is_mixed_type_column(num_columns, 0); - std::vector is_pruned(num_columns, 0); - columns.try_emplace(parent_node_sentinel, std::ref(root)); - - std::function remove_child_columns = - [&](NodeIndexT this_col_id, device_json_column& col) { - for (auto col_name : col.column_order) { - auto child_id = mapped_columns[{this_col_id, col_name}]; - is_mixed_type_column[child_id] = 1; - remove_child_columns(child_id, col.child_columns.at(col_name)); - mapped_columns.erase({this_col_id, col_name}); - columns.erase(child_id); - } - col.child_columns.clear(); // their references are deleted above. - col.column_order.clear(); - }; - - auto name_and_parent_index = [&is_array_of_arrays, - &row_array_parent_col_id, - &column_parent_ids, - &column_categories, - &column_names](auto this_col_id) { - std::string name = ""; - auto parent_col_id = column_parent_ids[this_col_id]; - if (parent_col_id == parent_node_sentinel || column_categories[parent_col_id] == NC_LIST) { - if (is_array_of_arrays && parent_col_id == row_array_parent_col_id) { - name = column_names[this_col_id]; - } else { - name = list_child_name; - } - } else if (column_categories[parent_col_id] == NC_FN) { - auto field_name_col_id = parent_col_id; - parent_col_id = column_parent_ids[parent_col_id]; - name = column_names[field_name_col_id]; - } else { - CUDF_FAIL("Unexpected parent column category"); - } - return std::pair{name, parent_col_id}; - }; - - // Prune columns that are not required to be parsed. - if (options.is_enabled_prune_columns()) { - for (auto const this_col_id : unique_col_ids) { - if (column_categories[this_col_id] == NC_ERR || column_categories[this_col_id] == NC_FN) { - continue; - } - // Struct, List, String, Value - auto [name, parent_col_id] = name_and_parent_index(this_col_id); - // get path of this column, and get its dtype if present in options - auto const nt = tree_path.get_path(this_col_id); - std::optional const user_dtype = get_path_data_type(nt, options); - if (!user_dtype.has_value() and parent_col_id != parent_node_sentinel) { - is_pruned[this_col_id] = 1; - continue; - } else { - // make sure all its parents are not pruned. - while (parent_col_id != parent_node_sentinel and is_pruned[parent_col_id] == 1) { - is_pruned[parent_col_id] = 0; - parent_col_id = column_parent_ids[parent_col_id]; - } - } - } - } - - // Build the column tree, also, handles mixed types. 
- for (auto const this_col_id : unique_col_ids) { - if (column_categories[this_col_id] == NC_ERR || column_categories[this_col_id] == NC_FN) { - continue; - } - // Struct, List, String, Value - auto [name, parent_col_id] = name_and_parent_index(this_col_id); - - // if parent is mixed type column or this column is pruned, ignore this column. - if (parent_col_id != parent_node_sentinel && - (is_mixed_type_column[parent_col_id] || is_pruned[this_col_id])) { - ignore_vals[this_col_id] = 1; - if (is_mixed_type_column[parent_col_id]) { is_mixed_type_column[this_col_id] = 1; } - continue; - } - - // If the child is already found, - // replace if this column is a nested column and the existing was a value column - // ignore this column if this column is a value column and the existing was a nested column - auto it = columns.find(parent_col_id); - CUDF_EXPECTS(it != columns.end(), "Parent column not found"); - auto& parent_col = it->second.get(); - bool replaced = false; - if (mapped_columns.count({parent_col_id, name}) > 0) { - auto const old_col_id = mapped_columns[{parent_col_id, name}]; - // If mixed type as string is enabled, make both of them strings and merge them. - // All child columns will be ignored when parsing. - if (is_enabled_mixed_types_as_string) { - bool const is_mixed_type = [&]() { - // If new or old is STR and they are all not null, make it mixed type, else ignore. - if (column_categories[this_col_id] == NC_VAL || - column_categories[this_col_id] == NC_STR) { - if (is_str_column_all_nulls[this_col_id]) return false; - } - if (column_categories[old_col_id] == NC_VAL || column_categories[old_col_id] == NC_STR) { - if (is_str_column_all_nulls[old_col_id]) return false; - } - return true; - }(); - if (is_mixed_type) { - is_mixed_type_column[this_col_id] = 1; - is_mixed_type_column[old_col_id] = 1; - // if old col type (not cat) is list or struct, replace with string. - auto& col = columns.at(old_col_id).get(); - if (col.type == json_col_t::ListColumn or col.type == json_col_t::StructColumn) { - reinitialize_as_string(old_col_id, col); - remove_child_columns(old_col_id, col); - // all its children (which are already inserted) are ignored later. 
- } - col.forced_as_string_column = true; - columns.try_emplace(this_col_id, columns.at(old_col_id)); - continue; - } - } - - if (column_categories[this_col_id] == NC_VAL || column_categories[this_col_id] == NC_STR) { - ignore_vals[this_col_id] = 1; - continue; - } - if (column_categories[old_col_id] == NC_VAL || column_categories[old_col_id] == NC_STR) { - // remap - ignore_vals[old_col_id] = 1; - mapped_columns.erase({parent_col_id, name}); - columns.erase(old_col_id); - parent_col.child_columns.erase(name); - replaced = true; // to skip duplicate name in column_order - } else { - // If this is a nested column but we're trying to insert either (a) a list node into a - // struct column or (b) a struct node into a list column, we fail - CUDF_EXPECTS(not((column_categories[old_col_id] == NC_LIST and - column_categories[this_col_id] == NC_STRUCT) or - (column_categories[old_col_id] == NC_STRUCT and - column_categories[this_col_id] == NC_LIST)), - "A mix of lists and structs within the same column is not supported"); - } - } - - auto this_column_category = column_categories[this_col_id]; - if (is_enabled_mixed_types_as_string) { - // get path of this column, check if it is a struct/list forced as string, and enforce it - auto const nt = tree_path.get_path(this_col_id); - std::optional const user_dtype = get_path_data_type(nt, options); - if ((column_categories[this_col_id] == NC_STRUCT or - column_categories[this_col_id] == NC_LIST) and - user_dtype.has_value() and user_dtype.value().id() == type_id::STRING) { - is_mixed_type_column[this_col_id] = 1; - this_column_category = NC_STR; - } - } - - CUDF_EXPECTS(parent_col.child_columns.count(name) == 0, "duplicate column name: " + name); - // move into parent - device_json_column col(stream, mr); - initialize_json_columns(this_col_id, col, this_column_category); - auto inserted = parent_col.child_columns.try_emplace(name, std::move(col)).second; - CUDF_EXPECTS(inserted, "child column insertion failed, duplicate column name in the parent"); - if (not replaced) parent_col.column_order.push_back(name); - columns.try_emplace(this_col_id, std::ref(parent_col.child_columns.at(name))); - mapped_columns.try_emplace(std::make_pair(parent_col_id, name), this_col_id); - } - - if (is_enabled_mixed_types_as_string) { - // ignore all children of mixed type columns - for (auto const this_col_id : unique_col_ids) { - auto parent_col_id = column_parent_ids[this_col_id]; - if (parent_col_id != parent_node_sentinel and is_mixed_type_column[parent_col_id] == 1) { - is_mixed_type_column[this_col_id] = 1; - ignore_vals[this_col_id] = 1; - columns.erase(this_col_id); - } - // Convert only mixed type columns as string (so to copy), but not its children - if (parent_col_id != parent_node_sentinel and is_mixed_type_column[parent_col_id] == 0 and - is_mixed_type_column[this_col_id] == 1) - column_categories[this_col_id] = NC_STR; - } - cudaMemcpyAsync(d_column_tree.node_categories.begin(), - column_categories.data(), - column_categories.size() * sizeof(column_categories[0]), - cudaMemcpyDefault, - stream.value()); - } - - // restore unique_col_ids order - std::sort(h_range_col_id_it, h_range_col_id_it + num_columns, [](auto const& a, auto const& b) { - return thrust::get<1>(a) < thrust::get<1>(b); - }); - // move columns data to device. 
-  auto columns_data = cudf::detail::make_host_vector<json_column_data>(num_columns, stream);
-  for (auto& [col_id, col_ref] : columns) {
-    if (col_id == parent_node_sentinel) continue;
-    auto& col = col_ref.get();
-    columns_data[col_id] = json_column_data{col.string_offsets.data(),
-                                            col.string_lengths.data(),
-                                            col.child_offsets.data(),
-                                            static_cast<bitmask_type*>(col.validity.data())};
-  }
-
-  auto d_ignore_vals = cudf::detail::make_device_uvector_async(
-    ignore_vals, stream, cudf::get_current_device_resource_ref());
-  auto d_columns_data = cudf::detail::make_device_uvector_async(
-    columns_data, stream, cudf::get_current_device_resource_ref());
-
-  // 3. scatter string offsets to respective columns, set validity bits
-  thrust::for_each_n(
-    rmm::exec_policy(stream),
-    thrust::counting_iterator<size_type>(0),
-    num_nodes,
-    [column_categories = d_column_tree.node_categories.begin(),
-     col_ids           = col_ids.begin(),
-     row_offsets       = row_offsets.begin(),
-     range_begin       = tree.node_range_begin.begin(),
-     range_end         = tree.node_range_end.begin(),
-     d_ignore_vals     = d_ignore_vals.begin(),
-     d_columns_data    = d_columns_data.begin()] __device__(size_type i) {
-      if (d_ignore_vals[col_ids[i]]) return;
-      auto const node_category = column_categories[col_ids[i]];
-      switch (node_category) {
-        case NC_STRUCT: set_bit(d_columns_data[col_ids[i]].validity, row_offsets[i]); break;
-        case NC_LIST: set_bit(d_columns_data[col_ids[i]].validity, row_offsets[i]); break;
-        case NC_STR: [[fallthrough]];
-        case NC_VAL:
-          if (d_ignore_vals[col_ids[i]]) break;
-          set_bit(d_columns_data[col_ids[i]].validity, row_offsets[i]);
-          d_columns_data[col_ids[i]].string_offsets[row_offsets[i]] = range_begin[i];
-          d_columns_data[col_ids[i]].string_lengths[row_offsets[i]] = range_end[i] - range_begin[i];
-          break;
-        default: break;
-      }
-    });
-
-  // 4. scatter List offset
-  //   copy_if only node's whose parent is list, (node_id, parent_col_id)
-  //   stable_sort by parent_col_id of {node_id}.
-  //   For all unique parent_node_id of (i==0, i-1!=i), write start offset.
-  //                                    (i==last, i+1!=i), write end offset.
-  //   unique_copy_by_key {parent_node_id} {row_offset} to
-  //   col[parent_col_id].child_offsets[row_offset[parent_node_id]]
-
-  auto& parent_col_ids = sorted_col_ids;  // reuse sorted_col_ids
-  auto parent_col_id   = thrust::make_transform_iterator(
-    thrust::make_counting_iterator<size_type>(0),
-    cuda::proclaim_return_type<NodeIndexT>(
-      [col_ids         = col_ids.begin(),
-       parent_node_ids = tree.parent_node_ids.begin()] __device__(size_type node_id) {
-        return parent_node_ids[node_id] == parent_node_sentinel
-                 ? parent_node_sentinel
-                 : col_ids[parent_node_ids[node_id]];
-      }));
-  auto const list_children_end = thrust::copy_if(
-    rmm::exec_policy(stream),
-    thrust::make_zip_iterator(thrust::make_counting_iterator<size_type>(0), parent_col_id),
-    thrust::make_zip_iterator(thrust::make_counting_iterator<size_type>(0), parent_col_id) +
-      num_nodes,
-    thrust::make_counting_iterator<size_type>(0),
-    thrust::make_zip_iterator(node_ids.begin(), parent_col_ids.begin()),
-    [d_ignore_vals     = d_ignore_vals.begin(),
-     parent_node_ids   = tree.parent_node_ids.begin(),
-     column_categories = d_column_tree.node_categories.begin(),
-     col_ids           = col_ids.begin()] __device__(size_type node_id) {
-      auto parent_node_id = parent_node_ids[node_id];
-      return parent_node_id != parent_node_sentinel and
-             column_categories[col_ids[parent_node_id]] == NC_LIST and
-             (!d_ignore_vals[col_ids[parent_node_id]]);
-    });
-
-  auto const num_list_children =
-    list_children_end - thrust::make_zip_iterator(node_ids.begin(), parent_col_ids.begin());
-  thrust::stable_sort_by_key(rmm::exec_policy(stream),
-                             parent_col_ids.begin(),
-                             parent_col_ids.begin() + num_list_children,
-                             node_ids.begin());
-  thrust::for_each_n(
-    rmm::exec_policy(stream),
-    thrust::make_counting_iterator<size_type>(0),
-    num_list_children,
-    [node_ids        = node_ids.begin(),
-     parent_node_ids = tree.parent_node_ids.begin(),
-     parent_col_ids  = parent_col_ids.begin(),
-     row_offsets     = row_offsets.begin(),
-     d_columns_data  = d_columns_data.begin(),
-     num_list_children] __device__(size_type i) {
-      auto const node_id        = node_ids[i];
-      auto const parent_node_id = parent_node_ids[node_id];
-      // scatter to list_offset
-      if (i == 0 or parent_node_ids[node_ids[i - 1]] != parent_node_id) {
-        d_columns_data[parent_col_ids[i]].child_offsets[row_offsets[parent_node_id]] =
-          row_offsets[node_id];
-      }
-      // last value of list child_offset is its size.
-      if (i == num_list_children - 1 or parent_node_ids[node_ids[i + 1]] != parent_node_id) {
-        d_columns_data[parent_col_ids[i]].child_offsets[row_offsets[parent_node_id] + 1] =
-          row_offsets[node_id] + 1;
-      }
-    });
-
-  // 5. scan on offsets.
-  for (auto& [id, col_ref] : columns) {
-    auto& col = col_ref.get();
-    if (col.type == json_col_t::StringColumn) {
-      thrust::inclusive_scan(rmm::exec_policy_nosync(stream),
-                             col.string_offsets.begin(),
-                             col.string_offsets.end(),
-                             col.string_offsets.begin(),
-                             thrust::maximum{});
-    } else if (col.type == json_col_t::ListColumn) {
-      thrust::inclusive_scan(rmm::exec_policy_nosync(stream),
-                             col.child_offsets.begin(),
-                             col.child_offsets.end(),
-                             col.child_offsets.begin(),
-                             thrust::maximum{});
-    }
-  }
-  stream.synchronize();
-}
-
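A note on step 5 of the block removed above: string and list offsets are scattered only at rows that actually hold a value, and the inclusive scan with `thrust::maximum` then propagates the last seen offset through the gaps left by null rows, yielding a monotonically non-decreasing offsets array in which null rows become zero-length entries. A minimal host-side sketch of that idea in plain C++ (the vector contents here are made up; cudf runs the equivalent scan on device with thrust):

```cpp
// Max-scan turns sparsely scattered offsets into a complete offsets column.
#include <algorithm>
#include <cstdio>
#include <numeric>
#include <vector>

int main()
{
  // Offsets were scattered for rows 0, 2 and 4; rows 1 and 3 are null and still hold 0.
  std::vector<int> offsets{5, 0, 11, 0, 20};
  std::inclusive_scan(
    offsets.begin(), offsets.end(), offsets.begin(), [](int a, int b) { return std::max(a, b); });
  for (auto o : offsets) { std::printf("%d ", o); }  // prints: 5 5 11 11 20
  return 0;
}
```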
 std::pair<std::unique_ptr<column>, std::vector<column_name_info>> device_json_column_to_cudf_column(
   device_json_column& json_col,
   device_span<SymbolT const> d_input,
@@ -982,39 +330,58 @@ std::pair<std::unique_ptr<column>, std::vector<column_name_info>> device_json_co
                "string offset, string length mismatch");
   rmm::device_uvector d_string_data(col_size, stream);
   // TODO how about directly storing pair in json_column?
-      auto offset_length_it =
-        thrust::make_zip_iterator(json_col.string_offsets.begin(), json_col.string_lengths.begin());
-      data_type target_type{};
+      auto [result_bitmask, null_count] = make_validity(json_col);
 
-      if (schema.has_value()) {
+      data_type target_type{};
+      std::unique_ptr<column> col{};
+      if (options.normalize_whitespace && json_col.forced_as_string_column) {
+        CUDF_EXPECTS(prune_columns || options.mixed_types_as_string,
+                     "Whitespace normalization of nested columns requested as string requires "
+                     "either prune_columns or mixed_types_as_string to be enabled");
+        auto [normalized_d_input, col_offsets, col_lengths] =
+          cudf::io::json::detail::normalize_whitespace(
+            d_input, json_col.string_offsets, json_col.string_lengths, stream, mr);
+        auto offset_length_it =
+          thrust::make_zip_iterator(col_offsets.begin(), col_lengths.begin());
+        target_type = data_type{type_id::STRING};
+        // Convert strings to the inferred data type
+        col = parse_data(normalized_d_input.data(),
+                         offset_length_it,
+                         col_size,
+                         target_type,
+                         std::move(result_bitmask),
+                         null_count,
+                         options.view(),
+                         stream,
+                         mr);
+      } else {
+        auto offset_length_it = thrust::make_zip_iterator(json_col.string_offsets.begin(),
+                                                          json_col.string_lengths.begin());
+        if (schema.has_value()) {
 #ifdef NJP_DEBUG_PRINT
-        std::cout << "-> explicit type: "
-                  << (schema.has_value() ? std::to_string(static_cast<int>(schema->type.id()))
-                                         : "n/a");
+          std::cout << "-> explicit type: "
+                    << (schema.has_value() ? std::to_string(static_cast<int>(schema->type.id()))
                                            : "n/a");
#endif
-        target_type = schema.value().type;
-      } else if (json_col.forced_as_string_column) {
-        target_type = data_type{type_id::STRING};
-      }
-      // Infer column type, if we don't have an explicit type for it
-      else {
-        target_type = cudf::io::detail::infer_data_type(
-          options.json_view(), d_input, offset_length_it, col_size, stream);
+          target_type = schema.value().type;
+        }
+        // Infer column type, if we don't have an explicit type for it
+        else {
+          target_type = cudf::io::detail::infer_data_type(
+            options.json_view(), d_input, offset_length_it, col_size, stream);
+        }
+        // Convert strings to the inferred data type
+        col = parse_data(d_input.data(),
+                         offset_length_it,
+                         col_size,
+                         target_type,
+                         std::move(result_bitmask),
+                         null_count,
+                         options.view(),
+                         stream,
+                         mr);
       }
-      auto [result_bitmask, null_count] = make_validity(json_col);
-      // Convert strings to the inferred data type
-      auto col = parse_data(d_input.data(),
-                            offset_length_it,
-                            col_size,
-                            target_type,
-                            std::move(result_bitmask),
-                            null_count,
-                            options.view(),
-                            stream,
-                            mr);
-
       // Reset nullable if we do not have nulls
       // This is to match the existing JSON reader's behaviour:
       // - Non-string columns will always be returned as nullable
@@ -1120,11 +487,15 @@ table_with_metadata device_parse_nested_json(device_span<SymbolT const> d_input,
     const auto [tokens_gpu, token_indices_gpu] =
       get_token_stream(d_input, options, stream, cudf::get_current_device_resource_ref());
     // gpu tree generation
-    return get_tree_representation(tokens_gpu,
-                                   token_indices_gpu,
-                                   options.is_enabled_mixed_types_as_string(),
-                                   stream,
-                                   cudf::get_current_device_resource_ref());
+    // Note that to normalize whitespaces in nested columns coerced to be string, we need the column
+    // to either be of mixed type or we need to request the column to be returned as string by
+    // pruning it with the STRING dtype
+    return get_tree_representation(
+      tokens_gpu,
+      token_indices_gpu,
+      options.is_enabled_mixed_types_as_string() || options.is_enabled_prune_columns(),
+      stream,
+ cudf::get_current_device_resource_ref()); }(); // IILE used to free memory of token data. #ifdef NJP_DEBUG_PRINT auto h_input = cudf::detail::make_host_vector_async(d_input, stream); diff --git a/cpp/src/io/json/json_normalization.cu b/cpp/src/io/json/json_normalization.cu index 97d5884fef1..2d435dc8e1a 100644 --- a/cpp/src/io/json/json_normalization.cu +++ b/cpp/src/io/json/json_normalization.cu @@ -17,6 +17,7 @@ #include "io/fst/lookup_tables.cuh" #include +#include #include #include #include @@ -25,8 +26,17 @@ #include #include #include - +#include + +#include +#include +#include +#include +#include +#include #include +#include +#include #include #include @@ -215,14 +225,6 @@ std::array, NUM_SYMBOL_GROUPS - 1> const wna_sgs{ * | state is necessary to process escaped double-quote characters. Without this * | state, whitespaces following escaped double quotes inside strings may be removed. * - * NOTE: An important case NOT handled by this FST is that of whitespace following newline - * characters within a string. Consider the following example - * Input: {"a":"x\n y"} - * FST output: {"a":"x\ny"} - * Expected output: {"a":"x\n y"} - * Such strings are not part of the JSON standard (characters allowed within quotes should - * have ASCII at least 0x20 i.e. space character and above) but may be encountered while - * reading JSON files */ enum class dfa_states : StateT { TT_OOS = 0U, TT_DQS, TT_DEC, TT_NUM_STATES }; // Aliases for readability of the transition table @@ -255,17 +257,17 @@ struct TransduceToNormalizedWS { // Let the alphabet set be Sigma // --------------------------------------- // ---------- NON-SPECIAL CASES: ---------- - // Output symbol same as input symbol + // Input symbol translates to output symbol // state | read_symbol -> output_symbol - // DQS | Sigma -> Sigma - // OOS | Sigma\{,\t} -> Sigma\{,\t} - // DEC | Sigma -> Sigma + // DQS | Sigma -> + // OOS | Sigma\{,\t} -> + // DEC | Sigma -> // ---------- SPECIAL CASES: -------------- - // Input symbol translates to output symbol - // OOS | {} -> - // OOS | {\t} -> + // Output symbol same as input symbol + // OOS | {} -> {} + // OOS | {\t} -> {\t} - // Case when read symbol is a space or tab but is unquoted + // Case when read symbol is not an unquoted space or tab // This will be the same condition as in `operator()(state_id, match_id, read_symbol)` function // However, since there is no output in this case i.e. the count returned by // operator()(state_id, match_id, read_symbol) is zero, this function is never called. 
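Review note on the transducer change in the hunk that follows: the FST no longer emits the normalized string itself; it now emits the indices of unquoted whitespace characters, which are deleted in a later pass. A rough host-side model of the set it computes is sketched below in plain C++. This is a simplification: the function name is made up, and real escape handling goes through the dedicated escaped-double-quote DFA state rather than a one-character lookbehind.

```cpp
#include <cstddef>
#include <cstdio>
#include <string>
#include <vector>

// Returns indices of spaces/tabs outside double-quoted sections -- roughly the
// set of indices the whitespace-normalization FST emits on the GPU.
std::vector<std::size_t> unquoted_ws_indices(std::string const& json)
{
  std::vector<std::size_t> out;
  bool in_string = false;
  for (std::size_t i = 0; i < json.size(); ++i) {
    char const c = json[i];
    // naive escape check; the FST uses a dedicated state for escaped quotes
    if (c == '"' && (i == 0 || json[i - 1] != '\\')) { in_string = !in_string; }
    if (!in_string && (c == ' ' || c == '\t')) { out.push_back(i); }
  }
  return out;
}

int main()
{
  for (auto i : unquoted_ws_indices(R"({ "a" : "x y" })")) { std::printf("%zu ", i); }
  // prints: 1 5 7 13 -- the space inside "x y" is kept
  return 0;
}
```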
@@ -287,8 +289,8 @@ struct TransduceToNormalizedWS {
                                    SymbolT const read_symbol) const
   {
     // Case when read symbol is a space or tab but is unquoted
-    if (match_id == static_cast<SymbolGroupT>(dfa_symbol_group_id::WHITESPACE_SYMBOLS) &&
-        state_id == static_cast<StateT>(dfa_states::TT_OOS)) {
+    if (!(match_id == static_cast<SymbolGroupT>(dfa_symbol_group_id::WHITESPACE_SYMBOLS) &&
+          state_id == static_cast<StateT>(dfa_states::TT_OOS))) {
       return 0;
     }
     return 1;
@@ -328,33 +330,126 @@ void normalize_single_quotes(datasource::owning_buffer<rmm::device_buffer>& inda
   std::swap(indata, outdata);
 }
 
-void normalize_whitespace(datasource::owning_buffer<rmm::device_buffer>& indata,
-                          rmm::cuda_stream_view stream,
-                          rmm::device_async_resource_ref mr)
+std::
+  tuple<rmm::device_uvector<char>, rmm::device_uvector<size_type>, rmm::device_uvector<size_type>>
+  normalize_whitespace(device_span<char const> d_input,
+                       device_span<size_type const> col_offsets,
+                       device_span<size_type const> col_lengths,
+                       rmm::cuda_stream_view stream,
+                       rmm::device_async_resource_ref mr)
 {
-  CUDF_FUNC_RANGE();
-  static constexpr std::int32_t min_out = 0;
-  static constexpr std::int32_t max_out = 2;
+  /*
+   * Algorithm:
+   1. Create a single buffer by concatenating the rows of the string column. Create segment offsets
+   and lengths array for concatenated buffer
+   2. Run a whitespace normalization FST that performs NOP for non-whitespace and quoted
+   whitespace characters, and outputs indices of unquoted whitespace characters
+   3. Update segment lengths based on the number of output indices between segment offsets
+   4. Remove characters at output indices from concatenated buffer.
+   5. Return updated buffer, segment lengths and updated segment offsets
+   */
+  auto inbuf_lengths = cudf::detail::make_device_uvector_async(
+    col_lengths, stream, cudf::get_current_device_resource_ref());
+  size_t inbuf_lengths_size = inbuf_lengths.size();
+  size_type inbuf_size =
+    thrust::reduce(rmm::exec_policy_nosync(stream), inbuf_lengths.begin(), inbuf_lengths.end());
+  rmm::device_uvector<char> inbuf(inbuf_size, stream);
+  rmm::device_uvector<size_type> inbuf_offsets(inbuf_lengths_size, stream);
+  thrust::exclusive_scan(rmm::exec_policy_nosync(stream),
+                         inbuf_lengths.begin(),
+                         inbuf_lengths.end(),
+                         inbuf_offsets.begin(),
+                         0);
+
+  auto input_it = thrust::make_transform_iterator(
+    thrust::make_counting_iterator(0),
+    cuda::proclaim_return_type<char const*>(
+      [d_input = d_input.begin(), col_offsets = col_offsets.begin()] __device__(
+        size_t i) -> char const* { return &d_input[col_offsets[i]]; }));
+  auto output_it = thrust::make_transform_iterator(
+    thrust::make_counting_iterator(0),
+    cuda::proclaim_return_type<char*>(
+      [inbuf = inbuf.begin(), inbuf_offsets = inbuf_offsets.cbegin()] __device__(
+        size_t i) -> char* { return &inbuf[inbuf_offsets[i]]; }));
+
+  {
+    // cub device batched copy
+    size_t temp_storage_bytes = 0;
+    cub::DeviceCopy::Batched(nullptr,
+                             temp_storage_bytes,
+                             input_it,
+                             output_it,
+                             inbuf_lengths.begin(),
+                             inbuf_lengths_size,
+                             stream.value());
+    rmm::device_buffer temp_storage(temp_storage_bytes, stream);
+    cub::DeviceCopy::Batched(temp_storage.data(),
+                             temp_storage_bytes,
+                             input_it,
+                             output_it,
+                             inbuf_lengths.begin(),
+                             inbuf_lengths_size,
+                             stream.value());
+  }
+
+  // whitespace normalization : get the indices of the unquoted whitespace characters
   auto parser =
     fst::detail::make_fst(fst::detail::make_symbol_group_lut(normalize_whitespace::wna_sgs),
                           fst::detail::make_transition_table(normalize_whitespace::wna_state_tt),
-                          fst::detail::make_translation_functor<SymbolT, min_out, max_out>(
+                          fst::detail::make_translation_functor<SymbolT, 0, 2>(
                             normalize_whitespace::TransduceToNormalizedWS{}),
                           stream);
 
-  rmm::device_buffer outbuf(indata.size(), stream, mr);
-  rmm::device_scalar<SymbolOffsetT> outbuf_size(stream, mr);
-  parser.Transduce(reinterpret_cast<SymbolT const*>(indata.data()),
-                   static_cast<SymbolOffsetT>(indata.size()),
-                   static_cast<SymbolT*>(outbuf.data()),
+  rmm::device_uvector<size_type> outbuf_indices(inbuf.size(), stream, mr);
+  rmm::device_scalar<SymbolOffsetT> outbuf_indices_size(stream, mr);
+  parser.Transduce(inbuf.data(),
+                   static_cast<SymbolOffsetT>(inbuf.size()),
                    thrust::make_discard_iterator(),
-                   outbuf_size.data(),
+                   outbuf_indices.data(),
+                   outbuf_indices_size.data(),
                    normalize_whitespace::start_state,
                    stream);
-  outbuf.resize(outbuf_size.value(stream), stream);
-  datasource::owning_buffer<rmm::device_buffer> outdata(std::move(outbuf));
-  std::swap(indata, outdata);
+  auto const num_deletions = outbuf_indices_size.value(stream);
+  outbuf_indices.resize(num_deletions, stream);
+
+  // now these indices need to be removed
+  // TODO: is there a better way to do this?
+  thrust::for_each(
+    rmm::exec_policy_nosync(stream),
+    outbuf_indices.begin(),
+    outbuf_indices.end(),
+    [inbuf_offsets_begin = inbuf_offsets.begin(),
+     inbuf_offsets_end   = inbuf_offsets.end(),
+     inbuf_lengths       = inbuf_lengths.begin()] __device__(size_type idx) {
+      auto it  = thrust::upper_bound(thrust::seq, inbuf_offsets_begin, inbuf_offsets_end, idx);
+      auto pos = thrust::distance(inbuf_offsets_begin, it) - 1;
+      cuda::atomic_ref<size_type, cuda::thread_scope_device> ref{*(inbuf_lengths + pos)};
+      ref.fetch_add(-1, cuda::std::memory_order_relaxed);
+    });
+
+  auto stencil = cudf::detail::make_zeroed_device_uvector_async<bool>(
+    static_cast<std::size_t>(inbuf_size), stream, cudf::get_current_device_resource_ref());
+  thrust::scatter(rmm::exec_policy_nosync(stream),
+                  thrust::make_constant_iterator(true),
+                  thrust::make_constant_iterator(true) + num_deletions,
+                  outbuf_indices.begin(),
+                  stencil.begin());
+  thrust::remove_if(rmm::exec_policy_nosync(stream),
+                    inbuf.begin(),
+                    inbuf.end(),
+                    stencil.begin(),
+                    thrust::identity());
+  inbuf.resize(inbuf_size - num_deletions, stream);
+
+  thrust::exclusive_scan(rmm::exec_policy_nosync(stream),
+                         inbuf_lengths.begin(),
+                         inbuf_lengths.end(),
+                         inbuf_offsets.begin(),
+                         0);
+
+  stream.synchronize();
+  return std::tuple{std::move(inbuf), std::move(inbuf_offsets), std::move(inbuf_lengths)};
 }
 
 }  // namespace detail
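Since the new `normalize_whitespace` pipeline above packs several device passes together, here is a host-side sketch of steps 3-5 on toy data. It is plain C++ with made-up inputs and is illustrative only; the device version does the same bookkeeping with atomics, `thrust::scatter` and `thrust::remove_if`.

```cpp
#include <algorithm>
#include <cstdio>
#include <iterator>
#include <string>
#include <vector>

int main()
{
  // Concatenated buffer of two segments: "a b" and "c d" (offsets 0 and 3).
  std::string buf = "a bc d";
  std::vector<int> offsets{0, 3};
  std::vector<int> lengths{3, 3};
  std::vector<int> ws_indices{1, 4};  // indices the FST marked for deletion

  // Step 3: each deleted index decrements the length of the segment that owns it,
  // found with upper_bound on the segment offsets (mirrors the atomic fetch_add(-1)).
  for (int idx : ws_indices) {
    auto it = std::upper_bound(offsets.begin(), offsets.end(), idx);
    lengths[std::distance(offsets.begin(), it) - 1] -= 1;
  }

  // Step 4: stencil out the deleted characters (mirrors scatter + remove_if).
  std::vector<bool> stencil(buf.size(), false);
  for (int idx : ws_indices) { stencil[idx] = true; }
  std::string out;
  for (std::size_t i = 0; i < buf.size(); ++i) {
    if (!stencil[i]) { out.push_back(buf[i]); }
  }

  // Step 5: rebuild offsets with an exclusive scan over the new lengths.
  std::vector<int> new_offsets(lengths.size());
  int acc = 0;
  for (std::size_t i = 0; i < lengths.size(); ++i) {
    new_offsets[i] = acc;
    acc += lengths[i];
  }

  std::printf("'%s' lengths=[%d,%d] offsets=[%d,%d]\n",
              out.c_str(), lengths[0], lengths[1], new_offsets[0], new_offsets[1]);
  // prints: 'abcd' lengths=[2,2] offsets=[0,2]
  return 0;
}
```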
diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp
index 75639a0438f..83f71e657a7 100644
--- a/cpp/src/io/json/nested_json.hpp
+++ b/cpp/src/io/json/nested_json.hpp
@@ -299,22 +299,58 @@ get_array_children_indices(TreeDepthT row_array_children_level,
                            device_span<TreeDepthT const> node_levels,
                            device_span<NodeIndexT const> parent_node_ids,
                            rmm::cuda_stream_view stream);
+
 /**
- * @brief Reduce node tree into column tree by aggregating each property of column.
+ * @brief Reduces node tree representation to column tree representation.
  *
- * @param tree json node tree to reduce (modified in-place, but restored to original state)
- * @param col_ids column ids of each node (modified in-place, but restored to original state)
- * @param row_offsets row offsets of each node (modified in-place, but restored to original state)
- * @param stream The CUDA stream to which kernels are dispatched
- * @return A tuple containing the column tree, identifier for each column and the maximum row index
- * in each column
+ * @param tree Node tree representation of JSON string
+ * @param original_col_ids Column ids of nodes
+ * @param sorted_col_ids Sorted column ids of nodes
+ * @param ordered_node_ids Node ids of nodes sorted by column ids
+ * @param row_offsets Row offsets of nodes
+ * @param is_array_of_arrays Whether the tree is an array of arrays
+ * @param row_array_parent_col_id Column id of row array, if is_array_of_arrays is true
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @return A tuple of column tree representation of JSON string, column ids of columns, and
+ * max row offsets of columns
  */
 std::tuple<tree_meta_t, rmm::device_uvector<NodeIndexT>, rmm::device_uvector<size_type>>
 reduce_to_column_tree(tree_meta_t& tree,
-                      device_span<NodeIndexT const> col_ids,
+                      device_span<NodeIndexT const> original_col_ids,
+                      device_span<NodeIndexT const> sorted_col_ids,
+                      device_span<NodeIndexT const> ordered_node_ids,
                       device_span<size_type const> row_offsets,
+                      bool is_array_of_arrays,
+                      NodeIndexT const row_array_parent_col_id,
                       rmm::cuda_stream_view stream);
-
+/**
+ * @brief Constructs `d_json_column` from node tree representation
+ * Newly constructed columns are inserted into `root`'s children.
+ * `root` must be a list type.
+ *
+ * @param input Input JSON string device data
+ * @param tree Node tree representation of the JSON string
+ * @param col_ids Column ids of the nodes in the tree
+ * @param row_offsets Row offsets of the nodes in the tree
+ * @param root Root node of the `d_json_column` tree
+ * @param is_array_of_arrays Whether the tree is an array of arrays
+ * @param options Parsing options specifying the parsing behaviour;
+ * options affecting behaviour are
+ *   is_enabled_lines: Whether the input is a line-delimited JSON
+ *   is_enabled_mixed_types_as_string: Whether to enable reading mixed types as string
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the device memory
+ * of child_offsets and validity members of `d_json_column`
+ */
+void make_device_json_column(device_span<SymbolT const> input,
+                             tree_meta_t& tree,
+                             device_span<NodeIndexT> col_ids,
+                             device_span<size_type> row_offsets,
+                             device_json_column& root,
+                             bool is_array_of_arrays,
+                             cudf::io::json_reader_options const& options,
+                             rmm::cuda_stream_view stream,
+                             rmm::device_async_resource_ref mr);
 /**
  * @brief Retrieves the parse_options to be used for type inference and type casting
  *
diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu
index 4e513d3495c..1c15e147b13 100644
--- a/cpp/src/io/json/nested_json_gpu.cu
+++ b/cpp/src/io/json/nested_json_gpu.cu
@@ -2079,10 +2079,12 @@ cudf::io::parse_options parsing_options(cudf::io::json_reader_options const& opt
 {
   auto parse_opts = cudf::io::parse_options{',', '\n', '\"', '.'};
 
-  parse_opts.dayfirst   = options.is_enabled_dayfirst();
-  parse_opts.keepquotes = options.is_enabled_keep_quotes();
-  parse_opts.trie_true  = cudf::detail::create_serialized_trie({"true"}, stream);
-  parse_opts.trie_false = cudf::detail::create_serialized_trie({"false"}, stream);
+  parse_opts.dayfirst              = options.is_enabled_dayfirst();
+  parse_opts.keepquotes            = options.is_enabled_keep_quotes();
+  parse_opts.normalize_whitespace  = options.is_enabled_normalize_whitespace();
+  parse_opts.mixed_types_as_string = options.is_enabled_mixed_types_as_string();
+  parse_opts.trie_true             = cudf::detail::create_serialized_trie({"true"}, stream);
+  parse_opts.trie_false            = cudf::detail::create_serialized_trie({"false"}, stream);
   std::vector<std::string> na_values{"", "null"};
   na_values.insert(na_values.end(), options.get_na_values().begin(), options.get_na_values().end());
   parse_opts.trie_na = cudf::detail::create_serialized_trie(na_values, stream);
diff --git a/cpp/src/io/json/read_json.cu b/cpp/src/io/json/read_json.cu
index bd82b040359..99a5b17bce8 100644
--- a/cpp/src/io/json/read_json.cu
+++ b/cpp/src/io/json/read_json.cu
@@ -232,12 +232,6 @@ table_with_metadata read_batch(host_span<std::unique_ptr<datasource>> sources,
     normalize_single_quotes(bufview, stream, cudf::get_current_device_resource_ref());
   }
 
-  // If input JSON buffer has unquoted spaces and tabs and option to normalize whitespaces is
-  // enabled, invoke pre-processing FST
-  if (reader_opts.is_enabled_normalize_whitespace()) {
-    normalize_whitespace(bufview, stream, cudf::get_current_device_resource_ref());
-  }
-
   auto buffer =
     cudf::device_span<char const>(reinterpret_cast<char const*>(bufview.data()), bufview.size());
   stream.synchronize();
diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu
index 81fd4ab9f82..ec05f35d405 100644
--- a/cpp/src/io/parquet/writer_impl.cu
+++ b/cpp/src/io/parquet/writer_impl.cu
@@ -1819,8 +1819,14 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta,
   auto const table_size  = std::reduce(column_sizes.begin(), column_sizes.end());
   auto const avg_row_len = util::div_rounding_up_safe(table_size, input.num_rows());
   if (avg_row_len > 0) {
-    auto const rg_frag_size = util::div_rounding_up_safe(max_row_group_size, avg_row_len);
-    max_page_fragment_size  = std::min(rg_frag_size, max_page_fragment_size);
+    // Ensure `rg_frag_size` is not bigger than size_type::max for default max_row_group_size
+    // value (=uint64::max) to avoid a sign overflow when comparing
+    auto const rg_frag_size =
+      std::min<size_t>(std::numeric_limits<size_type>::max(),
+                       util::div_rounding_up_safe(max_row_group_size, avg_row_len));
+    // Safe comparison as rg_frag_size fits in size_type
+    max_page_fragment_size =
+      std::min(static_cast<size_type>(rg_frag_size), max_page_fragment_size);
   }
 
   // dividing page size by average row length will tend to overshoot the desired
diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh
index bc2722441d0..734067582f7 100644
--- a/cpp/src/io/utilities/parsing_utils.cuh
+++ b/cpp/src/io/utilities/parsing_utils.cuh
@@ -67,6 +67,8 @@ struct parse_options_view {
   bool doublequote;
   bool dayfirst;
   bool skipblanklines;
+  bool normalize_whitespace;
+  bool mixed_types_as_string;
   cudf::detail::trie_view trie_true;
   cudf::detail::trie_view trie_false;
   cudf::detail::trie_view trie_na;
@@ -85,6 +87,8 @@ struct parse_options {
   bool doublequote;
   bool dayfirst;
   bool skipblanklines;
+  bool normalize_whitespace;
+  bool mixed_types_as_string;
   cudf::detail::optional_trie trie_true;
   cudf::detail::optional_trie trie_false;
   cudf::detail::optional_trie trie_na;
@@ -111,6 +115,8 @@ struct parse_options {
             doublequote,
             dayfirst,
             skipblanklines,
+            normalize_whitespace,
+            mixed_types_as_string,
             cudf::detail::make_trie_view(trie_true),
             cudf::detail::make_trie_view(trie_false),
             cudf::detail::make_trie_view(trie_na),
diff --git a/cpp/src/join/conditional_join.cu b/cpp/src/join/conditional_join.cu
index
748691fb7d1..2ec23e0dc6d 100644 --- a/cpp/src/join/conditional_join.cu +++ b/cpp/src/join/conditional_join.cu @@ -27,7 +27,6 @@ #include #include #include -#include #include #include @@ -377,16 +376,12 @@ conditional_inner_join(table_view const& left, table_view const& right, ast::expression const& binary_predicate, std::optional output_size, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::conditional_join(left, - right, - binary_predicate, - detail::join_kind::INNER_JOIN, - output_size, - cudf::get_default_stream(), - mr); + return detail::conditional_join( + left, right, binary_predicate, detail::join_kind::INNER_JOIN, output_size, stream, mr); } std::pair>, @@ -395,16 +390,12 @@ conditional_left_join(table_view const& left, table_view const& right, ast::expression const& binary_predicate, std::optional output_size, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::conditional_join(left, - right, - binary_predicate, - detail::join_kind::LEFT_JOIN, - output_size, - cudf::get_default_stream(), - mr); + return detail::conditional_join( + left, right, binary_predicate, detail::join_kind::LEFT_JOIN, output_size, stream, mr); } std::pair>, @@ -412,16 +403,12 @@ std::pair>, conditional_full_join(table_view const& left, table_view const& right, ast::expression const& binary_predicate, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::conditional_join(left, - right, - binary_predicate, - detail::join_kind::FULL_JOIN, - {}, - cudf::get_default_stream(), - mr); + return detail::conditional_join( + left, right, binary_predicate, detail::join_kind::FULL_JOIN, {}, stream, mr); } std::unique_ptr> conditional_left_semi_join( @@ -429,16 +416,12 @@ std::unique_ptr> conditional_left_semi_join( table_view const& right, ast::expression const& binary_predicate, std::optional output_size, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::conditional_join_anti_semi(left, - right, - binary_predicate, - detail::join_kind::LEFT_SEMI_JOIN, - output_size, - cudf::get_default_stream(), - mr); + return detail::conditional_join_anti_semi( + left, right, binary_predicate, detail::join_kind::LEFT_SEMI_JOIN, output_size, stream, mr); } std::unique_ptr> conditional_left_anti_join( @@ -446,64 +429,56 @@ std::unique_ptr> conditional_left_anti_join( table_view const& right, ast::expression const& binary_predicate, std::optional output_size, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::conditional_join_anti_semi(left, - right, - binary_predicate, - detail::join_kind::LEFT_ANTI_JOIN, - output_size, - cudf::get_default_stream(), - mr); + return detail::conditional_join_anti_semi( + left, right, binary_predicate, detail::join_kind::LEFT_ANTI_JOIN, output_size, stream, mr); } std::size_t conditional_inner_join_size(table_view const& left, table_view const& right, ast::expression const& binary_predicate, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::compute_conditional_join_output_size( - left, right, binary_predicate, detail::join_kind::INNER_JOIN, cudf::get_default_stream(), mr); + left, right, binary_predicate, detail::join_kind::INNER_JOIN, stream, mr); } std::size_t conditional_left_join_size(table_view const& left, table_view const& right, ast::expression const& binary_predicate, + 
rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::compute_conditional_join_output_size( - left, right, binary_predicate, detail::join_kind::LEFT_JOIN, cudf::get_default_stream(), mr); + left, right, binary_predicate, detail::join_kind::LEFT_JOIN, stream, mr); } std::size_t conditional_left_semi_join_size(table_view const& left, table_view const& right, ast::expression const& binary_predicate, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::compute_conditional_join_output_size(left, - right, - binary_predicate, - detail::join_kind::LEFT_SEMI_JOIN, - cudf::get_default_stream(), - mr); + return detail::compute_conditional_join_output_size( + left, right, binary_predicate, detail::join_kind::LEFT_SEMI_JOIN, stream, mr); } std::size_t conditional_left_anti_join_size(table_view const& left, table_view const& right, ast::expression const& binary_predicate, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::compute_conditional_join_output_size(left, - right, - binary_predicate, - detail::join_kind::LEFT_ANTI_JOIN, - cudf::get_default_stream(), - mr); + return detail::compute_conditional_join_output_size( + left, right, binary_predicate, detail::join_kind::LEFT_ANTI_JOIN, stream, mr); } } // namespace cudf diff --git a/cpp/src/join/conditional_join.hpp b/cpp/src/join/conditional_join.hpp index 4f6a9484e8c..303442e79ef 100644 --- a/cpp/src/join/conditional_join.hpp +++ b/cpp/src/join/conditional_join.hpp @@ -19,7 +19,6 @@ #include #include -#include #include #include diff --git a/cpp/src/join/cross_join.cu b/cpp/src/join/cross_join.cu index eeb49736bac..15594fb60e3 100644 --- a/cpp/src/join/cross_join.cu +++ b/cpp/src/join/cross_join.cu @@ -25,7 +25,6 @@ #include #include #include -#include #include #include @@ -75,10 +74,11 @@ std::unique_ptr cross_join(cudf::table_view const& left, std::unique_ptr cross_join(cudf::table_view const& left, cudf::table_view const& right, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::cross_join(left, right, cudf::get_default_stream(), mr); + return detail::cross_join(left, right, stream, mr); } } // namespace cudf diff --git a/cpp/src/join/join.cu b/cpp/src/join/join.cu index 0abff27667b..7b13c260364 100644 --- a/cpp/src/join/join.cu +++ b/cpp/src/join/join.cu @@ -20,7 +20,6 @@ #include #include #include -#include #include #include @@ -120,10 +119,11 @@ std::pair>, inner_join(table_view const& left, table_view const& right, null_equality compare_nulls, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::inner_join(left, right, compare_nulls, cudf::get_default_stream(), mr); + return detail::inner_join(left, right, compare_nulls, stream, mr); } std::pair>, @@ -131,10 +131,11 @@ std::pair>, left_join(table_view const& left, table_view const& right, null_equality compare_nulls, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::left_join(left, right, compare_nulls, cudf::get_default_stream(), mr); + return detail::left_join(left, right, compare_nulls, stream, mr); } std::pair>, @@ -142,10 +143,11 @@ std::pair>, full_join(table_view const& left, table_view const& right, null_equality compare_nulls, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::full_join(left, right, compare_nulls, 
cudf::get_default_stream(), mr); + return detail::full_join(left, right, compare_nulls, stream, mr); } } // namespace cudf diff --git a/cpp/src/join/mixed_join.cu b/cpp/src/join/mixed_join.cu index 8ff78dd47f4..820b81ee309 100644 --- a/cpp/src/join/mixed_join.cu +++ b/cpp/src/join/mixed_join.cu @@ -28,7 +28,6 @@ #include #include #include -#include #include #include @@ -484,6 +483,7 @@ mixed_inner_join( ast::expression const& binary_predicate, null_equality compare_nulls, std::optional>> const output_size_data, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); @@ -495,7 +495,7 @@ mixed_inner_join( compare_nulls, detail::join_kind::INNER_JOIN, output_size_data, - cudf::get_default_stream(), + stream, mr); } @@ -506,6 +506,7 @@ std::pair>> mixed_in table_view const& right_conditional, ast::expression const& binary_predicate, null_equality compare_nulls, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); @@ -516,7 +517,7 @@ std::pair>> mixed_in binary_predicate, compare_nulls, detail::join_kind::INNER_JOIN, - cudf::get_default_stream(), + stream, mr); } @@ -530,6 +531,7 @@ mixed_left_join( ast::expression const& binary_predicate, null_equality compare_nulls, std::optional>> const output_size_data, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); @@ -541,7 +543,7 @@ mixed_left_join( compare_nulls, detail::join_kind::LEFT_JOIN, output_size_data, - cudf::get_default_stream(), + stream, mr); } @@ -552,6 +554,7 @@ std::pair>> mixed_le table_view const& right_conditional, ast::expression const& binary_predicate, null_equality compare_nulls, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); @@ -562,7 +565,7 @@ std::pair>> mixed_le binary_predicate, compare_nulls, detail::join_kind::LEFT_JOIN, - cudf::get_default_stream(), + stream, mr); } @@ -576,6 +579,7 @@ mixed_full_join( ast::expression const& binary_predicate, null_equality compare_nulls, std::optional>> const output_size_data, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); @@ -587,7 +591,7 @@ mixed_full_join( compare_nulls, detail::join_kind::FULL_JOIN, output_size_data, - cudf::get_default_stream(), + stream, mr); } diff --git a/cpp/src/join/mixed_join_semi.cu b/cpp/src/join/mixed_join_semi.cu index cfb785e242c..aa4fa281159 100644 --- a/cpp/src/join/mixed_join_semi.cu +++ b/cpp/src/join/mixed_join_semi.cu @@ -29,7 +29,6 @@ #include #include #include -#include #include #include @@ -267,6 +266,7 @@ std::unique_ptr> mixed_left_semi_join( table_view const& right_conditional, ast::expression const& binary_predicate, null_equality compare_nulls, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); @@ -277,7 +277,7 @@ std::unique_ptr> mixed_left_semi_join( binary_predicate, compare_nulls, detail::join_kind::LEFT_SEMI_JOIN, - cudf::get_default_stream(), + stream, mr); } @@ -288,6 +288,7 @@ std::unique_ptr> mixed_left_anti_join( table_view const& right_conditional, ast::expression const& binary_predicate, null_equality compare_nulls, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); @@ -298,7 +299,7 @@ std::unique_ptr> mixed_left_anti_join( binary_predicate, compare_nulls, detail::join_kind::LEFT_ANTI_JOIN, - cudf::get_default_stream(), + stream, mr); } diff --git a/cpp/src/join/semi_join.cu b/cpp/src/join/semi_join.cu index f69ded73e8d..d2ab2122c75 100644 --- a/cpp/src/join/semi_join.cu 
+++ b/cpp/src/join/semi_join.cu @@ -23,7 +23,6 @@ #include #include #include -#include #include #include @@ -98,22 +97,24 @@ std::unique_ptr> left_semi_join( cudf::table_view const& left, cudf::table_view const& right, null_equality compare_nulls, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::left_semi_anti_join( - detail::join_kind::LEFT_SEMI_JOIN, left, right, compare_nulls, cudf::get_default_stream(), mr); + detail::join_kind::LEFT_SEMI_JOIN, left, right, compare_nulls, stream, mr); } std::unique_ptr> left_anti_join( cudf::table_view const& left, cudf::table_view const& right, null_equality compare_nulls, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::left_semi_anti_join( - detail::join_kind::LEFT_ANTI_JOIN, left, right, compare_nulls, cudf::get_default_stream(), mr); + detail::join_kind::LEFT_ANTI_JOIN, left, right, compare_nulls, stream, mr); } } // namespace cudf diff --git a/cpp/src/strings/regex/regcomp.cpp b/cpp/src/strings/regex/regcomp.cpp index adf650a4f27..7c4c89bd3fb 100644 --- a/cpp/src/strings/regex/regcomp.cpp +++ b/cpp/src/strings/regex/regcomp.cpp @@ -539,15 +539,26 @@ class regex_parser { : static_cast(LBRA); case ')': return RBRA; case '^': { - _chr = is_multiline(_flags) ? chr : '\n'; + if (is_ext_newline(_flags)) { + _chr = is_multiline(_flags) ? 'S' : 'N'; + } else { + _chr = is_multiline(_flags) ? chr : '\n'; + } return BOL; } case '$': { - _chr = is_multiline(_flags) ? chr : '\n'; + if (is_ext_newline(_flags)) { + _chr = is_multiline(_flags) ? 'S' : 'N'; + } else { + _chr = is_multiline(_flags) ? chr : '\n'; + } return EOL; } case '[': return build_cclass(); - case '.': return dot_type; + case '.': { + _chr = is_ext_newline(_flags) ? 'N' : chr; + return dot_type; + } } if (std::find(quantifiers.begin(), quantifiers.end(), static_cast(chr)) == @@ -959,7 +970,7 @@ class regex_compiler { _prog.inst_at(inst_id).u1.cls_id = class_id; } else if (token == CHAR) { _prog.inst_at(inst_id).u1.c = yy; - } else if (token == BOL || token == EOL) { + } else if (token == BOL || token == EOL || token == ANY) { _prog.inst_at(inst_id).u1.c = yy; } push_and(inst_id, inst_id); @@ -1194,7 +1205,7 @@ void reprog::print(regex_flags const flags) case STAR: printf(" STAR next=%d", inst.u2.next_id); break; case PLUS: printf(" PLUS next=%d", inst.u2.next_id); break; case QUEST: printf(" QUEST next=%d", inst.u2.next_id); break; - case ANY: printf(" ANY next=%d", inst.u2.next_id); break; + case ANY: printf(" ANY '%c', next=%d", inst.u1.c, inst.u2.next_id); break; case ANYNL: printf(" ANYNL next=%d", inst.u2.next_id); break; case NOP: printf(" NOP next=%d", inst.u2.next_id); break; case BOL: { diff --git a/cpp/src/strings/regex/regex.inl b/cpp/src/strings/regex/regex.inl index 3b899e4edc1..e34a1e12015 100644 --- a/cpp/src/strings/regex/regex.inl +++ b/cpp/src/strings/regex/regex.inl @@ -126,6 +126,16 @@ __device__ __forceinline__ void reprog_device::reljunk::swaplist() list2 = tmp; } +/** + * @brief Check for supported new-line characters + * + * '\n, \r, \u0085, \u2028, or \u2029' + */ +constexpr bool is_newline(char32_t const ch) +{ + return (ch == '\n' || ch == '\r' || ch == 0x00c285 || ch == 0x00e280a8 || ch == 0x00e280a9); +} + /** * @brief Utility to check a specific character against this class instance. 
* @@ -258,11 +268,14 @@ __device__ __forceinline__ match_result reprog_device::regexec(string_view const if (checkstart) { auto startchar = static_cast(jnk.startchar); switch (jnk.starttype) { - case BOL: - if (pos == 0) break; - if (jnk.startchar != '^') { return cuda::std::nullopt; } + case BOL: { + if (pos == 0) { break; } + if (startchar != '^' && startchar != 'S') { return cuda::std::nullopt; } + if (startchar != '\n') { break; } --itr; startchar = static_cast('\n'); + [[fallthrough]]; + } case CHAR: { auto const find_itr = find_char(startchar, dstr, itr); if (find_itr.byte_offset() >= dstr.size_bytes()) { return cuda::std::nullopt; } @@ -312,26 +325,34 @@ __device__ __forceinline__ match_result reprog_device::regexec(string_view const id_activate = inst.u2.next_id; expanded = true; break; - case BOL: - if ((pos == 0) || ((inst.u1.c == '^') && (dstr[pos - 1] == '\n'))) { + case BOL: { + auto titr = itr; + auto const prev_c = pos > 0 ? *(--titr) : 0; + if ((pos == 0) || ((inst.u1.c == '^') && (prev_c == '\n')) || + ((inst.u1.c == 'S') && (is_newline(prev_c)))) { id_activate = inst.u2.next_id; expanded = true; } break; - case EOL: + } + case EOL: { // after the last character OR: // - for MULTILINE, if current character is new-line // - for non-MULTILINE, the very last character of the string can also be a new-line + bool const nl = (inst.u1.c == 'S' || inst.u1.c == 'N') ? is_newline(c) : (c == '\n'); if (last_character || - ((c == '\n') && (inst.u1.c != 'Z') && - ((inst.u1.c == '$') || (itr.byte_offset() + 1 == dstr.size_bytes())))) { + (nl && (inst.u1.c != 'Z') && + ((inst.u1.c == '$' || inst.u1.c == 'S') || + (itr.byte_offset() + bytes_in_char_utf8(c) == dstr.size_bytes())))) { id_activate = inst.u2.next_id; expanded = true; } break; + } case BOW: case NBOW: { - auto const prev_c = pos > 0 ? dstr[pos - 1] : 0; + auto titr = itr; + auto const prev_c = pos > 0 ? *(--titr) : 0; auto const word_class = reclass_device{CCLASS_W}; bool const curr_is_word = word_class.is_match(c, _codepoint_flags); bool const prev_is_word = word_class.is_match(prev_c, _codepoint_flags); @@ -366,9 +387,10 @@ __device__ __forceinline__ match_result reprog_device::regexec(string_view const case CHAR: if (inst.u1.c == c) id_activate = inst.u2.next_id; break; - case ANY: - if (c != '\n') id_activate = inst.u2.next_id; - break; + case ANY: { + if ((c == '\n') || ((inst.u1.c == 'N') && is_newline(c))) { break; } + [[fallthrough]]; + } case ANYNL: id_activate = inst.u2.next_id; break; case NCCLASS: case CCLASS: { diff --git a/cpp/src/text/minhash.cu b/cpp/src/text/minhash.cu index 605582f28a6..a03a34f5fa7 100644 --- a/cpp/src/text/minhash.cu +++ b/cpp/src/text/minhash.cu @@ -25,6 +25,8 @@ #include #include #include +#include +#include #include #include #include @@ -151,15 +153,111 @@ std::unique_ptr minhash_fn(cudf::strings_column_view const& input, mr); auto d_hashes = hashes->mutable_view().data(); - constexpr int block_size = 256; - cudf::detail::grid_1d grid{input.size() * cudf::detail::warp_size, block_size}; + constexpr cudf::thread_index_type block_size = 256; + cudf::detail::grid_1d grid{ + static_cast(input.size()) * cudf::detail::warp_size, block_size}; minhash_kernel<<>>( *d_strings, seeds, width, d_hashes); return hashes; } -std::unique_ptr build_list_result(cudf::strings_column_view const& input, +/** + * @brief Compute the minhash of each list row of strings for each seed + * + * This is a warp-per-row algorithm where parallel threads within a warp + * work on strings in a single list row. 
+ * + * @tparam HashFunction hash function to use on each string + * + * @param d_input List of strings to process + * @param seeds Seeds for hashing each string + * @param d_hashes Minhash output values (one per row) + */ +template < + typename HashFunction, + typename hash_value_type = std:: + conditional_t, uint32_t, uint64_t>> +CUDF_KERNEL void minhash_word_kernel(cudf::detail::lists_column_device_view const d_input, + cudf::device_span seeds, + hash_value_type* d_hashes) +{ + auto const idx = cudf::detail::grid_1d::global_thread_id(); + auto const row_idx = idx / cudf::detail::warp_size; + + if (row_idx >= d_input.size()) { return; } + if (d_input.is_null(row_idx)) { return; } + + auto const d_row = cudf::list_device_view(d_input, row_idx); + auto const d_output = d_hashes + (row_idx * seeds.size()); + + // initialize hashes output for this row + auto const lane_idx = static_cast(idx % cudf::detail::warp_size); + if (lane_idx == 0) { + auto const init = d_row.size() == 0 ? 0 : std::numeric_limits::max(); + thrust::fill(thrust::seq, d_output, d_output + seeds.size(), init); + } + __syncwarp(); + + // each lane hashes a string from the input row + for (auto str_idx = lane_idx; str_idx < d_row.size(); str_idx += cudf::detail::warp_size) { + auto const hash_str = + d_row.is_null(str_idx) ? cudf::string_view{} : d_row.element(str_idx); + for (std::size_t seed_idx = 0; seed_idx < seeds.size(); ++seed_idx) { + auto const hasher = HashFunction(seeds[seed_idx]); + // hash string and store the min value + hash_value_type hv; + if constexpr (std::is_same_v) { + hv = hasher(hash_str); + } else { + // This code path assumes the use of MurmurHash3_x64_128 which produces 2 uint64 values + // but only uses the first uint64 value as requested by the LLM team. + hv = thrust::get<0>(hasher(hash_str)); + } + cuda::atomic_ref ref{*(d_output + seed_idx)}; + ref.fetch_min(hv, cuda::std::memory_order_relaxed); + } + } +} + +template < + typename HashFunction, + typename hash_value_type = std:: + conditional_t, uint32_t, uint64_t>> +std::unique_ptr word_minhash_fn(cudf::lists_column_view const& input, + cudf::device_span seeds, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_EXPECTS(!seeds.empty(), "Parameter seeds cannot be empty", std::invalid_argument); + CUDF_EXPECTS((static_cast(input.size()) * seeds.size()) < + static_cast(std::numeric_limits::max()), + "The number of seeds times the number of input rows exceeds the column size limit", + std::overflow_error); + + auto const output_type = cudf::data_type{cudf::type_to_id()}; + if (input.is_empty()) { return cudf::make_empty_column(output_type); } + + auto const d_input = cudf::column_device_view::create(input.parent(), stream); + + auto hashes = cudf::make_numeric_column(output_type, + input.size() * static_cast(seeds.size()), + cudf::mask_state::UNALLOCATED, + stream, + mr); + auto d_hashes = hashes->mutable_view().data(); + auto lcdv = cudf::detail::lists_column_device_view(*d_input); + + constexpr cudf::thread_index_type block_size = 256; + cudf::detail::grid_1d grid{ + static_cast(input.size()) * cudf::detail::warp_size, block_size}; + minhash_word_kernel + <<>>(lcdv, seeds, d_hashes); + + return hashes; +} + +std::unique_ptr build_list_result(cudf::column_view const& input, std::unique_ptr&& hashes, cudf::size_type seeds_size, rmm::cuda_stream_view stream, @@ -176,7 +274,7 @@ std::unique_ptr build_list_result(cudf::strings_column_view const& std::move(offsets), std::move(hashes), input.null_count(), - 
cudf::detail::copy_bitmask(input.parent(), stream, mr), + cudf::detail::copy_bitmask(input, stream, mr), stream, mr); // expect this condition to be very rare @@ -208,7 +306,7 @@ std::unique_ptr minhash(cudf::strings_column_view const& input, { using HashFunction = cudf::hashing::detail::MurmurHash3_x86_32; auto hashes = detail::minhash_fn(input, seeds, width, stream, mr); - return build_list_result(input, std::move(hashes), seeds.size(), stream, mr); + return build_list_result(input.parent(), std::move(hashes), seeds.size(), stream, mr); } std::unique_ptr minhash64(cudf::strings_column_view const& input, @@ -232,7 +330,27 @@ std::unique_ptr minhash64(cudf::strings_column_view const& input, { using HashFunction = cudf::hashing::detail::MurmurHash3_x64_128; auto hashes = detail::minhash_fn(input, seeds, width, stream, mr); - return build_list_result(input, std::move(hashes), seeds.size(), stream, mr); + return build_list_result(input.parent(), std::move(hashes), seeds.size(), stream, mr); +} + +std::unique_ptr word_minhash(cudf::lists_column_view const& input, + cudf::device_span seeds, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + using HashFunction = cudf::hashing::detail::MurmurHash3_x86_32; + auto hashes = detail::word_minhash_fn(input, seeds, stream, mr); + return build_list_result(input.parent(), std::move(hashes), seeds.size(), stream, mr); +} + +std::unique_ptr word_minhash64(cudf::lists_column_view const& input, + cudf::device_span seeds, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + using HashFunction = cudf::hashing::detail::MurmurHash3_x64_128; + auto hashes = detail::word_minhash_fn(input, seeds, stream, mr); + return build_list_result(input.parent(), std::move(hashes), seeds.size(), stream, mr); } } // namespace detail @@ -276,4 +394,21 @@ std::unique_ptr minhash64(cudf::strings_column_view const& input, return detail::minhash64(input, seeds, width, stream, mr); } +std::unique_ptr word_minhash(cudf::lists_column_view const& input, + cudf::device_span seeds, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + return detail::word_minhash(input, seeds, stream, mr); +} + +std::unique_ptr word_minhash64(cudf::lists_column_view const& input, + cudf::device_span seeds, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + return detail::word_minhash64(input, seeds, stream, mr); +} } // namespace nvtext diff --git a/cpp/src/utilities/stream_pool.cpp b/cpp/src/utilities/stream_pool.cpp index 9d3a7ce5a4e..9824c472b20 100644 --- a/cpp/src/utilities/stream_pool.cpp +++ b/cpp/src/utilities/stream_pool.cpp @@ -132,6 +132,13 @@ struct cuda_event { cuda_event() { CUDF_CUDA_TRY(cudaEventCreateWithFlags(&e_, cudaEventDisableTiming)); } virtual ~cuda_event() { CUDF_ASSERT_CUDA_SUCCESS(cudaEventDestroy(e_)); } + // Moveable but not copyable. + cuda_event(const cuda_event&) = delete; + cuda_event& operator=(const cuda_event&) = delete; + + cuda_event(cuda_event&&) = default; + cuda_event& operator=(cuda_event&&) = default; + operator cudaEvent_t() { return e_; } private: @@ -147,11 +154,12 @@ struct cuda_event { */ cudaEvent_t event_for_thread() { - thread_local std::vector> thread_events(get_num_cuda_devices()); + // The program may crash if this function is called from the main thread and user application + // subsequently calls cudaDeviceReset(). + // As a workaround, here we intentionally disable RAII and leak cudaEvent_t. 
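+  // (Presumably the crash comes from the thread_local destructor calling cudaEventDestroy
+  // during thread/static teardown after cudaDeviceReset() has already destroyed the context;
+  // leaking one event per thread and device sidesteps that destruction-order hazard.)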
+  thread_local std::vector<cuda_event*> thread_events(get_num_cuda_devices());
   auto const device_id = get_current_cuda_device();
-  if (not thread_events[device_id.value()]) {
-    thread_events[device_id.value()] = std::make_unique<cuda_event>();
-  }
+  if (not thread_events[device_id.value()]) { thread_events[device_id.value()] = new cuda_event(); }
   return *thread_events[device_id.value()];
 }
 
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index 1bedb344a01..288fa84a73d 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -687,10 +687,12 @@ ConfigureTest(STREAM_BINARYOP_TEST streams/binaryop_test.cpp STREAM_MODE testing
 ConfigureTest(STREAM_CONCATENATE_TEST streams/concatenate_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_COPYING_TEST streams/copying_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_CSVIO_TEST streams/io/csv_test.cpp STREAM_MODE testing)
+ConfigureTest(STREAM_DATETIME_TEST streams/datetime_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_DICTIONARY_TEST streams/dictionary_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_FILLING_TEST streams/filling_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_GROUPBY_TEST streams/groupby_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_HASHING_TEST streams/hash_test.cpp STREAM_MODE testing)
+ConfigureTest(STREAM_JOIN_TEST streams/join_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_JSONIO_TEST streams/io/json_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_LABELING_BINS_TEST streams/labeling_bins_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_LISTS_TEST streams/lists_test.cpp STREAM_MODE testing)
diff --git a/cpp/tests/io/json/json_test.cpp b/cpp/tests/io/json/json_test.cpp
index 960c19fce2e..48bc982d0e3 100644
--- a/cpp/tests/io/json/json_test.cpp
+++ b/cpp/tests/io/json/json_test.cpp
@@ -2856,4 +2856,47 @@ TEST_F(JsonReaderTest, JSONMixedTypeChildren)
   }
 }
 
+TEST_F(JsonReaderTest, JsonDtypeSchema)
+{
+  std::string data = R"(
+    {"a": 1, "b": {"0": "abc", "1": ["a", "b"]}, "c": true}
+    {"a": 1, "b": {"0": "abc" }, "c": false}
+    {"a": 1, "b": {"0": "lolol "}, "c": true}
+  )";
+
+  std::map<std::string, cudf::io::schema_element> dtype_schema{
+    {"c", {data_type{type_id::STRING}}},
+    {"b", {data_type{type_id::STRING}}},
+    {"a", {dtype<double>()}}};
+  cudf::io::json_reader_options in_options =
+    cudf::io::json_reader_options::builder(cudf::io::source_info{data.data(), data.size()})
+      .dtypes(dtype_schema)
+      .prune_columns(true)
+      .lines(true);
+
+  cudf::io::table_with_metadata result = cudf::io::read_json(in_options);
+
+  EXPECT_EQ(result.tbl->num_columns(), 3);
+  EXPECT_EQ(result.tbl->num_rows(), 3);
+
+  EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::FLOAT64);
+  EXPECT_EQ(result.tbl->get_column(1).type().id(), cudf::type_id::STRING);
+  EXPECT_EQ(result.tbl->get_column(2).type().id(), cudf::type_id::STRING);
+
+  EXPECT_EQ(result.metadata.schema_info[0].name, "a");
+  EXPECT_EQ(result.metadata.schema_info[1].name, "b");
+  EXPECT_EQ(result.metadata.schema_info[2].name, "c");
+
+  // cudf::column::contents contents = result.tbl->get_column(1).release();
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0), float64_wrapper{{1, 1, 1}});
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(
+    result.tbl->get_column(1),
+    cudf::test::strings_column_wrapper({"{\"0\": \"abc\", \"1\": [\"a\", \"b\"]}",
+                                        "{\"0\": \"abc\" }",
+                                        "{\"0\": \"lolol \"}"}),
+    cudf::test::debug_output_level::ALL_ERRORS);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(2),
+                                 cudf::test::strings_column_wrapper({"true", "false", "true"}),
+                                 cudf::test::debug_output_level::ALL_ERRORS);
+}
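For readers skimming the test above: the user-visible behavior is that a nested field given a STRING dtype under `prune_columns(true)` comes back as the raw JSON text of that subtree rather than a nested struct column. A minimal usage sketch follows, mirroring the test; the function name and input are hypothetical, but the builder calls are the public cudf reader options exercised above.

```cpp
#include <cudf/io/json.hpp>

#include <map>
#include <string>

// Read line-delimited JSON, returning column "b" as the unparsed JSON text of its subtree.
cudf::io::table_with_metadata read_subtree_as_string(std::string const& json_lines)
{
  std::map<std::string, cudf::io::schema_element> schema{
    {"b", {cudf::data_type{cudf::type_id::STRING}}}};
  auto opts = cudf::io::json_reader_options::builder(
                cudf::io::source_info{json_lines.data(), json_lines.size()})
                .dtypes(schema)
                .prune_columns(true)  // drop all columns not named in the schema
                .lines(true)
                .build();
  return cudf::io::read_json(opts);
}
```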
+ CUDF_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/io/json/json_whitespace_normalization_test.cu b/cpp/tests/io/json/json_whitespace_normalization_test.cu index 6d79fdc98ef..6a3bd69de81 100644 --- a/cpp/tests/io/json/json_whitespace_normalization_test.cu +++ b/cpp/tests/io/json/json_whitespace_normalization_test.cu @@ -34,129 +34,127 @@ // Base test fixture for tests struct JsonWSNormalizationTest : public cudf::test::BaseFixture {}; -void run_test(std::string const& host_input, std::string const& expected_host_output) -{ - // Prepare cuda stream for data transfers & kernels - auto stream_view = cudf::test::get_default_stream(); - - auto device_input = rmm::device_buffer( - host_input.c_str(), host_input.size(), stream_view, cudf::get_current_device_resource_ref()); - - // Preprocessing FST - cudf::io::datasource::owning_buffer device_data(std::move(device_input)); - cudf::io::json::detail::normalize_whitespace( - device_data, stream_view, cudf::get_current_device_resource_ref()); - - std::string preprocessed_host_output(device_data.size(), 0); - CUDF_CUDA_TRY(cudaMemcpyAsync(preprocessed_host_output.data(), - device_data.data(), - preprocessed_host_output.size(), - cudaMemcpyDeviceToHost, - stream_view.value())); - - stream_view.synchronize(); - ASSERT_EQ(preprocessed_host_output.size(), expected_host_output.size()); - CUDF_TEST_EXPECT_VECTOR_EQUAL( - preprocessed_host_output, expected_host_output, preprocessed_host_output.size()); -} - -TEST_F(JsonWSNormalizationTest, GroundTruth_Spaces) +TEST_F(JsonWSNormalizationTest, ReadJsonOption) { - std::string input = R"({ "A" : "TEST" })"; - std::string output = R"({"A":"TEST"})"; - run_test(input, output); -} + // When mixed type fields are read as strings, the table read will differ depending the + // value of normalize_whitespace -TEST_F(JsonWSNormalizationTest, GroundTruth_MoreSpaces) -{ - std::string input = R"({"a": [1, 2, 3, 4, 5, 6, 7, 8], "b": {"c": "d"}})"; - std::string output = R"({"a":[1,2,3,4,5,6,7,8],"b":{"c":"d"}})"; - run_test(input, output); -} + // Test input + std::string const host_input = "{ \"a\" : {\"b\" :\t\"c\"}}"; + cudf::io::json_reader_options input_options = + cudf::io::json_reader_options::builder( + cudf::io::source_info{host_input.data(), host_input.size()}) + .lines(true) + .mixed_types_as_string(true) + .normalize_whitespace(true); -TEST_F(JsonWSNormalizationTest, GroundTruth_SpacesInString) -{ - std::string input = R"({" a ":50})"; - std::string output = R"({" a ":50})"; - run_test(input, output); -} + cudf::io::table_with_metadata processed_table = cudf::io::read_json(input_options); -TEST_F(JsonWSNormalizationTest, GroundTruth_NewlineInString) -{ - std::string input = "{\"a\" : \"x\ny\"}\n{\"a\" : \"x\\ny\"}"; - std::string output = "{\"a\":\"x\ny\"}\n{\"a\":\"x\\ny\"}"; - run_test(input, output); -} + // Expected table + std::string const expected_input = R"({ "a" : {"b":"c"}})"; + cudf::io::json_reader_options expected_input_options = + cudf::io::json_reader_options::builder( + cudf::io::source_info{expected_input.data(), expected_input.size()}) + .lines(true) + .mixed_types_as_string(true) + .normalize_whitespace(false); -TEST_F(JsonWSNormalizationTest, GroundTruth_Tabs) -{ - std::string input = "{\"a\":\t\"b\"}"; - std::string output = R"({"a":"b"})"; - run_test(input, output); + cudf::io::table_with_metadata expected_table = cudf::io::read_json(expected_input_options); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_table.tbl->view(), processed_table.tbl->view()); } -TEST_F(JsonWSNormalizationTest, 
GroundTruth_SpacesAndTabs) +TEST_F(JsonWSNormalizationTest, ReadJsonOption_InvalidRows) { - std::string input = "{\"A\" : \t\"TEST\" }"; - std::string output = R"({"A":"TEST"})"; - run_test(input, output); -} + // When mixed type fields are read as strings, the table read will differ depending the + // value of normalize_whitespace -TEST_F(JsonWSNormalizationTest, GroundTruth_MultilineJSONWithSpacesAndTabs) -{ - std::string input = - "{ \"foo rapids\": [1,2,3], \"bar\trapids\": 123 }\n\t{ \"foo rapids\": { \"a\": 1 }, " - "\"bar\trapids\": 456 }"; - std::string output = - "{\"foo rapids\":[1,2,3],\"bar\trapids\":123}\n{\"foo rapids\":{\"a\":1},\"bar\trapids\":456}"; - run_test(input, output); -} + // Test input + std::string const host_input = R"( + { "Root": { "Key": [ { "EE": tr ue } ] } } + { "Root": { "Key": "abc" } } + { "Root": { "Key": [ { "EE": 12 34 } ] } } + { "Root": { "Key": [{ "YY": 1}] } } + { "Root": { "Key": [ { "EE": 12. 34 } ] } } + { "Root": { "Key": [ { "EE": "efg" } ] } } + )"; + cudf::io::json_reader_options input_options = + cudf::io::json_reader_options::builder( + cudf::io::source_info{host_input.data(), host_input.size()}) + .lines(true) + .mixed_types_as_string(true) + .normalize_whitespace(true) + .recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL); -TEST_F(JsonWSNormalizationTest, GroundTruth_PureJSONExample) -{ - std::string input = R"([{"a":50}, {"a" : 60}])"; - std::string output = R"([{"a":50},{"a":60}])"; - run_test(input, output); -} + cudf::io::table_with_metadata processed_table = cudf::io::read_json(input_options); -TEST_F(JsonWSNormalizationTest, GroundTruth_NoNormalizationRequired) -{ - std::string input = R"({"a\\n\r\a":50})"; - std::string output = R"({"a\\n\r\a":50})"; - run_test(input, output); -} + // Expected table + std::string const expected_input = R"( + { "Root": { "Key": [ { "EE": tr ue } ] } } + { "Root": { "Key": "abc" } } + { "Root": { "Key": [ { "EE": 12 34 } ] } } + { "Root": { "Key": [{"YY":1}] } } + { "Root": { "Key": [ { "EE": 12. 34 } ] } } + { "Root": { "Key": [{"EE":"efg"}] } } + )"; + cudf::io::json_reader_options expected_input_options = + cudf::io::json_reader_options::builder( + cudf::io::source_info{expected_input.data(), expected_input.size()}) + .lines(true) + .mixed_types_as_string(true) + .normalize_whitespace(false) + .recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL); -TEST_F(JsonWSNormalizationTest, GroundTruth_InvalidInput) -{ - std::string input = "{\"a\" : \"b }\n{ \"c \" :\t\"d\"}"; - std::string output = "{\"a\":\"b }\n{\"c \":\"d\"}"; - run_test(input, output); + cudf::io::table_with_metadata expected_table = cudf::io::read_json(expected_input_options); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_table.tbl->view(), processed_table.tbl->view()); } -TEST_F(JsonWSNormalizationTest, ReadJsonOption) +TEST_F(JsonWSNormalizationTest, ReadJsonOption_InvalidRows_NoMixedType) { // When mixed type fields are read as strings, the table read will differ depending the // value of normalize_whitespace // Test input - std::string const host_input = "{ \"a\" : {\"b\" :\t\"c\"}}"; + std::string const host_input = R"( + { "Root": { "Key": [ { "EE": tr ue } ] } } + { "Root": { "Key": [ { "EE": 12 34 } ] } } + { "Root": { "Key": [{ "YY": 1}] } } + { "Root": { "Key": [ { "EE": 12. 
34 } ] } } + { "Root": { "Key": [ { "EE": "efg" }, { "YY" : "abc" } ] } } + { "Root": { "Key": [ { "YY" : "abc" } ] } } + )"; + + std::map dtype_schema{ + {"Key", {cudf::data_type{cudf::type_id::STRING}}}}; + cudf::io::json_reader_options input_options = cudf::io::json_reader_options::builder( cudf::io::source_info{host_input.data(), host_input.size()}) + .dtypes(dtype_schema) .lines(true) - .mixed_types_as_string(true) - .normalize_whitespace(true); + .prune_columns(true) + .normalize_whitespace(true) + .recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL); cudf::io::table_with_metadata processed_table = cudf::io::read_json(input_options); // Expected table - std::string const expected_input = R"({ "a" : {"b":"c"}})"; + std::string const expected_input = R"( + { "Root": { "Key": [ { "EE": tr ue } , { "YY" : 2 } ] } } + { "Root": { "Key": [ { "EE": 12 34 } ] } } + { "Root": { "Key": [{"YY":1}] } } + { "Root": { "Key": [ { "EE": 12. 34 } ] } } + { "Root": { "Key": [{"EE":"efg"},{"YY":"abc"}] } } + { "Root": { "Key": [{"YY":"abc"}] } } + )"; + cudf::io::json_reader_options expected_input_options = cudf::io::json_reader_options::builder( cudf::io::source_info{expected_input.data(), expected_input.size()}) + .dtypes(dtype_schema) .lines(true) - .mixed_types_as_string(true) - .normalize_whitespace(false); + .prune_columns(true) + .normalize_whitespace(false) + .recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL); cudf::io::table_with_metadata expected_table = cudf::io::read_json(expected_input_options); CUDF_TEST_EXPECT_TABLES_EQUAL(expected_table.tbl->view(), processed_table.tbl->view()); diff --git a/cpp/tests/join/join_tests.cpp b/cpp/tests/join/join_tests.cpp index ab387a5c7f5..3431e941359 100644 --- a/cpp/tests/join/join_tests.cpp +++ b/cpp/tests/join/join_tests.cpp @@ -39,6 +39,8 @@ #include #include +#include + #include template @@ -60,6 +62,7 @@ template >, cudf::table_view const& left_keys, cudf::table_view const& right_keys, cudf::null_equality compare_nulls, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr), cudf::out_of_bounds_policy oob_policy = cudf::out_of_bounds_policy::DONT_CHECK> std::unique_ptr join_and_gather( @@ -68,12 +71,13 @@ std::unique_ptr join_and_gather( std::vector const& left_on, std::vector const& right_on, cudf::null_equality compare_nulls, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()) { auto left_selected = left_input.select(left_on); auto right_selected = right_input.select(right_on); auto const [left_join_indices, right_join_indices] = - join_impl(left_selected, right_selected, compare_nulls, mr); + join_impl(left_selected, right_selected, compare_nulls, stream, mr); auto left_indices_span = cudf::device_span{*left_join_indices}; auto right_indices_span = cudf::device_span{*right_join_indices}; @@ -2027,7 +2031,11 @@ struct JoinTestLists : public cudf::test::BaseFixture { auto const probe_tv = cudf::table_view{{probe}}; auto const [left_result_map, right_result_map] = - join_func(build_tv, probe_tv, nulls_equal, cudf::get_current_device_resource_ref()); + join_func(build_tv, + probe_tv, + nulls_equal, + cudf::get_default_stream(), + cudf::get_current_device_resource_ref()); auto const left_result_table = sort_and_gather(build_tv, column_view_from_device_uvector(*left_result_map), oob_policy); diff --git a/cpp/tests/join/semi_anti_join_tests.cpp b/cpp/tests/join/semi_anti_join_tests.cpp index 3e279260b99..554d5754e39 100644 --- 
a/cpp/tests/join/semi_anti_join_tests.cpp +++ b/cpp/tests/join/semi_anti_join_tests.cpp @@ -28,8 +28,11 @@ #include #include #include +#include #include +#include + #include template @@ -51,6 +54,7 @@ template > (*join_impl)( cudf::table_view const& left_keys, cudf::table_view const& right_keys, cudf::null_equality compare_nulls, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr)> std::unique_ptr join_and_gather( cudf::table_view const& left_input, @@ -58,11 +62,12 @@ std::unique_ptr join_and_gather( std::vector const& left_on, std::vector const& right_on, cudf::null_equality compare_nulls, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()) { auto left_selected = left_input.select(left_on); auto right_selected = right_input.select(right_on); - auto const join_indices = join_impl(left_selected, right_selected, compare_nulls, mr); + auto const join_indices = join_impl(left_selected, right_selected, compare_nulls, stream, mr); auto left_indices_span = cudf::device_span{*join_indices}; auto left_indices_col = cudf::column_view{left_indices_span}; diff --git a/cpp/tests/streams/datetime_test.cpp b/cpp/tests/streams/datetime_test.cpp new file mode 100644 index 00000000000..82629156fa6 --- /dev/null +++ b/cpp/tests/streams/datetime_test.cpp @@ -0,0 +1,139 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include + +#include +#include + +#include + +class DatetimeTest : public cudf::test::BaseFixture { + public: + cudf::test::fixed_width_column_wrapper timestamps{ + -23324234, // 1969-12-31 23:59:59.976675766 GMT + 23432424, // 1970-01-01 00:00:00.023432424 GMT + 987234623 // 1970-01-01 00:00:00.987234623 GMT + }; + cudf::test::fixed_width_column_wrapper months{{1, -1, 3}}; +}; + +TEST_F(DatetimeTest, ExtractYear) +{ + cudf::datetime::extract_year(timestamps, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, ExtractMonth) +{ + cudf::datetime::extract_month(timestamps, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, ExtractDay) +{ + cudf::datetime::extract_day(timestamps, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, ExtractWeekday) +{ + cudf::datetime::extract_weekday(timestamps, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, ExtractHour) +{ + cudf::datetime::extract_hour(timestamps, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, ExtractMinute) +{ + cudf::datetime::extract_minute(timestamps, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, ExtractSecond) +{ + cudf::datetime::extract_second(timestamps, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, ExtractMillisecondFraction) +{ + cudf::datetime::extract_millisecond_fraction(timestamps, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, ExtractMicrosecondFraction) +{ + cudf::datetime::extract_microsecond_fraction(timestamps, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, ExtractNanosecondFraction) +{ + cudf::datetime::extract_nanosecond_fraction(timestamps, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, LastDayOfMonth) +{ + cudf::datetime::last_day_of_month(timestamps, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, DayOfYear) +{ + cudf::datetime::day_of_year(timestamps, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, AddCalendricalMonths) +{ + cudf::datetime::add_calendrical_months(timestamps, months, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, AddCalendricalMonthsScalar) +{ + auto scalar = cudf::make_fixed_width_scalar(1, cudf::test::get_default_stream()); + + cudf::datetime::add_calendrical_months(timestamps, *scalar, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, IsLeapYear) +{ + cudf::datetime::is_leap_year(timestamps, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, DaysInMonth) +{ + cudf::datetime::days_in_month(timestamps, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, ExtractQuarter) +{ + cudf::datetime::extract_quarter(timestamps, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, CeilDatetimes) +{ + cudf::datetime::ceil_datetimes( + timestamps, cudf::datetime::rounding_frequency::HOUR, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, FloorDatetimes) +{ + cudf::datetime::floor_datetimes( + timestamps, cudf::datetime::rounding_frequency::HOUR, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, RoundDatetimes) +{ + cudf::datetime::round_datetimes( + timestamps, cudf::datetime::rounding_frequency::HOUR, cudf::test::get_default_stream()); +} diff --git a/cpp/tests/streams/join_test.cpp b/cpp/tests/streams/join_test.cpp new file mode 100644 index 00000000000..2811bb676fa --- /dev/null +++ b/cpp/tests/streams/join_test.cpp @@ -0,0 +1,219 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +class JoinTest : public cudf::test::BaseFixture { + static inline cudf::table make_table() + { + cudf::test::fixed_width_column_wrapper col0{{3, 1, 2, 0, 3}}; + cudf::test::strings_column_wrapper col1{{"s0", "s1", "s2", "s4", "s1"}}; + cudf::test::fixed_width_column_wrapper col2{{0, 1, 2, 4, 1}}; + + std::vector> columns; + columns.push_back(col0.release()); + columns.push_back(col1.release()); + columns.push_back(col2.release()); + + return cudf::table{std::move(columns)}; + } + + public: + cudf::table table0{make_table()}; + cudf::table table1{make_table()}; + cudf::table conditional0{make_table()}; + cudf::table conditional1{make_table()}; + cudf::ast::column_reference col_ref_left_0{0}; + cudf::ast::column_reference col_ref_right_0{0, cudf::ast::table_reference::RIGHT}; + cudf::ast::operation left_zero_eq_right_zero{ + cudf::ast::ast_operator::EQUAL, col_ref_left_0, col_ref_right_0}; +}; + +TEST_F(JoinTest, InnerJoin) +{ + cudf::inner_join(table0, table1, cudf::null_equality::EQUAL, cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, LeftJoin) +{ + cudf::left_join(table0, table1, cudf::null_equality::EQUAL, cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, FullJoin) +{ + cudf::full_join(table0, table1, cudf::null_equality::EQUAL, cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, LeftSemiJoin) +{ + cudf::left_semi_join( + table0, table1, cudf::null_equality::EQUAL, cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, LeftAntiJoin) +{ + cudf::left_anti_join( + table0, table1, cudf::null_equality::EQUAL, cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, CrossJoin) { cudf::cross_join(table0, table1, cudf::test::get_default_stream()); } + +TEST_F(JoinTest, ConditionalInnerJoin) +{ + cudf::conditional_inner_join( + table0, table1, left_zero_eq_right_zero, std::nullopt, cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, ConditionalLeftJoin) +{ + cudf::conditional_left_join( + table0, table1, left_zero_eq_right_zero, std::nullopt, cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, ConditionalFullJoin) +{ + cudf::conditional_full_join( + table0, table1, left_zero_eq_right_zero, cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, ConditionalLeftSemiJoin) +{ + cudf::conditional_left_semi_join( + table0, table1, left_zero_eq_right_zero, std::nullopt, cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, ConditionalLeftAntiJoin) +{ + cudf::conditional_left_anti_join( + table0, table1, left_zero_eq_right_zero, std::nullopt, cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, MixedInnerJoin) +{ + cudf::mixed_inner_join(table0, + table1, + conditional0, + conditional1, + left_zero_eq_right_zero, + cudf::null_equality::EQUAL, + std::nullopt, + cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, MixedLeftJoin) +{ + cudf::mixed_left_join(table0, + 
table1, + conditional0, + conditional1, + left_zero_eq_right_zero, + cudf::null_equality::EQUAL, + std::nullopt, + cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, MixedFullJoin) +{ + cudf::mixed_full_join(table0, + table1, + conditional0, + conditional1, + left_zero_eq_right_zero, + cudf::null_equality::EQUAL, + std::nullopt, + cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, MixedLeftSemiJoin) +{ + cudf::mixed_left_semi_join(table0, + table1, + conditional0, + conditional1, + left_zero_eq_right_zero, + cudf::null_equality::EQUAL, + cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, MixedLeftAntiJoin) +{ + cudf::mixed_left_anti_join(table0, + table1, + conditional0, + conditional1, + left_zero_eq_right_zero, + cudf::null_equality::EQUAL, + cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, MixedInnerJoinSize) +{ + cudf::mixed_inner_join_size(table0, + table1, + conditional0, + conditional1, + left_zero_eq_right_zero, + cudf::null_equality::EQUAL, + cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, MixedLeftJoinSize) +{ + cudf::mixed_left_join_size(table0, + table1, + conditional0, + conditional1, + left_zero_eq_right_zero, + cudf::null_equality::EQUAL, + cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, ConditionalInnerJoinSize) +{ + cudf::conditional_inner_join_size( + table0, table1, left_zero_eq_right_zero, cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, ConditionalLeftJoinSize) +{ + cudf::conditional_left_join_size( + table0, table1, left_zero_eq_right_zero, cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, ConditionalLeftSemiJoinSize) +{ + cudf::conditional_left_semi_join_size( + table0, table1, left_zero_eq_right_zero, cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, ConditionalLeftAntiJoinSize) +{ + cudf::conditional_left_anti_join_size( + table0, table1, left_zero_eq_right_zero, cudf::test::get_default_stream()); +} diff --git a/cpp/tests/strings/contains_tests.cpp b/cpp/tests/strings/contains_tests.cpp index c816316d0ff..acf850c7a66 100644 --- a/cpp/tests/strings/contains_tests.cpp +++ b/cpp/tests/strings/contains_tests.cpp @@ -14,6 +14,8 @@ * limitations under the License. 
*/ +#include "special_chars.h" + #include #include #include @@ -613,6 +615,63 @@ TEST_F(StringsContainsTests, MultiLine) CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_count); } +TEST_F(StringsContainsTests, SpecialNewLines) +{ + auto input = cudf::test::strings_column_wrapper({"zzé" LINE_SEPARATOR "qqq" NEXT_LINE "zzé", + "qqq\rzzé" LINE_SEPARATOR "lll", + "zzé", + "", + "zzé" PARAGRAPH_SEPARATOR, + "abc\nzzé" NEXT_LINE}); + auto view = cudf::strings_column_view(input); + + auto pattern = std::string("^zzé$"); + auto prog = + cudf::strings::regex_program::create(pattern, cudf::strings::regex_flags::EXT_NEWLINE); + auto ml_flags = static_cast(cudf::strings::regex_flags::EXT_NEWLINE | + cudf::strings::regex_flags::MULTILINE); + auto prog_ml = cudf::strings::regex_program::create(pattern, ml_flags); + + auto expected = cudf::test::fixed_width_column_wrapper({0, 0, 1, 0, 1, 0}); + auto results = cudf::strings::contains_re(view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + expected = cudf::test::fixed_width_column_wrapper({1, 1, 1, 0, 1, 1}); + results = cudf::strings::contains_re(view, *prog_ml); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + expected = cudf::test::fixed_width_column_wrapper({0, 0, 1, 0, 1, 0}); + results = cudf::strings::matches_re(view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + expected = cudf::test::fixed_width_column_wrapper({1, 0, 1, 0, 1, 0}); + results = cudf::strings::matches_re(view, *prog_ml); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + auto counts = cudf::test::fixed_width_column_wrapper({0, 0, 1, 0, 1, 0}); + results = cudf::strings::count_re(view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, counts); + counts = cudf::test::fixed_width_column_wrapper({2, 1, 1, 0, 1, 1}); + results = cudf::strings::count_re(view, *prog_ml); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, counts); + + pattern = std::string("q.*l"); + prog = cudf::strings::regex_program::create(pattern); + expected = cudf::test::fixed_width_column_wrapper({0, 1, 0, 0, 0, 0}); + results = cudf::strings::contains_re(view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + // inst ANY will stop matching on first 'newline' and so should not match anything here + prog = cudf::strings::regex_program::create(pattern, cudf::strings::regex_flags::EXT_NEWLINE); + expected = cudf::test::fixed_width_column_wrapper({0, 0, 0, 0, 0, 0}); + results = cudf::strings::contains_re(view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + // including the DOTALL flag accepts the newline characters + auto dot_flags = static_cast(cudf::strings::regex_flags::EXT_NEWLINE | + cudf::strings::regex_flags::DOTALL); + prog = cudf::strings::regex_program::create(pattern, dot_flags); + expected = cudf::test::fixed_width_column_wrapper({0, 1, 0, 0, 0, 0}); + results = cudf::strings::contains_re(view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); +} + TEST_F(StringsContainsTests, EndOfString) { auto input = cudf::test::strings_column_wrapper( diff --git a/cpp/tests/strings/extract_tests.cpp b/cpp/tests/strings/extract_tests.cpp index b26cbd5a549..1491da758d5 100644 --- a/cpp/tests/strings/extract_tests.cpp +++ b/cpp/tests/strings/extract_tests.cpp @@ -14,9 +14,12 @@ * limitations under the License. 
*/ +#include "special_chars.h" + #include #include #include +#include #include #include @@ -200,6 +203,43 @@ TEST_F(StringsExtractTests, DotAll) CUDF_TEST_EXPECT_TABLES_EQUAL(*results, expected); } +TEST_F(StringsExtractTests, SpecialNewLines) +{ + auto input = cudf::test::strings_column_wrapper({"zzé" NEXT_LINE "qqq" LINE_SEPARATOR "zzé", + "qqq" LINE_SEPARATOR "zzé\rlll", + "zzé", + "", + "zzé" NEXT_LINE, + "abc" PARAGRAPH_SEPARATOR "zzé\n"}); + auto view = cudf::strings_column_view(input); + + auto prog = + cudf::strings::regex_program::create("(^zzé$)", cudf::strings::regex_flags::EXT_NEWLINE); + auto results = cudf::strings::extract(view, *prog); + auto expected = + cudf::test::strings_column_wrapper({"", "", "zzé", "", "zzé", ""}, {0, 0, 1, 0, 1, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view().column(0), expected); + + auto both_flags = static_cast( + cudf::strings::regex_flags::EXT_NEWLINE | cudf::strings::regex_flags::MULTILINE); + auto prog_ml = cudf::strings::regex_program::create("^(zzé)$", both_flags); + results = cudf::strings::extract(view, *prog_ml); + expected = + cudf::test::strings_column_wrapper({"zzé", "zzé", "zzé", "", "zzé", "zzé"}, {1, 1, 1, 0, 1, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view().column(0), expected); + + prog = cudf::strings::regex_program::create("q(q.*l)l"); + expected = cudf::test::strings_column_wrapper({"", "qq" LINE_SEPARATOR "zzé\rll", "", "", "", ""}, + {0, 1, 0, 0, 0, 0}); + results = cudf::strings::extract(view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view().column(0), expected); + // expect no matches here since the newline(s) interrupts the pattern + prog = cudf::strings::regex_program::create("q(q.*l)l", cudf::strings::regex_flags::EXT_NEWLINE); + expected = cudf::test::strings_column_wrapper({"", "", "", "", "", ""}, {0, 0, 0, 0, 0, 0}); + results = cudf::strings::extract(view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view().column(0), expected); +} + TEST_F(StringsExtractTests, EmptyExtractTest) { std::vector h_strings{nullptr, "AAA", "AAA_A", "AAA_AAA_", "A__", ""}; diff --git a/cpp/tests/strings/findall_tests.cpp b/cpp/tests/strings/findall_tests.cpp index 4582dcb1e38..47606b9b3ed 100644 --- a/cpp/tests/strings/findall_tests.cpp +++ b/cpp/tests/strings/findall_tests.cpp @@ -14,6 +14,8 @@ * limitations under the License. 
*/ +#include "special_chars.h" + #include #include #include @@ -80,6 +82,32 @@ TEST_F(StringsFindallTests, DotAll) CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected); } +TEST_F(StringsFindallTests, SpecialNewLines) +{ + auto input = cudf::test::strings_column_wrapper({"zzé" PARAGRAPH_SEPARATOR "qqq\nzzé", + "qqq\nzzé" PARAGRAPH_SEPARATOR "lll", + "zzé", + "", + "zzé\r", + "zzé" LINE_SEPARATOR "zzé" NEXT_LINE}); + auto view = cudf::strings_column_view(input); + + auto prog = + cudf::strings::regex_program::create("(^zzé$)", cudf::strings::regex_flags::EXT_NEWLINE); + auto results = cudf::strings::findall(view, *prog); + using LCW = cudf::test::lists_column_wrapper; + LCW expected({LCW{}, LCW{}, LCW{"zzé"}, LCW{}, LCW{"zzé"}, LCW{}}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + + auto both_flags = static_cast( + cudf::strings::regex_flags::EXT_NEWLINE | cudf::strings::regex_flags::MULTILINE); + auto prog_ml = cudf::strings::regex_program::create("^(zzé)$", both_flags); + results = cudf::strings::findall(view, *prog_ml); + LCW expected_ml( + {LCW{"zzé", "zzé"}, LCW{"zzé"}, LCW{"zzé"}, LCW{}, LCW{"zzé"}, LCW{"zzé", "zzé"}}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected_ml); +} + TEST_F(StringsFindallTests, MediumRegex) { // This results in 15 regex instructions and falls in the 'medium' range. diff --git a/cpp/tests/strings/replace_regex_tests.cpp b/cpp/tests/strings/replace_regex_tests.cpp index 8c0482653fb..9847d8d6bb5 100644 --- a/cpp/tests/strings/replace_regex_tests.cpp +++ b/cpp/tests/strings/replace_regex_tests.cpp @@ -14,6 +14,8 @@ * limitations under the License. */ +#include "special_chars.h" + #include #include #include @@ -245,6 +247,53 @@ TEST_F(StringsReplaceRegexTest, Multiline) CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, br_expected); } +TEST_F(StringsReplaceRegexTest, SpecialNewLines) +{ + auto input = cudf::test::strings_column_wrapper({"zzé" NEXT_LINE "qqq" NEXT_LINE "zzé", + "qqq" NEXT_LINE "zzé" NEXT_LINE "lll", + "zzé", + "", + "zzé" PARAGRAPH_SEPARATOR, + "abc\rzzé\r"}); + auto view = cudf::strings_column_view(input); + auto repl = cudf::string_scalar("_"); + auto pattern = std::string("^zzé$"); + auto prog = + cudf::strings::regex_program::create(pattern, cudf::strings::regex_flags::EXT_NEWLINE); + auto results = cudf::strings::replace_re(view, *prog, repl); + auto expected = cudf::test::strings_column_wrapper({"zzé" NEXT_LINE "qqq" NEXT_LINE "zzé", + "qqq" NEXT_LINE "zzé" NEXT_LINE "lll", + "_", + "", + "_" PARAGRAPH_SEPARATOR, + "abc\rzzé\r"}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + + auto both_flags = static_cast( + cudf::strings::regex_flags::EXT_NEWLINE | cudf::strings::regex_flags::MULTILINE); + auto prog_ml = cudf::strings::regex_program::create(pattern, both_flags); + results = cudf::strings::replace_re(view, *prog_ml, repl); + expected = cudf::test::strings_column_wrapper({"_" NEXT_LINE "qqq" NEXT_LINE "_", + "qqq" NEXT_LINE "_" NEXT_LINE "lll", + "_", + "", + "_" PARAGRAPH_SEPARATOR, + "abc\r_\r"}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + + auto repl_template = std::string("[\\1]"); + pattern = std::string("(^zzé$)"); + prog = cudf::strings::regex_program::create(pattern, both_flags); + results = cudf::strings::replace_with_backrefs(view, *prog, repl_template); + expected = cudf::test::strings_column_wrapper({"[zzé]" NEXT_LINE "qqq" NEXT_LINE "[zzé]", + "qqq" NEXT_LINE "[zzé]" NEXT_LINE "lll", + "[zzé]", + "", + "[zzé]" PARAGRAPH_SEPARATOR, + "abc\r[zzé]\r"}); + 
CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); +} + TEST_F(StringsReplaceRegexTest, ReplaceBackrefsRegexTest) { std::vector h_strings{"the quick brown fox jumps over the lazy dog", diff --git a/cpp/tests/strings/special_chars.h b/cpp/tests/strings/special_chars.h new file mode 100644 index 00000000000..0d630f6bb52 --- /dev/null +++ b/cpp/tests/strings/special_chars.h @@ -0,0 +1,25 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +namespace cudf::test { + +// special new-line characters for use with regex_flags::EXT_NEWLINE +#define NEXT_LINE "\xC2\x85" +#define LINE_SEPARATOR "\xE2\x80\xA8" +#define PARAGRAPH_SEPARATOR "\xE2\x80\xA9" + +} // namespace cudf::test diff --git a/cpp/tests/text/minhash_tests.cpp b/cpp/tests/text/minhash_tests.cpp index 7575a3ba846..e23f3f6e7d8 100644 --- a/cpp/tests/text/minhash_tests.cpp +++ b/cpp/tests/text/minhash_tests.cpp @@ -139,6 +139,41 @@ TEST_F(MinHashTest, MultiSeedWithNullInputRow) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results64, expected64); } +TEST_F(MinHashTest, WordsMinHash) +{ + using LCWS = cudf::test::lists_column_wrapper; + auto validity = cudf::test::iterators::null_at(1); + + LCWS input( + {LCWS({"hello", "abcdéfgh"}), + LCWS{}, + LCWS({"rapids", "moré", "test", "text"}), + LCWS({"The", "quick", "brown", "fox", "jumpéd", "over", "the", "lazy", "brown", "dog"})}, + validity); + + auto view = cudf::lists_column_view(input); + + auto seeds = cudf::test::fixed_width_column_wrapper({1, 2}); + auto results = nvtext::word_minhash(view, cudf::column_view(seeds)); + using LCW32 = cudf::test::lists_column_wrapper; + LCW32 expected({LCW32{2069617641u, 1975382903u}, + LCW32{}, + LCW32{657297235u, 1010955999u}, + LCW32{644643885u, 310002789u}}, + validity); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + auto seeds64 = cudf::test::fixed_width_column_wrapper({11, 22}); + auto results64 = nvtext::word_minhash64(view, cudf::column_view(seeds64)); + using LCW64 = cudf::test::lists_column_wrapper; + LCW64 expected64({LCW64{1940333969930105370ul, 272615362982418219ul}, + LCW64{}, + LCW64{5331949571924938590ul, 2088583894581919741ul}, + LCW64{3400468157617183341ul, 2398577492366130055ul}}, + validity); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results64, expected64); +} + TEST_F(MinHashTest, EmptyTest) { auto input = cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}); diff --git a/dependencies.yaml b/dependencies.yaml index 7a13043cc5f..9c95b9f399f 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -43,15 +43,28 @@ files: includes: - cuda_version - test_cpp - test_python: + test_python_cudf_pandas: output: none includes: - cuda_version - py_version - test_python_common - test_python_cudf - - test_python_dask_cudf - test_python_cudf_pandas + test_python_cudf: + output: none + includes: + - cuda_version + - py_version + - test_python_common + - test_python_cudf + test_python_other: + output: none + includes: + - cuda_version + - py_version + - 
test_python_common + - test_python_dask_cudf test_java: output: none includes: @@ -350,12 +363,12 @@ dependencies: common: - output_types: conda packages: - - fmt>=10.1.1,<11 + - fmt>=11.0.2,<12 - flatbuffers==24.3.25 - librdkafka>=2.5.0,<2.6.0a0 # Align nvcomp version with rapids-cmake - nvcomp==4.0.1 - - spdlog>=1.12.0,<1.13 + - spdlog>=1.14.1,<1.15 rapids_build_skbuild: common: - output_types: [conda, requirements, pyproject] @@ -514,7 +527,7 @@ dependencies: - output_types: [conda] packages: - breathe>=4.35.0 - - dask-cuda==24.10.*,>=0.0.0a0 + - dask-cuda==24.12.*,>=0.0.0a0 - *doxygen - make - myst-nb @@ -650,12 +663,12 @@ dependencies: common: - output_types: [conda, requirements, pyproject] packages: - - polars>=1.0,<1.3 + - polars>=1.6 run_dask_cudf: common: - output_types: [conda, requirements, pyproject] packages: - - rapids-dask-dependency==24.10.*,>=0.0.0a0 + - rapids-dask-dependency==24.12.*,>=0.0.0a0 run_custreamz: common: - output_types: conda @@ -707,9 +720,7 @@ dependencies: - matrix: {dependencies: "oldest"} packages: - numba==0.57.* - - numpy==1.23.* - pandas==2.0.* - - pyarrow==14.0.0 - matrix: packages: - output_types: conda @@ -764,6 +775,14 @@ dependencies: - &transformers transformers==4.39.3 - tzdata specific: + - output_types: [conda, requirements] + matrices: + - matrix: {dependencies: "oldest"} + packages: + - numpy==1.23.* + - pyarrow==14.0.0 + - matrix: + packages: - output_types: conda matrices: - matrix: @@ -781,13 +800,22 @@ dependencies: common: - output_types: [conda, requirements, pyproject] packages: - - dask-cuda==24.10.*,>=0.0.0a0 + - dask-cuda==24.12.*,>=0.0.0a0 - *numba + specific: + - output_types: [conda, requirements] + matrices: + - matrix: {dependencies: "oldest"} + packages: + - numpy==1.24.* + - pyarrow==14.0.1 + - matrix: + packages: depends_on_libcudf: common: - output_types: conda packages: - - &libcudf_unsuffixed libcudf==24.10.*,>=0.0.0a0 + - &libcudf_unsuffixed libcudf==24.12.*,>=0.0.0a0 - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -801,18 +829,18 @@ dependencies: cuda: "12.*" cuda_suffixed: "true" packages: - - libcudf-cu12==24.10.*,>=0.0.0a0 + - libcudf-cu12==24.12.*,>=0.0.0a0 - matrix: cuda: "11.*" cuda_suffixed: "true" packages: - - libcudf-cu11==24.10.*,>=0.0.0a0 + - libcudf-cu11==24.12.*,>=0.0.0a0 - {matrix: null, packages: [*libcudf_unsuffixed]} depends_on_pylibcudf: common: - output_types: conda packages: - - &pylibcudf_unsuffixed pylibcudf==24.10.*,>=0.0.0a0 + - &pylibcudf_unsuffixed pylibcudf==24.12.*,>=0.0.0a0 - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -826,18 +854,18 @@ dependencies: cuda: "12.*" cuda_suffixed: "true" packages: - - pylibcudf-cu12==24.10.*,>=0.0.0a0 + - pylibcudf-cu12==24.12.*,>=0.0.0a0 - matrix: cuda: "11.*" cuda_suffixed: "true" packages: - - pylibcudf-cu11==24.10.*,>=0.0.0a0 + - pylibcudf-cu11==24.12.*,>=0.0.0a0 - {matrix: null, packages: [*pylibcudf_unsuffixed]} depends_on_cudf: common: - output_types: conda packages: - - &cudf_unsuffixed cudf==24.10.*,>=0.0.0a0 + - &cudf_unsuffixed cudf==24.12.*,>=0.0.0a0 - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -851,18 +879,18 @@ dependencies: cuda: "12.*" cuda_suffixed: "true" packages: - - cudf-cu12==24.10.*,>=0.0.0a0 + - cudf-cu12==24.12.*,>=0.0.0a0 - matrix: cuda: "11.*" cuda_suffixed: "true" packages: - - cudf-cu11==24.10.*,>=0.0.0a0 + - 
cudf-cu11==24.12.*,>=0.0.0a0 - {matrix: null, packages: [*cudf_unsuffixed]} depends_on_cudf_kafka: common: - output_types: conda packages: - - &cudf_kafka_unsuffixed cudf_kafka==24.10.*,>=0.0.0a0 + - &cudf_kafka_unsuffixed cudf_kafka==24.12.*,>=0.0.0a0 - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -876,12 +904,12 @@ dependencies: cuda: "12.*" cuda_suffixed: "true" packages: - - cudf_kafka-cu12==24.10.*,>=0.0.0a0 + - cudf_kafka-cu12==24.12.*,>=0.0.0a0 - matrix: cuda: "11.*" cuda_suffixed: "true" packages: - - cudf_kafka-cu11==24.10.*,>=0.0.0a0 + - cudf_kafka-cu11==24.12.*,>=0.0.0a0 - {matrix: null, packages: [*cudf_kafka_unsuffixed]} depends_on_cupy: common: @@ -902,7 +930,7 @@ dependencies: common: - output_types: conda packages: - - &libkvikio_unsuffixed libkvikio==24.10.*,>=0.0.0a0 + - &libkvikio_unsuffixed libkvikio==24.12.*,>=0.0.0a0 - output_types: requirements packages: - --extra-index-url=https://pypi.nvidia.com @@ -914,12 +942,12 @@ dependencies: cuda: "12.*" cuda_suffixed: "true" packages: - - libkvikio-cu12==24.10.*,>=0.0.0a0 + - libkvikio-cu12==24.12.*,>=0.0.0a0 - matrix: cuda: "11.*" cuda_suffixed: "true" packages: - - libkvikio-cu11==24.10.*,>=0.0.0a0 + - libkvikio-cu11==24.12.*,>=0.0.0a0 - matrix: packages: - *libkvikio_unsuffixed @@ -927,7 +955,7 @@ dependencies: common: - output_types: conda packages: - - &librmm_unsuffixed librmm==24.10.*,>=0.0.0a0 + - &librmm_unsuffixed librmm==24.12.*,>=0.0.0a0 - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -941,12 +969,12 @@ dependencies: cuda: "12.*" cuda_suffixed: "true" packages: - - librmm-cu12==24.10.*,>=0.0.0a0 + - librmm-cu12==24.12.*,>=0.0.0a0 - matrix: cuda: "11.*" cuda_suffixed: "true" packages: - - librmm-cu11==24.10.*,>=0.0.0a0 + - librmm-cu11==24.12.*,>=0.0.0a0 - matrix: packages: - *librmm_unsuffixed @@ -954,7 +982,7 @@ dependencies: common: - output_types: conda packages: - - &rmm_unsuffixed rmm==24.10.*,>=0.0.0a0 + - &rmm_unsuffixed rmm==24.12.*,>=0.0.0a0 - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -968,12 +996,12 @@ dependencies: cuda: "12.*" cuda_suffixed: "true" packages: - - rmm-cu12==24.10.*,>=0.0.0a0 + - rmm-cu12==24.12.*,>=0.0.0a0 - matrix: cuda: "11.*" cuda_suffixed: "true" packages: - - rmm-cu11==24.10.*,>=0.0.0a0 + - rmm-cu11==24.12.*,>=0.0.0a0 - matrix: packages: - *rmm_unsuffixed diff --git a/docs/cudf/source/_static/Polars_GPU_speedup_80GB.png b/docs/cudf/source/_static/Polars_GPU_speedup_80GB.png new file mode 100644 index 00000000000..e472cf66612 Binary files /dev/null and b/docs/cudf/source/_static/Polars_GPU_speedup_80GB.png differ diff --git a/docs/cudf/source/_static/compute_heavy_queries_polars.png b/docs/cudf/source/_static/compute_heavy_queries_polars.png new file mode 100644 index 00000000000..6854ed5a436 Binary files /dev/null and b/docs/cudf/source/_static/compute_heavy_queries_polars.png differ diff --git a/docs/cudf/source/_static/pds_benchmark_polars.png b/docs/cudf/source/_static/pds_benchmark_polars.png new file mode 100644 index 00000000000..d0b48ab2901 Binary files /dev/null and b/docs/cudf/source/_static/pds_benchmark_polars.png differ diff --git a/docs/cudf/source/cudf_pandas/usage.md b/docs/cudf/source/cudf_pandas/usage.md index 0398a8d7086..41838e01dd9 100644 --- a/docs/cudf/source/cudf_pandas/usage.md +++ b/docs/cudf/source/cudf_pandas/usage.md @@ -120,3 +120,23 @@ To 
profile a script being run from the command line, pass the ```bash python -m cudf.pandas --profile script.py ``` + +### cudf.pandas CLI Features + +Several of the ways to provide input to the `python` interpreter also work with `python -m cudf.pandas`, such as the REPL, the `-c` flag, and reading from stdin. + +Executing `python -m cudf.pandas` with no script name will enter a REPL (read-eval-print loop) similar to the behavior of the normal `python` interpreter. + +The `-c` flag accepts a code string to run, like this: + +```bash +$ python -m cudf.pandas -c "import pandas; print(pandas)" +<module 'pandas' (ModuleAccelerator(fast=cudf, slow=pandas))> +``` + +Users can also provide code to execute from stdin, like this: + +```bash +$ echo "import pandas; print(pandas)" | python -m cudf.pandas +<module 'pandas' (ModuleAccelerator(fast=cudf, slow=pandas))> +``` diff --git a/docs/cudf/source/cudf_polars/index.rst b/docs/cudf/source/cudf_polars/index.rst new file mode 100644 index 00000000000..0a3a0d86b2c --- /dev/null +++ b/docs/cudf/source/cudf_polars/index.rst @@ -0,0 +1,41 @@ +Polars GPU engine +================= + +cuDF provides an in-memory, GPU-accelerated execution engine for Python users of the Polars Lazy API. +The engine supports most of the core expressions and data types as well as a growing set of more advanced dataframe manipulations +and data file formats. When using the GPU engine, Polars will convert expressions into an optimized query plan and determine +whether the plan is supported on the GPU. If it is not, the execution will transparently fall back to the standard Polars engine +and run on the CPU. + +Benchmark +--------- +We reproduced the `Polars Decision Support (PDS) `__ benchmark to compare the Polars GPU engine with the default CPU settings across several dataset sizes. Here are the results: + +.. figure:: ../_static/pds_benchmark_polars.png + :width: 600px + + + +You can see up to a 13x speedup using the GPU engine on the compute-heavy PDS queries involving complex aggregation and join operations. Below are the speedups for the top-performing queries: + + +.. figure:: ../_static/compute_heavy_queries_polars.png + :width: 1000px + +:emphasis:`PDS-H benchmark | GPU: NVIDIA H100 PCIe | CPU: Intel Xeon W9-3495X (Sapphire Rapids) | Storage: Local NVMe` + +You can reproduce the results by visiting the `Polars Decision Support (PDS) GitHub repository `__. + +Learn More +---------- + +The GPU engine for Polars is now available in Open Beta and the engine is undergoing rapid development. To learn more, visit the `GPU Support page `__ on the Polars website. + +Launch on Google Colab +---------------------- + +.. figure:: ../_static/colab.png + :width: 200px + :target: https://colab.research.google.com/github/rapidsai-community/showcase/blob/main/accelerated_data_processing_examples/polars_gpu_engine_demo.ipynb + + Try out the GPU engine for Polars in a free GPU notebook environment. Sign in with your Google account and `launch the demo on Colab `__.
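As a minimal illustration of the engine described in the new page above (and consistent with the `polars>=1.6` requirement this change adds to dependencies.yaml), here is a hedged usage sketch; the CSV path and column names are hypothetical:

```python
# Minimal sketch: running a Polars lazy query on the GPU engine.
# Assumes polars>=1.6 with GPU support (cudf-polars) is installed;
# "transactions.csv", "account", and "amount" are hypothetical.
import polars as pl

q = (
    pl.scan_csv("transactions.csv")
    .group_by("account")
    .agg(pl.col("amount").sum())
)

# Request the GPU engine; unsupported plans fall back to the CPU engine.
result = q.collect(engine="gpu")

# The same request with an explicit engine object (e.g. to select a device).
result = q.collect(engine=pl.GPUEngine(device=0))
print(result)
```

Note that `collect(engine="gpu")` and `pl.GPUEngine` are the entry points documented by Polars at the time of writing; when a plan is unsupported, the collect call simply runs on the CPU engine instead of raising.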
diff --git a/docs/cudf/source/index.rst b/docs/cudf/source/index.rst index 3b8dfa5fe01..1b86cafeb48 100644 --- a/docs/cudf/source/index.rst +++ b/docs/cudf/source/index.rst @@ -29,5 +29,6 @@ other operations. user_guide/index cudf_pandas/index + cudf_polars/index libcudf_docs/index developer_guide/index diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/extract.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/extract.rst new file mode 100644 index 00000000000..06f74a38709 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/extract.rst @@ -0,0 +1,6 @@ +======= +extract +======= + +.. automodule:: pylibcudf.strings.extract + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/findall.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/findall.rst new file mode 100644 index 00000000000..9850ee10098 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/findall.rst @@ -0,0 +1,6 @@ +======= +findall +======= + +.. automodule:: pylibcudf.strings.findall + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst index 462a756a092..9b1a6b72a88 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst @@ -7,8 +7,12 @@ strings capitalize char_types contains + extract find + findall regex_flags regex_program + repeat replace slice + strip diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/repeat.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/repeat.rst new file mode 100644 index 00000000000..0041fe4c3da --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/repeat.rst @@ -0,0 +1,6 @@ +====== +repeat +====== + +.. automodule:: pylibcudf.strings.repeat + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/strip.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/strip.rst new file mode 100644 index 00000000000..a79774b8e67 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/strip.rst @@ -0,0 +1,6 @@ +===== +strip +===== + +.. automodule:: pylibcudf.strings.strip + :members: diff --git a/docs/dask_cudf/source/best_practices.rst b/docs/dask_cudf/source/best_practices.rst new file mode 100644 index 00000000000..142124163af --- /dev/null +++ b/docs/dask_cudf/source/best_practices.rst @@ -0,0 +1,320 @@ +.. _best-practices: + +Dask cuDF Best Practices +======================== + +This page outlines several important guidelines for using `Dask cuDF +`__ effectively. + +.. note:: + Since Dask cuDF is a backend extension for + `Dask DataFrame `__, + the guidelines discussed in the `Dask DataFrames Best Practices + `__ + documentation also apply to Dask cuDF (excluding any pandas-specific + details). + + +Deployment and Configuration +---------------------------- + +Use Dask-CUDA +~~~~~~~~~~~~~ + +To execute a Dask workflow on multiple GPUs, a Dask cluster must +be deployed with `Dask-CUDA `__ +and `Dask.distributed `__. + +When running on a single machine, the `LocalCUDACluster `__ +convenience function is strongly recommended. No matter how many GPUs are +available on the machine (even one!), using `Dask-CUDA has many advantages +`__ +over default (threaded) execution. Just to list a few: + +* Dask-CUDA makes it easy to pin workers to specific devices. +* Dask-CUDA makes it easy to configure memory-spilling options. +* The distributed scheduler collects useful diagnostic information that can be viewed on a dashboard in real time. + +Please see `Dask-CUDA's API `__ +and `Best Practices `__ +documentation for detailed information. Typical ``LocalCUDACluster`` usage +is also illustrated within the multi-GPU section of `Dask cuDF's +`__ documentation. + +.. note:: + When running on cloud infrastructure or HPC systems, it is usually best to + leverage system-specific deployment libraries like `Dask Operator + `__ and `Dask-Jobqueue + `__. + + Please see `the RAPIDS deployment documentation `__ + for further details and examples.
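To make the deployment guidance above concrete, here is a minimal single-machine sketch, assuming ``dask-cuda`` and ``dask.distributed`` are installed. The spilling and pool settings anticipate the "Enable cuDF spilling" and "Use RMM" sections that follow, and the ``0.9`` pool fraction mirrors the example given there:

```python
# Minimal sketch: a single-machine Dask-CUDA cluster with one worker per
# visible GPU. The spilling and RMM settings follow the sections below.
from dask.distributed import Client
from dask_cuda import LocalCUDACluster

if __name__ == "__main__":
    cluster = LocalCUDACluster(
        enable_cudf_spill=True,  # native cuDF spilling ("Enable cuDF spilling")
        rmm_pool_size=0.9,       # RMM pool as a fraction of GPU memory ("Use RMM")
    )
    client = Client(cluster)
    print(client.dashboard_link)  # diagnostics dashboard ("Use diagnostic tools")
```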
+ + +Use diagnostic tools +~~~~~~~~~~~~~~~~~~~~ + +The Dask ecosystem includes several diagnostic tools that you should absolutely use. +These tools include an intuitive `browser dashboard +`__ as well as a dedicated +`API for collecting performance profiles +`__. + +No matter the workflow, using the dashboard is strongly recommended. +It provides a visual representation of the worker resources and compute +progress. It also shows basic GPU memory and utilization metrics (under +the ``GPU`` tab). To visualize more detailed GPU metrics in JupyterLab, +use `NVDashboard `__. + + +Enable cuDF spilling +~~~~~~~~~~~~~~~~~~~~ + +When using Dask cuDF for classic ETL workloads, it is usually best +to enable `native spilling support in cuDF +`__. +When using :func:`LocalCUDACluster`, this is easily accomplished by +setting ``enable_cudf_spill=True``. + +When a Dask cuDF workflow includes conversion between DataFrame and Array +representations, native cuDF spilling may be insufficient. For these cases, +`JIT-unspill `__ +is likely to produce better protection from out-of-memory (OOM) errors. +Please see `Dask-CUDA's spilling documentation +`__ for further details +and guidance. + +Use RMM +~~~~~~~ + +Memory allocations in cuDF are significantly faster and more efficient when +the `RAPIDS Memory Manager (RMM) `__ +library is configured appropriately on worker processes. In most cases, the best way to manage +memory is by initializing an RMM pool on each worker before executing a +workflow. When using :func:`LocalCUDACluster`, this is easily accomplished +by setting ``rmm_pool_size`` to a large fraction (e.g. ``0.9``). + +See the `Dask-CUDA memory-management documentation +`__ +for more details. + +Use the Dask DataFrame API +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Although Dask cuDF provides a public ``dask_cudf`` Python module, we +strongly recommend that you use the CPU/GPU portable ``dask.dataframe`` +API instead. Simply `use the Dask configuration system +`__ +to set the ``"dataframe.backend"`` option to ``"cudf"``, and the +``dask_cudf`` module will be imported and used implicitly. + +Be sure to use the :func:`to_backend` method if you need to convert +between the different DataFrame backends. For example:: + + df = df.to_backend("pandas") # This gives us a pandas-backed collection + +.. note:: + Although :func:`to_backend` makes it easy to move data between pandas + and cuDF, repetitive CPU-GPU data movement can degrade performance + significantly. For optimal results, keep your data on the GPU as much + as possible.
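Tying the backend configuration together, here is a minimal sketch of the portable API with the cuDF backend; the Parquet path and column names are hypothetical:

```python
# Minimal sketch: the CPU/GPU-portable Dask DataFrame API with the cuDF
# backend. "transactions.parquet", "account", and "amount" are hypothetical.
import dask
import dask.dataframe as dd

dask.config.set({"dataframe.backend": "cudf"})

df = dd.read_parquet("transactions.parquet")    # cudf-backed collection
result = df.groupby("account")["amount"].sum()  # lazy; nothing executes yet
print(result.compute())                         # materialize a small result
```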
+ +Avoid eager execution +~~~~~~~~~~~~~~~~~~~~~ + +Although Dask DataFrame collections are lazy by default, there are several +notable methods that will result in the immediate execution of the +underlying task graph: + +:func:`compute`: Calling ``ddf.compute()`` will materialize the result of +``ddf`` and return a single cuDF object. This is done by executing the entire +task graph associated with ``ddf`` and concatenating its partitions in +local memory on the client process. + +.. note:: + Never call :func:`compute` on a large collection that cannot fit comfortably + in the memory of a single GPU! + +:func:`persist`: Like :func:`compute`, calling ``ddf.persist()`` will +execute the entire task graph associated with ``ddf``. The most important +difference is that the computed partitions will remain in distributed +worker memory instead of being concatenated together on the client process. +Another difference is that :func:`persist` will return immediately when +executing on a distributed cluster. If you need a blocking synchronization +point in your workflow, simply use the :func:`wait` function:: + + ddf = ddf.persist() + wait(ddf) + +.. note:: + Avoid calling :func:`persist` on a large collection that cannot fit comfortably + in global worker memory. If the total sum of the partition sizes is larger + than the sum of all GPU memory, calling persist will result in significant + spilling from device memory. If the individual partition sizes are large, this + is likely to produce an OOM error. + +:func:`len` / :func:`head` / :func:`tail`: Although these operations are used +often within pandas/cuDF code to quickly inspect data, it is best to avoid +them in Dask DataFrame. In most cases, these operations will execute some or all +of the underlying task graph to materialize the collection. + +:func:`sort_values` / :func:`set_index`: These operations both require Dask to +eagerly collect quantile information about the column(s) being targeted by the +global sort operation. See `Avoid Sorting`_ for notes on sorting considerations. + +.. note:: + When using :func:`set_index`, be sure to pass in ``sort=False`` whenever the + global collection does not **need** to be sorted by the new index. + +Avoid Sorting +~~~~~~~~~~~~~ + +`The design of Dask DataFrame `__ +makes it advantageous to work with data that is already sorted along its index at +creation time. For most other cases, it is best to avoid sorting unless the logic +of the workflow makes global ordering absolutely necessary. + +If the purpose of a :func:`sort_values` operation is to ensure that all unique +values in ``by`` will be moved to the same output partition, then `shuffle +`__ +is often the better option. + + +Reading Data +------------ + +Tune the partition size +~~~~~~~~~~~~~~~~~~~~~~~ + +The ideal partition size is usually between 1/32 and 1/8 the memory +capacity of a single GPU. Increasing the partition size will typically +reduce the number of tasks in your workflow and improve the GPU utilization +for each task. However, if the partitions are too large, the risk of OOM +errors can become significant. + +.. note:: + As a general rule of thumb, start with 1/32-1/16 for shuffle-intensive workflows + (e.g. large-scale sorting and joining), and 1/16-1/8 otherwise. For pathologically + skewed data distributions, it may be necessary to target 1/64 or smaller. + This rule of thumb comes from anecdotal optimization and OOM-debugging + experience. Since every workflow is different, choosing the best partition + size is both an art and a science. + +The easiest way to tune the partition size is when the DataFrame collection +is first created by a function like :func:`read_parquet`, :func:`read_csv`, +or :func:`from_map`. For example, both :func:`read_parquet` and :func:`read_csv` +expose a ``blocksize`` argument for adjusting the maximum partition size. + +If the partition size cannot be tuned effectively at creation time, the +`repartition `__ +method can be used as a last resort.
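A short sketch of both tuning paths just described; the dataset path is hypothetical, and the ``"2.5GiB"`` target assumes roughly 1/16 of a hypothetical 40GiB GPU:

```python
# Minimal sketch: set the partition size at creation time via ``blocksize``,
# and fall back to repartition() only when that is not possible.
import dask.dataframe as dd

# Roughly 1/16 of a 40GiB GPU per partition
df = dd.read_parquet("dataset/", blocksize="2.5GiB")

# Last resort: adjust partition sizes after creation
df = df.repartition(partition_size="2.5GiB")
```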
+ + +Use Parquet +~~~~~~~~~~~ + +`Parquet `__ is the recommended +file format for Dask cuDF. It provides efficient columnar storage and enables +Dask to perform valuable query optimizations like column projection and +predicate pushdown. + +The most important arguments to :func:`read_parquet` are ``blocksize`` and +``aggregate_files``: + +``blocksize``: Use this argument to specify the maximum partition size. +The default is `"256 MiB"`, but larger values are usually more performant +on GPUs with more than 8 GiB of memory. Dask will use the ``blocksize`` +value to map a discrete number of Parquet row-groups (or files) to each +output partition. This mapping will only account for the uncompressed +storage size of each row group, which is usually smaller than the +corresponding ``cudf.DataFrame``. + +``aggregate_files``: Use this argument to specify whether Dask should +map multiple files to the same DataFrame partition. The default is +``False``, but ``aggregate_files=True`` is usually more performant when +the dataset contains many files that are smaller than half of ``blocksize``. + +If you know that your files correspond to a reasonable partition size +before splitting or aggregation, set ``blocksize=None`` to disallow +file splitting. In the absence of column-projection pushdown, this will +result in a simple 1-to-1 mapping between files and output partitions. + +.. note:: + If your workflow requires a strict 1-to-1 mapping between files and + partitions, use :func:`from_map` to manually construct your partitions + with ``cudf.read_parquet``. When :func:`dd.read_parquet` is used, + query-planning optimizations may automatically aggregate distinct files + into the same partition (even when ``aggregate_files=False``). + +.. note:: + Metadata collection can be extremely slow when reading from remote + storage (e.g. S3 and GCS). When reading many remote files that all + correspond to a reasonable partition size, use ``blocksize=None`` + to avoid unnecessary metadata collection. + + +Use :func:`from_map` +~~~~~~~~~~~~~~~~~~~~ + +To implement custom DataFrame-creation logic that is not covered by +existing APIs (like :func:`read_parquet`), use :func:`dask.dataframe.from_map` +whenever possible. The :func:`from_map` API has several advantages +over :func:`from_delayed`: + +* It allows proper lazy execution of your custom logic +* It enables column projection (as long as the mapped function supports a ``columns`` keyword argument) + +See the `from_map API documentation `__ +for more details. + +.. note:: + Whenever possible, be sure to specify the ``meta`` argument to + :func:`from_map`. If this argument is excluded, Dask will need to + materialize the first partition eagerly. If a large RMM pool is in + use on the first visible device, this eager execution on the client + may lead to an OOM error. + + +Sorting, Joining, and Grouping +------------------------------ + +Sorting, joining, and grouping operations all have the potential to +require the global shuffling of data between distinct partitions. +When the initial data fits comfortably in global GPU memory, these +"all-to-all" operations are typically bound by worker-to-worker +communication. When the data is larger than global GPU memory, the +bottleneck is typically device-to-host memory spilling. + +Although every workflow is different, the following guidelines +are often recommended: + +* `Use a distributed cluster with Dask-CUDA workers `_ +* `Use native cuDF spilling whenever possible `_ +* Avoid shuffling whenever possible + * Use ``split_out=1`` for low-cardinality groupby aggregations + * Use ``broadcast=True`` for joins when at least one collection comprises a small number of partitions (e.g. ``<=5``) +* `Use UCX `__ if communication is a bottleneck. + +.. note:: + UCX enables Dask-CUDA workers to communicate using high-performance + transport technologies like `NVLink `__ + and InfiniBand.
Without UCX, inter-process communication will rely + on TCP sockets. + + +User-defined functions +---------------------- + +Most real-world Dask DataFrame workflows use `map_partitions +`__ +to map user-defined functions across every partition of the underlying data. +This API is a fantastic way to apply custom operations in an intuitive and +scalable fashion. With that said, the :func:`map_partitions` method will produce +an opaque DataFrame expression that blocks the query-planning `optimizer +`__ from performing +useful optimizations (like projection and filter pushdown). + +Since column-projection pushdown is often the most effective optimization, +it is important to select the necessary columns both before and after calling +:func:`map_partitions`. You can also add explicit filter operations to further +mitigate the loss of filter pushdown. diff --git a/docs/dask_cudf/source/index.rst b/docs/dask_cudf/source/index.rst index 7fe6cbd45fa..23ca7e49753 100644 --- a/docs/dask_cudf/source/index.rst +++ b/docs/dask_cudf/source/index.rst @@ -15,7 +15,7 @@ as the ``"cudf"`` dataframe backend for .. note:: Neither Dask cuDF nor Dask DataFrame provides support for multi-GPU or multi-node execution on their own. You must also deploy a - `dask.distributed ` cluster + `dask.distributed `__ cluster to leverage multiple GPUs. We strongly recommend using `Dask-CUDA `__ to simplify the setup of the cluster, taking advantage of all features of the GPU @@ -29,6 +29,10 @@ minutes to Dask by `10 minutes to cuDF and Dask cuDF `__. +After reviewing the sections below, please see the +:ref:`Best Practices <best-practices>` page for further guidance on +using Dask cuDF effectively. + Using Dask cuDF --------------- @@ -36,7 +40,7 @@ Using Dask cuDF The Dask DataFrame API (Recommended) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Simply use the `Dask configuration ` system to +Simply use the `Dask configuration `__ system to set the ``"dataframe.backend"`` option to ``"cudf"``. From Python, this can be achieved like so:: @@ -50,14 +54,14 @@ environment before running your code. Once this is done, the public Dask DataFrame API will leverage ``cudf`` automatically when a new DataFrame collection is created from an on-disk format using any of the following ``dask.dataframe`` -functions:: +functions: -* :func:`dask.dataframe.read_parquet` -* :func:`dask.dataframe.read_json` -* :func:`dask.dataframe.read_csv` -* :func:`dask.dataframe.read_orc` -* :func:`dask.dataframe.read_hdf` -* :func:`dask.dataframe.from_dict` +* :func:`read_parquet` +* :func:`read_json` +* :func:`read_csv` +* :func:`read_orc` +* :func:`read_hdf` +* :func:`from_dict` For example:: @@ -112,8 +116,8 @@ performance benefit over the CPU/GPU-portable ``dask.dataframe`` API. Also, using some parts of the explicit API is incompatible with automatic query planning (see the next section). -The explicit Dask cuDF API -~~~~~~~~~~~~~~~~~~~~~~~~~~ +Query Planning +~~~~~~~~~~~~~~ Dask cuDF now provides automatic query planning by default (RAPIDS 24.06+). As long as the ``"dataframe.query-planning"`` configuration is set to diff --git a/java/ci/README.md b/java/ci/README.md index ccb9efb50b6..95b93698cae 100644 --- a/java/ci/README.md +++ b/java/ci/README.md @@ -34,7 +34,7 @@ nvidia-docker run -it cudf-build:11.8.0-devel-rocky8 bash You can download the cuDF repo in the docker container or you can mount it into the container. Here I choose to download again in the container.
```bash -git clone --recursive https://github.com/rapidsai/cudf.git -b branch-24.10 +git clone --recursive https://github.com/rapidsai/cudf.git -b branch-24.12 ``` ### Build cuDF jar with devtoolset @@ -47,4 +47,4 @@ scl enable gcc-toolset-11 "java/ci/build-in-docker.sh" ### The output -You can find the cuDF jar in java/target/ like cudf-24.10.0-SNAPSHOT-cuda11.jar. +You can find the cuDF jar in java/target/ like cudf-24.12.0-SNAPSHOT-cuda11.jar. diff --git a/java/pom.xml b/java/pom.xml index e4f1cdf64e7..450cfbdbc84 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -21,7 +21,7 @@ ai.rapids cudf - 24.10.0-SNAPSHOT + 24.12.0-SNAPSHOT cudfjni diff --git a/java/src/main/java/ai/rapids/cudf/JSONOptions.java b/java/src/main/java/ai/rapids/cudf/JSONOptions.java index c8308ca17ec..2bb74c3e3b1 100644 --- a/java/src/main/java/ai/rapids/cudf/JSONOptions.java +++ b/java/src/main/java/ai/rapids/cudf/JSONOptions.java @@ -38,6 +38,8 @@ public final class JSONOptions extends ColumnFilterOptions { private final boolean allowLeadingZeros; private final boolean allowNonNumericNumbers; private final boolean allowUnquotedControlChars; + private final boolean cudfPruneSchema; + private final byte lineDelimiter; private JSONOptions(Builder builder) { super(builder); @@ -52,6 +54,16 @@ private JSONOptions(Builder builder) { allowLeadingZeros = builder.allowLeadingZeros; allowNonNumericNumbers = builder.allowNonNumericNumbers; allowUnquotedControlChars = builder.allowUnquotedControlChars; + cudfPruneSchema = builder.cudfPruneSchema; + lineDelimiter = builder.lineDelimiter; + } + + public boolean shouldCudfPruneSchema() { + return cudfPruneSchema; + } + + public byte getLineDelimiter() { + return lineDelimiter; } public boolean isDayFirst() { @@ -123,6 +135,22 @@ public static final class Builder extends ColumnFilterOptions.Builder Byte.MAX_VALUE) { + throw new IllegalArgumentException("Only basic ASCII values are supported as line delimiters " + delimiter); + } + lineDelimiter = (byte)delimiter; + return this; + } + /** * Should json validation be strict or not */ diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index 09da43374ae..6d370ca27b2 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -258,7 +258,9 @@ private static native long readJSON(int[] numChildren, String[] columnNames, boolean strictValidation, boolean allowLeadingZeros, boolean allowNonNumericNumbers, - boolean allowUnquotedControl) throws CudfException; + boolean allowUnquotedControl, + boolean pruneColumns, + byte lineDelimiter) throws CudfException; private static native long readJSONFromDataSource(int[] numChildren, String[] columnNames, int[] dTypeIds, int[] dTypeScales, @@ -272,6 +274,8 @@ private static native long readJSONFromDataSource(int[] numChildren, String[] co boolean allowLeadingZeros, boolean allowNonNumericNumbers, boolean allowUnquotedControl, + boolean pruneColumns, + byte lineDelimiter, long dsHandle) throws CudfException; private static native long readAndInferJSONFromDataSource(boolean dayFirst, boolean lines, @@ -284,6 +288,7 @@ private static native long readAndInferJSONFromDataSource(boolean dayFirst, bool boolean allowLeadingZeros, boolean allowNonNumericNumbers, boolean allowUnquotedControl, + byte lineDelimiter, long dsHandle) throws CudfException; private static native long readAndInferJSON(long address, long length, @@ -297,7 +302,8 @@ private static native long readAndInferJSON(long address, long 
length, boolean strictValidation, boolean allowLeadingZeros, boolean allowNonNumericNumbers, - boolean allowUnquotedControl) throws CudfException; + boolean allowUnquotedControl, + byte lineDelimiter) throws CudfException; /** * Read in Parquet formatted data. @@ -1308,6 +1314,10 @@ private static Table gatherJSONColumns(Schema schema, TableWithMeta twm, int emp * @return the file parsed as a table on the GPU. */ public static Table readJSON(Schema schema, JSONOptions opts, File path) { + // only prune the schema if one is provided + boolean cudfPruneSchema = schema.getColumnNames() != null && + schema.getColumnNames().length != 0 && + opts.shouldCudfPruneSchema(); try (TableWithMeta twm = new TableWithMeta( readJSON(schema.getFlattenedNumChildren(), schema.getFlattenedColumnNames(), schema.getFlattenedTypeIds(), schema.getFlattenedTypeScales(), @@ -1321,7 +1331,9 @@ public static Table readJSON(Schema schema, JSONOptions opts, File path) { opts.strictValidation(), opts.leadingZerosAllowed(), opts.nonNumericNumbersAllowed(), - opts.unquotedControlChars()))) { + opts.unquotedControlChars(), + cudfPruneSchema, + opts.getLineDelimiter()))) { return gatherJSONColumns(schema, twm, -1); } @@ -1404,7 +1416,8 @@ public static TableWithMeta readJSON(JSONOptions opts, HostMemoryBuffer buffer, opts.strictValidation(), opts.leadingZerosAllowed(), opts.nonNumericNumbersAllowed(), - opts.unquotedControlChars())); + opts.unquotedControlChars(), + opts.getLineDelimiter())); } /** @@ -1426,6 +1439,7 @@ public static TableWithMeta readAndInferJSON(JSONOptions opts, DataSource ds) { opts.leadingZerosAllowed(), opts.nonNumericNumbersAllowed(), opts.unquotedControlChars(), + opts.getLineDelimiter(), dsHandle)); return twm; } finally { @@ -1465,6 +1479,10 @@ public static Table readJSON(Schema schema, JSONOptions opts, HostMemoryBuffer b assert len > 0; assert len <= buffer.length - offset; assert offset >= 0 && offset < buffer.length; + // only prune the schema if one is provided + boolean cudfPruneSchema = schema.getColumnNames() != null && + schema.getColumnNames().length != 0 && + opts.shouldCudfPruneSchema(); try (TableWithMeta twm = new TableWithMeta(readJSON( schema.getFlattenedNumChildren(), schema.getFlattenedColumnNames(), schema.getFlattenedTypeIds(), schema.getFlattenedTypeScales(), null, @@ -1479,7 +1497,9 @@ public static Table readJSON(Schema schema, JSONOptions opts, HostMemoryBuffer b opts.strictValidation(), opts.leadingZerosAllowed(), opts.nonNumericNumbersAllowed(), - opts.unquotedControlChars()))) { + opts.unquotedControlChars(), + cudfPruneSchema, + opts.getLineDelimiter()))) { return gatherJSONColumns(schema, twm, emptyRowCount); } } @@ -1505,6 +1525,10 @@ public static Table readJSON(Schema schema, JSONOptions opts, DataSource ds) { */ public static Table readJSON(Schema schema, JSONOptions opts, DataSource ds, int emptyRowCount) { long dsHandle = DataSourceHelper.createWrapperDataSource(ds); + // only prune the schema if one is provided + boolean cudfPruneSchema = schema.getColumnNames() != null && + schema.getColumnNames().length != 0 && + opts.shouldCudfPruneSchema(); try (TableWithMeta twm = new TableWithMeta(readJSONFromDataSource(schema.getFlattenedNumChildren(), schema.getFlattenedColumnNames(), schema.getFlattenedTypeIds(), schema.getFlattenedTypeScales(), opts.isDayFirst(), @@ -1518,6 +1542,8 @@ public static Table readJSON(Schema schema, JSONOptions opts, DataSource ds, int opts.leadingZerosAllowed(), opts.nonNumericNumbersAllowed(), opts.unquotedControlChars(), + 
cudfPruneSchema, + opts.getLineDelimiter(), dsHandle))) { return gatherJSONColumns(schema, twm, emptyRowCount); } finally { diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index 92e213bcb60..0f77da54152 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -1627,6 +1627,7 @@ Java_ai_rapids_cudf_Table_readAndInferJSONFromDataSource(JNIEnv* env, jboolean allow_leading_zeros, jboolean allow_nonnumeric_numbers, jboolean allow_unquoted_control, + jbyte line_delimiter, jlong ds_handle) { JNI_NULL_CHECK(env, ds_handle, "no data source handle given", 0); @@ -1646,8 +1647,10 @@ Java_ai_rapids_cudf_Table_readAndInferJSONFromDataSource(JNIEnv* env, .normalize_single_quotes(static_cast<bool>(normalize_single_quotes)) .normalize_whitespace(static_cast<bool>(normalize_whitespace)) .mixed_types_as_string(mixed_types_as_string) + .delimiter(static_cast<char>(line_delimiter)) .strict_validation(strict_validation) - .keep_quotes(keep_quotes); + .keep_quotes(keep_quotes) + .prune_columns(false); if (strict_validation) { opts.numeric_leading_zeros(allow_leading_zeros) .nonnumeric_numbers(allow_nonnumeric_numbers) @@ -1676,7 +1679,8 @@ Java_ai_rapids_cudf_Table_readAndInferJSON(JNIEnv* env, jboolean strict_validation, jboolean allow_leading_zeros, jboolean allow_nonnumeric_numbers, - jboolean allow_unquoted_control) + jboolean allow_unquoted_control, + jbyte line_delimiter) { JNI_NULL_CHECK(env, buffer, "buffer cannot be null", 0); if (buffer_length <= 0) { @@ -1700,6 +1704,8 @@ Java_ai_rapids_cudf_Table_readAndInferJSON(JNIEnv* env, .normalize_whitespace(static_cast<bool>(normalize_whitespace)) .strict_validation(strict_validation) .mixed_types_as_string(mixed_types_as_string) + .prune_columns(false) + .delimiter(static_cast<char>(line_delimiter)) .keep_quotes(keep_quotes); if (strict_validation) { opts.numeric_leading_zeros(allow_leading_zeros) @@ -1814,6 +1820,8 @@ Java_ai_rapids_cudf_Table_readJSONFromDataSource(JNIEnv* env, jboolean allow_leading_zeros, jboolean allow_nonnumeric_numbers, jboolean allow_unquoted_control, + jboolean prune_columns, + jbyte line_delimiter, jlong ds_handle) { JNI_NULL_CHECK(env, ds_handle, "no data source handle given", 0); @@ -1848,8 +1856,10 @@ Java_ai_rapids_cudf_Table_readJSONFromDataSource(JNIEnv* env, .normalize_single_quotes(static_cast<bool>(normalize_single_quotes)) .normalize_whitespace(static_cast<bool>(normalize_whitespace)) .mixed_types_as_string(mixed_types_as_string) + .delimiter(static_cast<char>(line_delimiter)) .strict_validation(strict_validation) - .keep_quotes(keep_quotes); + .keep_quotes(keep_quotes) + .prune_columns(prune_columns); if (strict_validation) { opts.numeric_leading_zeros(allow_leading_zeros) .nonnumeric_numbers(allow_nonnumeric_numbers) @@ -1908,7 +1918,9 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON(JNIEnv* env, jboolean strict_validation, jboolean allow_leading_zeros, jboolean allow_nonnumeric_numbers, - jboolean allow_unquoted_control) + jboolean allow_unquoted_control, + jboolean prune_columns, + jbyte line_delimiter) { bool read_buffer = true; if (buffer == 0) { @@ -1957,8 +1969,10 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON(JNIEnv* env, .normalize_single_quotes(static_cast<bool>(normalize_single_quotes)) .normalize_whitespace(static_cast<bool>(normalize_whitespace)) .mixed_types_as_string(mixed_types_as_string) + .delimiter(static_cast<char>(line_delimiter)) .strict_validation(strict_validation) - .keep_quotes(keep_quotes); + .keep_quotes(keep_quotes) +
.prune_columns(prune_columns); if (strict_validation) { opts.numeric_leading_zeros(allow_leading_zeros) .nonnumeric_numbers(allow_nonnumeric_numbers) diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index 830f2b33b32..c7fcb1756b6 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -40,7 +40,6 @@ import org.apache.parquet.schema.GroupType; import org.apache.parquet.schema.MessageType; import org.apache.parquet.schema.OriginalType; -import org.junit.jupiter.api.Tag; import org.junit.jupiter.api.Test; import java.io.*; @@ -656,6 +655,24 @@ void testJSONValidationUnquotedControl() { } } + private static final byte[] CR_JSON_TEST_BUFFER = ("{\"a\":\"12\n3\"}\0" + + "{\"a\":\"AB\nC\"}\0").getBytes(StandardCharsets.UTF_8); + + @Test + void testReadJSONDelim() { + Schema schema = Schema.builder().addColumn(DType.STRING, "a").build(); + JSONOptions opts = JSONOptions.builder() + .withLines(true) + .withLineDelimiter('\0') + .build(); + try (Table expected = new Table.TestBuilder() + .column("12\n3", "AB\nC") + .build(); + Table found = Table.readJSON(schema, opts, CR_JSON_TEST_BUFFER)) { + assertTablesAreEqual(expected, found); + } + } + private static final byte[] NESTED_JSON_DATA_BUFFER = ("{\"a\":{\"c\":\"C1\"}}\n" + "{\"a\":{\"c\":\"C2\", \"b\":\"B2\"}}\n" + "{\"d\":[1,2,3]}\n" + diff --git a/python/cudf/benchmarks/pytest.ini b/python/cudf/benchmarks/pytest.ini index db24415ef9e..187d91996b2 100644 --- a/python/cudf/benchmarks/pytest.ini +++ b/python/cudf/benchmarks/pytest.ini @@ -6,3 +6,4 @@ python_classes = Bench python_functions = bench_* markers = pandas_incompatible: mark a benchmark that cannot be run with pandas +addopts = --tb=native diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index e27c595edda..99e4c21df8a 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -599,7 +599,6 @@ cdef class Column: children=tuple(children) ) - # TODO: Actually support exposed data pointers. @staticmethod def from_pylibcudf( col, bint data_ptr_exposed=False @@ -616,7 +615,7 @@ cdef class Column: col : pylibcudf.Column The object to copy. data_ptr_exposed : bool - This parameter is not yet supported + Whether the data buffer is exposed. 
Returns ------- @@ -639,16 +638,18 @@ cdef class Column: dtype = dtype_from_pylibcudf_column(col) return cudf.core.column.build_column( - data=as_buffer(col.data().obj) if col.data() is not None else None, + data=as_buffer( + col.data().obj, exposed=data_ptr_exposed + ) if col.data() is not None else None, dtype=dtype, size=col.size(), mask=as_buffer( - col.null_mask().obj + col.null_mask().obj, exposed=data_ptr_exposed ) if col.null_mask() is not None else None, offset=col.offset(), null_count=col.null_count(), children=tuple([ - Column.from_pylibcudf(child) + Column.from_pylibcudf(child, data_ptr_exposed=data_ptr_exposed) for child in col.children() ]) ) diff --git a/python/cudf/cudf/_lib/concat.pyx b/python/cudf/cudf/_lib/concat.pyx index e661059faa3..e6c2d136f0d 100644 --- a/python/cudf/cudf/_lib/concat.pyx +++ b/python/cudf/cudf/_lib/concat.pyx @@ -23,9 +23,9 @@ def concat_columns(object columns): def concat_tables(object tables, bool ignore_index=False): plc_tables = [] for table in tables: - cols = table._data.columns + cols = table._columns if not ignore_index: - cols = table._index._data.columns + cols + cols = table._index._columns + cols plc_tables.append(pylibcudf.Table([c.to_pylibcudf(mode="read") for c in cols])) return data_from_pylibcudf_table( diff --git a/python/cudf/cudf/_lib/copying.pyx b/python/cudf/cudf/_lib/copying.pyx index 16182e31c08..49714091f46 100644 --- a/python/cudf/cudf/_lib/copying.pyx +++ b/python/cudf/cudf/_lib/copying.pyx @@ -384,7 +384,7 @@ cdef class _CPackedColumns: p.column_names = input_table._column_names p.column_dtypes = {} - for name, col in input_table._data.items(): + for name, col in input_table._column_labels_and_values: if isinstance(col.dtype, cudf.core.dtypes._BaseDtype): p.column_dtypes[name] = col.dtype diff --git a/python/cudf/cudf/_lib/csv.pyx b/python/cudf/cudf/_lib/csv.pyx index 058e884e08b..9ad96f610b3 100644 --- a/python/cudf/cudf/_lib/csv.pyx +++ b/python/cudf/cudf/_lib/csv.pyx @@ -273,7 +273,7 @@ def read_csv( elif isinstance(dtype, abc.Collection): for index, col_dtype in enumerate(dtype): if isinstance(cudf.dtype(col_dtype), cudf.CategoricalDtype): - col_name = df._data.names[index] + col_name = df._column_names[index] df._data[col_name] = df._data[col_name].astype(col_dtype) if names is not None and len(names) and isinstance(names[0], int): diff --git a/python/cudf/cudf/_lib/datetime.pyx b/python/cudf/cudf/_lib/datetime.pyx index 483250dd36f..bc5e085ec39 100644 --- a/python/cudf/cudf/_lib/datetime.pyx +++ b/python/cudf/cudf/_lib/datetime.pyx @@ -17,6 +17,8 @@ from pylibcudf.libcudf.types cimport size_type from cudf._lib.column cimport Column from cudf._lib.scalar cimport DeviceScalar +import pylibcudf as plc + @acquire_spill_lock() def add_months(Column col, Column months): @@ -38,43 +40,9 @@ def add_months(Column col, Column months): @acquire_spill_lock() def extract_datetime_component(Column col, object field): - - cdef unique_ptr[column] c_result - cdef column_view col_view = col.view() - - with nogil: - if field == "year": - c_result = move(libcudf_datetime.extract_year(col_view)) - elif field == "month": - c_result = move(libcudf_datetime.extract_month(col_view)) - elif field == "day": - c_result = move(libcudf_datetime.extract_day(col_view)) - elif field == "weekday": - c_result = move(libcudf_datetime.extract_weekday(col_view)) - elif field == "hour": - c_result = move(libcudf_datetime.extract_hour(col_view)) - elif field == "minute": - c_result = move(libcudf_datetime.extract_minute(col_view)) - elif field == 
"second": - c_result = move(libcudf_datetime.extract_second(col_view)) - elif field == "millisecond": - c_result = move( - libcudf_datetime.extract_millisecond_fraction(col_view) - ) - elif field == "microsecond": - c_result = move( - libcudf_datetime.extract_microsecond_fraction(col_view) - ) - elif field == "nanosecond": - c_result = move( - libcudf_datetime.extract_nanosecond_fraction(col_view) - ) - elif field == "day_of_year": - c_result = move(libcudf_datetime.day_of_year(col_view)) - else: - raise ValueError(f"Invalid datetime field: '{field}'") - - result = Column.from_unique_ptr(move(c_result)) + result = Column.from_pylibcudf( + plc.datetime.extract_datetime_component(col.to_pylibcudf(mode="read"), field) + ) if field == "weekday": # Pandas counts Monday-Sunday as 0-6 diff --git a/python/cudf/cudf/_lib/io/utils.pyx b/python/cudf/cudf/_lib/io/utils.pyx index b1900138d94..564daefbae2 100644 --- a/python/cudf/cudf/_lib/io/utils.pyx +++ b/python/cudf/cudf/_lib/io/utils.pyx @@ -179,7 +179,7 @@ cdef update_struct_field_names( ): # Deprecated, remove in favor of add_col_struct_names # when a reader is ported to pylibcudf - for i, (name, col) in enumerate(table._data.items()): + for i, (name, col) in enumerate(table._column_labels_and_values): table._data[name] = update_column_struct_field_names( col, schema_info[i] ) diff --git a/python/cudf/cudf/_lib/nvtext/minhash.pyx b/python/cudf/cudf/_lib/nvtext/minhash.pyx index 5ee15d0e409..59cb8d51440 100644 --- a/python/cudf/cudf/_lib/nvtext/minhash.pyx +++ b/python/cudf/cudf/_lib/nvtext/minhash.pyx @@ -10,6 +10,8 @@ from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.nvtext.minhash cimport ( minhash as cpp_minhash, minhash64 as cpp_minhash64, + word_minhash as cpp_word_minhash, + word_minhash64 as cpp_word_minhash64, ) from pylibcudf.libcudf.types cimport size_type @@ -54,3 +56,39 @@ def minhash64(Column strings, Column seeds, int width): ) return Column.from_unique_ptr(move(c_result)) + + +@acquire_spill_lock() +def word_minhash(Column input, Column seeds): + + cdef column_view c_input = input.view() + cdef column_view c_seeds = seeds.view() + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_word_minhash( + c_input, + c_seeds + ) + ) + + return Column.from_unique_ptr(move(c_result)) + + +@acquire_spill_lock() +def word_minhash64(Column input, Column seeds): + + cdef column_view c_input = input.view() + cdef column_view c_seeds = seeds.view() + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_word_minhash64( + c_input, + c_seeds + ) + ) + + return Column.from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index a0155671a26..fa2690c7f21 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -235,16 +235,16 @@ cdef object _process_metadata(object df, df._index = idx elif set(index_col).issubset(names): index_data = df[index_col] - actual_index_names = list(index_col_names.values()) - if len(index_data._data) == 1: + actual_index_names = iter(index_col_names.values()) + if index_data._num_columns == 1: idx = cudf.Index._from_column( - index_data._data.columns[0], - name=actual_index_names[0] + index_data._columns[0], + name=next(actual_index_names) ) else: idx = cudf.MultiIndex.from_frame( index_data, - names=actual_index_names + names=list(actual_index_names) ) df.drop(columns=index_col, inplace=True) df._index = idx @@ -252,7 +252,7 @@ cdef object 
_process_metadata(object df, if use_pandas_metadata: df.index.names = index_col - if len(df._data.names) == 0 and column_index_type is not None: + if df._num_columns == 0 and column_index_type is not None: df._data.label_dtype = cudf.dtype(column_index_type) return df @@ -438,7 +438,7 @@ def write_parquet( object statistics="ROWGROUP", object metadata_file_path=None, object int96_timestamps=False, - object row_group_size_bytes=_ROW_GROUP_SIZE_BYTES_DEFAULT, + object row_group_size_bytes=None, object row_group_size_rows=None, object max_page_size_bytes=None, object max_page_size_rows=None, @@ -616,9 +616,9 @@ cdef class ParquetWriter: Name of the compression to use. Use ``None`` for no compression. statistics : {'ROWGROUP', 'PAGE', 'COLUMN', 'NONE'}, default 'ROWGROUP' Level at which column statistics should be included in file. - row_group_size_bytes: int, default 134217728 + row_group_size_bytes: int, default ``uint64 max`` Maximum size of each row group of the output. - By default, 134217728 (128MB) will be used. + By default, an effectively unlimited size equal to ``uint64 max`` will be used. row_group_size_rows: int, default 1000000 Maximum number of rows of each row group of the output. By default, 1000000 (10^6 rows) will be used. @@ -661,11 +661,11 @@ cdef class ParquetWriter: def __cinit__(self, object filepath_or_buffer, object index=None, object compression="snappy", str statistics="ROWGROUP", - int row_group_size_bytes=_ROW_GROUP_SIZE_BYTES_DEFAULT, - int row_group_size_rows=1000000, - int max_page_size_bytes=524288, - int max_page_size_rows=20000, - int max_dictionary_size=1048576, + size_t row_group_size_bytes=_ROW_GROUP_SIZE_BYTES_DEFAULT, + size_type row_group_size_rows=1000000, + size_t max_page_size_bytes=524288, + size_type max_page_size_rows=20000, + size_t max_dictionary_size=1048576, bool use_dictionary=True, bool store_schema=False): filepaths_or_buffers = ( diff --git a/python/cudf/cudf/_lib/string_casting.pyx b/python/cudf/cudf/_lib/string_casting.pyx index 8d463829a19..60a6795a402 100644 --- a/python/cudf/cudf/_lib/string_casting.pyx +++ b/python/cudf/cudf/_lib/string_casting.pyx @@ -20,13 +20,7 @@ from pylibcudf.libcudf.strings.convert.convert_booleans cimport ( to_booleans as cpp_to_booleans, ) from pylibcudf.libcudf.strings.convert.convert_datetime cimport ( - from_timestamps as cpp_from_timestamps, is_timestamp as cpp_is_timestamp, - to_timestamps as cpp_to_timestamps, -) -from pylibcudf.libcudf.strings.convert.convert_durations cimport ( - from_durations as cpp_from_durations, - to_durations as cpp_to_durations, ) from pylibcudf.libcudf.strings.convert.convert_floats cimport ( from_floats as cpp_from_floats, @@ -48,8 +42,12 @@ from pylibcudf.libcudf.types cimport data_type, type_id from cudf._lib.types cimport underlying_type_t_type_id +import pylibcudf as plc + import cudf +from cudf._lib.types cimport dtype_to_pylibcudf_type + def floating_to_string(Column input_col): cdef column_view input_column_view = input_col.view() @@ -522,19 +520,14 @@ def int2timestamp( A Column with date-time represented in string format """ - cdef column_view input_column_view = input_col.view() cdef string c_timestamp_format = format.encode("UTF-8") - cdef column_view input_strings_names = names.view() - - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_from_timestamps( - input_column_view, - c_timestamp_format, - input_strings_names)) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf( +
plc.strings.convert.convert_datetime.from_timestamps( + input_col.to_pylibcudf(mode="read"), + c_timestamp_format, + names.to_pylibcudf(mode="read") + ) + ) def timestamp2int(Column input_col, dtype, format): @@ -551,23 +544,15 @@ A Column with string represented in date-time format """ - cdef column_view input_column_view = input_col.view() - cdef type_id tid = ( - <type_id> ( - <underlying_type_t_type_id> SUPPORTED_NUMPY_TO_LIBCUDF_TYPES[dtype] + dtype = dtype_to_pylibcudf_type(dtype) + cdef string c_timestamp_format = format.encode('UTF-8') + return Column.from_pylibcudf( + plc.strings.convert.convert_datetime.to_timestamps( + input_col.to_pylibcudf(mode="read"), + dtype, + c_timestamp_format ) ) - cdef data_type out_type = data_type(tid) - cdef string c_timestamp_format = format.encode('UTF-8') - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_to_timestamps( - input_column_view, - out_type, - c_timestamp_format)) - - return Column.from_unique_ptr(move(c_result)) def istimestamp(Column input_col, str format): @@ -613,23 +598,15 @@ def timedelta2int(Column input_col, dtype, format): A Column with string represented in TimeDelta format """ - cdef column_view input_column_view = input_col.view() - cdef type_id tid = ( - <type_id> ( - <underlying_type_t_type_id> SUPPORTED_NUMPY_TO_LIBCUDF_TYPES[dtype] + dtype = dtype_to_pylibcudf_type(dtype) + cdef string c_timestamp_format = format.encode('UTF-8') + return Column.from_pylibcudf( + plc.strings.convert.convert_durations.to_durations( + input_col.to_pylibcudf(mode="read"), + dtype, + c_timestamp_format ) ) - cdef data_type out_type = data_type(tid) - cdef string c_duration_format = format.encode('UTF-8') - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_to_durations( - input_column_view, - out_type, - c_duration_format)) - - return Column.from_unique_ptr(move(c_result)) def int2timedelta(Column input_col, str format): @@ -647,16 +624,13 @@ """ - cdef column_view input_column_view = input_col.view() cdef string c_duration_format = format.encode('UTF-8') - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_from_durations( - input_column_view, - c_duration_format)) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf( + plc.strings.convert.convert_durations.from_durations( + input_col.to_pylibcudf(mode="read"), + c_duration_format + ) + ) def int2ip(Column input_col): diff --git a/python/cudf/cudf/_lib/strings/__init__.py b/python/cudf/cudf/_lib/strings/__init__.py index 47a194c4fda..4bf8a9b1a8f 100644 --- a/python/cudf/cudf/_lib/strings/__init__.py +++ b/python/cudf/cudf/_lib/strings/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION.
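For reference, the ``string_casting.pyx`` changes above reroute timestamp and duration conversions through pylibcudf without changing public cuDF behavior, so they can be smoke-tested from the public API. A minimal sketch (sample values are arbitrary):

```python
import cudf

s = cudf.Series(["2024-01-01", "2024-06-15"])
# String -> timestamp: should exercise the to_timestamps binding.
ts = cudf.to_datetime(s, format="%Y-%m-%d")
# Timestamp -> string: should exercise the from_timestamps binding.
print(ts.dt.strftime("%d/%m/%Y"))
```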
from cudf._lib.nvtext.edit_distance import edit_distance, edit_distance_matrix from cudf._lib.nvtext.generate_ngrams import ( generate_character_ngrams, @@ -6,7 +6,12 @@ hash_character_ngrams, ) from cudf._lib.nvtext.jaccard import jaccard_index -from cudf._lib.nvtext.minhash import minhash, minhash64 +from cudf._lib.nvtext.minhash import ( + minhash, + minhash64, + word_minhash, + word_minhash64, +) from cudf._lib.nvtext.ngrams_tokenize import ngrams_tokenize from cudf._lib.nvtext.normalize import normalize_characters, normalize_spaces from cudf._lib.nvtext.replace import filter_tokens, replace_tokens diff --git a/python/cudf/cudf/_lib/strings/contains.pyx b/python/cudf/cudf/_lib/strings/contains.pyx index 82f5e06c547..03b4887f200 100644 --- a/python/cudf/cudf/_lib/strings/contains.pyx +++ b/python/cudf/cudf/_lib/strings/contains.pyx @@ -1,27 +1,10 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. -from cython.operator cimport dereference from libc.stdint cimport uint32_t from cudf.core.buffer import acquire_spill_lock -from libcpp.memory cimport unique_ptr -from libcpp.string cimport string -from libcpp.utility cimport move - -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.scalar.scalar cimport string_scalar -from pylibcudf.libcudf.strings.contains cimport ( - count_re as cpp_count_re, - like as cpp_like, - matches_re as cpp_matches_re, -) -from pylibcudf.libcudf.strings.regex_flags cimport regex_flags -from pylibcudf.libcudf.strings.regex_program cimport regex_program - from cudf._lib.column cimport Column -from cudf._lib.scalar cimport DeviceScalar from pylibcudf.strings import contains from pylibcudf.strings.regex_program import RegexProgram @@ -45,21 +28,10 @@ def count_re(Column source_strings, object reg_ex, uint32_t flags): Returns a Column with count of occurrences of `reg_ex` in each string of `source_strings` """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - cdef string reg_ex_string = str(reg_ex).encode() - cdef regex_flags c_flags = flags - cdef unique_ptr[regex_program] c_prog - - with nogil: - c_prog = move(regex_program.create(reg_ex_string, c_flags)) - c_result = move(cpp_count_re( - source_view, - dereference(c_prog) - )) - - return Column.from_unique_ptr(move(c_result)) + prog = RegexProgram.create(str(reg_ex), flags) + return Column.from_pylibcudf( + contains.count_re(source_strings.to_pylibcudf(mode="read"), prog) + ) @acquire_spill_lock() @@ -68,21 +40,10 @@ def match_re(Column source_strings, object reg_ex, uint32_t flags): Returns a Column with each value True if the string matches `reg_ex` regular expression with each record of `source_strings` """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - cdef string reg_ex_string = str(reg_ex).encode() - cdef regex_flags c_flags = flags - cdef unique_ptr[regex_program] c_prog - - with nogil: - c_prog = move(regex_program.create(reg_ex_string, c_flags)) - c_result = move(cpp_matches_re( - source_view, - dereference(c_prog) - )) - - return Column.from_unique_ptr(move(c_result)) + prog = RegexProgram.create(str(reg_ex), flags) + return Column.from_pylibcudf( + contains.matches_re(source_strings.to_pylibcudf(mode="read"), prog) + ) @acquire_spill_lock() @@ -91,24 +52,9 @@ def like(Column source_strings, object py_pattern, object py_escape): Returns a Column with each value True if the string matches the `py_pattern` like expression with each 
record of `source_strings` """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - cdef DeviceScalar pattern = py_pattern.device_value - cdef DeviceScalar escape = py_escape.device_value - - cdef const string_scalar* scalar_ptn = <const string_scalar*>( - pattern.get_raw_ptr() - ) - cdef const string_scalar* scalar_esc = <const string_scalar*>( - escape.get_raw_ptr() + plc_column = contains.like( + source_strings.to_pylibcudf(mode="read"), + py_pattern.device_value.c_value, + py_escape.device_value.c_value, ) - - with nogil: - c_result = move(cpp_like( - source_view, - scalar_ptn[0], - scalar_esc[0] - )) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf(plc_column) diff --git a/python/cudf/cudf/_lib/strings/extract.pyx b/python/cudf/cudf/_lib/strings/extract.pyx index 63f4d57e562..5bf336f4f3c 100644 --- a/python/cudf/cudf/_lib/strings/extract.pyx +++ b/python/cudf/cudf/_lib/strings/extract.pyx @@ -1,21 +1,12 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. -from cython.operator cimport dereference from libc.stdint cimport uint32_t -from libcpp.memory cimport unique_ptr -from libcpp.string cimport string -from libcpp.utility cimport move from cudf.core.buffer import acquire_spill_lock -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.strings.extract cimport extract as cpp_extract -from pylibcudf.libcudf.strings.regex_flags cimport regex_flags -from pylibcudf.libcudf.strings.regex_program cimport regex_program -from pylibcudf.libcudf.table.table cimport table - from cudf._lib.column cimport Column -from cudf._lib.utils cimport data_from_unique_ptr + +import pylibcudf as plc @acquire_spill_lock() @@ -26,21 +17,8 @@ def extract(Column source_strings, object pattern, uint32_t flags): The returning data contains one row for each subject string, and one column for each group. """ - cdef unique_ptr[table] c_result - cdef column_view source_view = source_strings.view() - - cdef string pattern_string = str(pattern).encode() - cdef regex_flags c_flags = flags - cdef unique_ptr[regex_program] c_prog - - with nogil: - c_prog = move(regex_program.create(pattern_string, c_flags)) - c_result = move(cpp_extract( - source_view, - dereference(c_prog) - )) - - return data_from_unique_ptr( - move(c_result), - column_names=range(0, c_result.get()[0].num_columns()) + prog = plc.strings.regex_program.RegexProgram.create(str(pattern), flags) + plc_result = plc.strings.extract.extract( + source_strings.to_pylibcudf(mode="read"), prog ) + return dict(enumerate(Column.from_pylibcudf(col) for col in plc_result.columns())) diff --git a/python/cudf/cudf/_lib/strings/findall.pyx b/python/cudf/cudf/_lib/strings/findall.pyx index 3cf2084e30a..0e758d5b322 100644 --- a/python/cudf/cudf/_lib/strings/findall.pyx +++ b/python/cudf/cudf/_lib/strings/findall.pyx @@ -1,21 +1,13 @@ # Copyright (c) 2019-2024, NVIDIA CORPORATION.
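Likewise, the reworked ``extract`` above now returns one column per capture group as a plain dict; through the public ``.str.extract`` API the visible behavior should be unchanged. A small illustration (values are arbitrary):

```python
import cudf

s = cudf.Series(["a1", "b2", "c3"])
# One result column per capture group in the pattern;
# rows that do not match (here "c3") yield nulls.
print(s.str.extract(r"([ab])(\d)"))
```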
-from cython.operator cimport dereference from libc.stdint cimport uint32_t -from libcpp.memory cimport unique_ptr -from libcpp.string cimport string -from libcpp.utility cimport move from cudf.core.buffer import acquire_spill_lock -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.strings.findall cimport findall as cpp_findall -from pylibcudf.libcudf.strings.regex_flags cimport regex_flags -from pylibcudf.libcudf.strings.regex_program cimport regex_program - from cudf._lib.column cimport Column +import pylibcudf as plc + @acquire_spill_lock() def findall(Column source_strings, object pattern, uint32_t flags): @@ -23,18 +15,11 @@ def findall(Column source_strings, object pattern, uint32_t flags): Returns data with all non-overlapping matches of `pattern` in each string of `source_strings` as a lists column. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - cdef string pattern_string = str(pattern).encode() - cdef regex_flags c_flags = flags - cdef unique_ptr[regex_program] c_prog - - with nogil: - c_prog = move(regex_program.create(pattern_string, c_flags)) - c_result = move(cpp_findall( - source_view, - dereference(c_prog) - )) - - return Column.from_unique_ptr(move(c_result)) + prog = plc.strings.regex_program.RegexProgram.create( + str(pattern), flags + ) + plc_result = plc.strings.findall.findall( + source_strings.to_pylibcudf(mode="read"), + prog, + ) + return Column.from_pylibcudf(plc_result) diff --git a/python/cudf/cudf/_lib/strings/repeat.pyx b/python/cudf/cudf/_lib/strings/repeat.pyx index 42fcfa5d94e..43649d4defe 100644 --- a/python/cudf/cudf/_lib/strings/repeat.pyx +++ b/python/cudf/cudf/_lib/strings/repeat.pyx @@ -1,17 +1,12 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION. - -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - from cudf.core.buffer import acquire_spill_lock -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.strings cimport repeat as cpp_repeat from pylibcudf.libcudf.types cimport size_type from cudf._lib.column cimport Column +import pylibcudf as plc + @acquire_spill_lock() def repeat_scalar(Column source_strings, @@ -21,16 +16,11 @@ def repeat_scalar(Column source_strings, each string in `source_strings` `repeats` number of times. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_repeat.repeat_strings( - source_view, - repeats - )) - - return Column.from_unique_ptr(move(c_result)) + plc_result = plc.strings.repeat.repeat_strings( + source_strings.to_pylibcudf(mode="read"), + repeats + ) + return Column.from_pylibcudf(plc_result) @acquire_spill_lock() @@ -41,14 +31,8 @@ def repeat_sequence(Column source_strings, each string in `source_strings` `repeats` number of times. 
""" - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - cdef column_view repeats_view = repeats.view() - - with nogil: - c_result = move(cpp_repeat.repeat_strings( - source_view, - repeats_view - )) - - return Column.from_unique_ptr(move(c_result)) + plc_result = plc.strings.repeat.repeat_strings( + source_strings.to_pylibcudf(mode="read"), + repeats.to_pylibcudf(mode="read") + ) + return Column.from_pylibcudf(plc_result) diff --git a/python/cudf/cudf/_lib/strings/strip.pyx b/python/cudf/cudf/_lib/strings/strip.pyx index acf52cb7b9f..38ecb21a94c 100644 --- a/python/cudf/cudf/_lib/strings/strip.pyx +++ b/python/cudf/cudf/_lib/strings/strip.pyx @@ -13,6 +13,7 @@ from pylibcudf.libcudf.strings.strip cimport strip as cpp_strip from cudf._lib.column cimport Column from cudf._lib.scalar cimport DeviceScalar +import pylibcudf as plc @acquire_spill_lock() @@ -25,23 +26,14 @@ def strip(Column source_strings, """ cdef DeviceScalar repl = py_repl.device_value - - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - cdef const string_scalar* scalar_str = ( - repl.get_raw_ptr() + return Column.from_pylibcudf( + plc.strings.strip.strip( + source_strings.to_pylibcudf(mode="read"), + plc.strings.SideType.BOTH, + repl.c_value + ) ) - with nogil: - c_result = move(cpp_strip( - source_view, - side_type.BOTH, - scalar_str[0] - )) - - return Column.from_unique_ptr(move(c_result)) - @acquire_spill_lock() def lstrip(Column source_strings, diff --git a/python/cudf/cudf/_lib/transform.pyx b/python/cudf/cudf/_lib/transform.pyx index baa08a545ec..40d0c9eac3a 100644 --- a/python/cudf/cudf/_lib/transform.pyx +++ b/python/cudf/cudf/_lib/transform.pyx @@ -3,41 +3,26 @@ from numba.np import numpy_support import cudf -from cudf._lib.types import SUPPORTED_NUMPY_TO_LIBCUDF_TYPES from cudf.core._internals.expressions import parse_expression from cudf.core.buffer import acquire_spill_lock, as_buffer from cudf.utils import cudautils from cython.operator cimport dereference -from libc.stdint cimport uintptr_t from libcpp.memory cimport unique_ptr -from libcpp.pair cimport pair -from libcpp.string cimport string from libcpp.utility cimport move cimport pylibcudf.libcudf.transform as libcudf_transform from pylibcudf cimport transform as plc_transform from pylibcudf.expressions cimport Expression from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.expressions cimport expression -from pylibcudf.libcudf.table.table cimport table from pylibcudf.libcudf.table.table_view cimport table_view -from pylibcudf.libcudf.types cimport ( - bitmask_type, - data_type, - size_type, - type_id, -) -from rmm._lib.device_buffer cimport DeviceBuffer, device_buffer +from pylibcudf.libcudf.types cimport size_type from cudf._lib.column cimport Column -from cudf._lib.types cimport underlying_type_t_type_id -from cudf._lib.utils cimport ( - columns_from_unique_ptr, - data_from_table_view, - table_view_from_columns, -) +from cudf._lib.utils cimport table_view_from_columns + +import pylibcudf as plc @acquire_spill_lock() @@ -46,17 +31,8 @@ def bools_to_mask(Column col): Given an int8 (boolean) column, compress the data from booleans to bits and return a Buffer """ - cdef column_view col_view = col.view() - cdef pair[unique_ptr[device_buffer], size_type] cpp_out - cdef unique_ptr[device_buffer] up_db - - with nogil: - cpp_out = move(libcudf_transform.bools_to_mask(col_view)) - up_db = 
move(cpp_out.first) - - rmm_db = DeviceBuffer.c_from_unique_ptr(move(up_db)) - buf = as_buffer(rmm_db) - return buf + mask, _ = plc_transform.bools_to_mask(col.to_pylibcudf(mode="read")) + return as_buffer(mask) @acquire_spill_lock() @@ -68,22 +44,15 @@ def mask_to_bools(object mask_buffer, size_type begin_bit, size_type end_bit): if not isinstance(mask_buffer, cudf.core.buffer.Buffer): raise TypeError("mask_buffer is not an instance of " "cudf.core.buffer.Buffer") - cdef bitmask_type* bit_mask = <bitmask_type*><uintptr_t>( - mask_buffer.get_ptr(mode="read") + plc_column = plc_transform.mask_to_bools( + mask_buffer.get_ptr(mode="read"), begin_bit, end_bit ) - - cdef unique_ptr[column] result - with nogil: - result = move( - libcudf_transform.mask_to_bools(bit_mask, begin_bit, end_bit) - ) - - return Column.from_unique_ptr(move(result)) + return Column.from_pylibcudf(plc_column) @acquire_spill_lock() def nans_to_nulls(Column input): - (mask, _) = plc_transform.nans_to_nulls( + mask, _ = plc_transform.nans_to_nulls( input.to_pylibcudf(mode="read") ) return as_buffer(mask) @@ -91,80 +60,45 @@ @acquire_spill_lock() def transform(Column input, op): - cdef column_view c_input = input.view() - cdef string c_str - cdef type_id c_tid - cdef data_type c_dtype - nb_type = numpy_support.from_dtype(input.dtype) nb_signature = (nb_type,) compiled_op = cudautils.compile_udf(op, nb_signature) - c_str = compiled_op[0].encode('UTF-8') np_dtype = cudf.dtype(compiled_op[1]) - try: - c_tid = ( - <type_id> ( - SUPPORTED_NUMPY_TO_LIBCUDF_TYPES[ - np_dtype - ] - ) - ) - c_dtype = data_type(c_tid) - - except KeyError: - raise TypeError( - "Result of window function has unsupported dtype {}" - .format(np_dtype) - ) - - with nogil: - c_output = move(libcudf_transform.transform( - c_input, - c_str, - c_dtype, - True - )) - - return Column.from_unique_ptr(move(c_output)) + plc_column = plc_transform.transform( + input.to_pylibcudf(mode="read"), + compiled_op[0], + plc.column._datatype_from_dtype_desc(np_dtype.str[1:]), + True + ) + return Column.from_pylibcudf(plc_column) def table_encode(list source_columns): - cdef table_view c_input = table_view_from_columns(source_columns) - cdef pair[unique_ptr[table], unique_ptr[column]] c_result - - with nogil: - c_result = move(libcudf_transform.encode(c_input)) + plc_table, plc_column = plc_transform.encode( + plc.Table([col.to_pylibcudf(mode="read") for col in source_columns]) + ) return ( - columns_from_unique_ptr(move(c_result.first)), - Column.from_unique_ptr(move(c_result.second)) + [Column.from_pylibcudf(col) for col in plc_table.columns()], + Column.from_pylibcudf(plc_column) ) def one_hot_encode(Column input_column, Column categories): - cdef column_view c_view_input = input_column.view() - cdef column_view c_view_categories = categories.view() - cdef pair[unique_ptr[column], table_view] c_result - - with nogil: - c_result = move( - libcudf_transform.one_hot_encode(c_view_input, c_view_categories) - ) - - # Notice, the data pointer of `owner` has been exposed - # through `c_result.second` at this point.
- owner = Column.from_unique_ptr( - move(c_result.first), data_ptr_exposed=True - ) - - pylist_categories = categories.to_arrow().to_pylist() - encodings, _ = data_from_table_view( - move(c_result.second), - owner=owner, - column_names=[ - x if x is not None else '' for x in pylist_categories - ] + plc_table = plc_transform.one_hot_encode( + input_column.to_pylibcudf(mode="read"), + categories.to_pylibcudf(mode="read"), ) - return encodings + result_columns = [ + Column.from_pylibcudf(col, data_ptr_exposed=True) + for col in plc_table.columns() + ] + result_labels = [ + x if x is not None else '' + for x in categories.to_arrow().to_pylist() + ] + return dict(zip(result_labels, result_columns)) @acquire_spill_lock() diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx index cae28d02ef4..8660cca9322 100644 --- a/python/cudf/cudf/_lib/utils.pyx +++ b/python/cudf/cudf/_lib/utils.pyx @@ -49,9 +49,9 @@ cdef table_view table_view_from_table(tbl, ignore_index=False) except*: If True, don't include the index in the columns. """ return table_view_from_columns( - tbl._index._data.columns + tbl._data.columns + tbl._index._columns + tbl._columns if not ignore_index and tbl._index is not None - else tbl._data.columns + else tbl._columns ) @@ -62,7 +62,7 @@ cpdef generate_pandas_metadata(table, index): index_descriptors = [] columns_to_convert = list(table._columns) # Columns - for name, col in table._data.items(): + for name, col in table._column_labels_and_values: if cudf.get_option("mode.pandas_compatible"): # in pandas-compat mode, non-string column names are stringified. col_names.append(str(name)) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index ff114474aa4..a6abd63d042 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -1951,7 +1951,7 @@ def drop_duplicates( return self._from_columns_like_self( drop_duplicates( list(self._columns), - keys=range(len(self._data)), + keys=range(len(self._columns)), keep=keep, nulls_are_equal=nulls_are_equal, ), diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 16e6908f308..4463e3280df 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -623,11 +623,9 @@ def extract( "unsupported value for `flags` parameter" ) - data, _ = libstrings.extract(self._column, pat, flags) + data = libstrings.extract(self._column, pat, flags) if len(data) == 1 and expand is False: - data = next(iter(data.values())) - else: - data = data + _, data = data.popitem() return self._return_or_inplace(data, expand=expand) def contains( @@ -5349,6 +5347,76 @@ def minhash64( libstrings.minhash64(self._column, seeds_column, width) ) + def word_minhash(self, seeds: ColumnLike | None = None) -> SeriesOrIndex: + """ + Compute the minhash of a list column of strings. + This uses the MurmurHash3_x86_32 algorithm for the hash function. + + Parameters + ---------- + seeds : ColumnLike + The seeds used for the hash algorithm. + Must be of type uint32. 
+ + Examples + -------- + >>> import cudf + >>> import numpy as np + >>> ls = cudf.Series([["this", "is", "my"], ["favorite", "book"]]) + >>> seeds = cudf.Series([0, 1, 2], dtype=np.uint32) + >>> ls.str.word_minhash(seeds=seeds) + 0 [21141582, 1232889953, 1268336794] + 1 [962346254, 2321233602, 1354839212] + dtype: list + """ + if seeds is None: + seeds_column = column.as_column(0, dtype=np.uint32, length=1) + else: + seeds_column = column.as_column(seeds) + if seeds_column.dtype != np.uint32: + raise ValueError( + f"Expecting a Series with dtype uint32, got {type(seeds)}" + ) + return self._return_or_inplace( + libstrings.word_minhash(self._column, seeds_column) + ) + + def word_minhash64(self, seeds: ColumnLike | None = None) -> SeriesOrIndex: + """ + Compute the minhash of a list column of strings. + This uses the MurmurHash3_x64_128 algorithm for the hash function. + This function generates 2 uint64 values but only the first + uint64 value is used. + + Parameters + ---------- + seeds : ColumnLike + The seeds used for the hash algorithm. + Must be of type uint64. + + Examples + -------- + >>> import cudf + >>> import numpy as np + >>> ls = cudf.Series([["this", "is", "my"], ["favorite", "book"]]) + >>> seeds = cudf.Series([0, 1, 2], dtype=np.uint64) + >>> ls.str.word_minhash64(seeds) + 0 [2603139454418834912, 8644371945174847701, 5541030711534384340] + 1 [5240044617220523711, 5847101123925041457, 153762819128779913] + dtype: list + """ + if seeds is None: + seeds_column = column.as_column(0, dtype=np.uint64, length=1) + else: + seeds_column = column.as_column(seeds) + if seeds_column.dtype != np.uint64: + raise ValueError( + f"Expecting a Series with dtype uint64, got {type(seeds)}" + ) + return self._return_or_inplace( + libstrings.word_minhash64(self._column, seeds_column) + ) + def jaccard_index(self, input: cudf.Series, width: int) -> SeriesOrIndex: """ Compute the Jaccard index between this column and the given diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index 09b0f453692..bc093fdaa9a 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -151,9 +151,9 @@ def __setitem__(self, key: abc.Hashable, value: ColumnBase) -> None: self.set_by_label(key, value) def __delitem__(self, key: abc.Hashable) -> None: - old_ncols = len(self._data) + old_ncols = len(self) del self._data[key] - new_ncols = len(self._data) + new_ncols = len(self) self._clear_cache(old_ncols, new_ncols) def __len__(self) -> int: @@ -213,7 +213,7 @@ def level_names(self) -> tuple[abc.Hashable, ...]: @property def nlevels(self) -> int: - if len(self._data) == 0: + if len(self) == 0: return 0 if not self.multiindex: return 1 @@ -226,7 +226,7 @@ def name(self) -> abc.Hashable: @cached_property def nrows(self) -> int: - if len(self._data) == 0: + if len(self) == 0: return 0 else: return len(next(iter(self.values()))) @@ -257,9 +257,9 @@ def _clear_cache(self, old_ncols: int, new_ncols: int) -> None: Parameters ---------- old_ncols: int - len(self._data) before self._data was modified + len(self) before self._data was modified new_ncols: int - len(self._data) after self._data was modified + len(self) after self._data was modified """ cached_properties = ("columns", "names", "_grouped_data") for attr in cached_properties: @@ -335,7 +335,7 @@ def insert( if name in self._data: raise ValueError(f"Cannot insert '{name}', already exists") - old_ncols = len(self._data) + old_ncols = len(self) if loc == -1: loc = old_ncols elif not 
(0 <= loc <= old_ncols): @@ -414,7 +414,7 @@ def get_labels_by_index(self, index: Any) -> tuple: tuple """ if isinstance(index, slice): - start, stop, step = index.indices(len(self._data)) + start, stop, step = index.indices(len(self)) return self.names[start:stop:step] elif pd.api.types.is_integer(index): return (self.names[index],) @@ -526,9 +526,9 @@ def set_by_label(self, key: abc.Hashable, value: ColumnBase) -> None: if len(self) > 0 and len(value) != self.nrows: raise ValueError("All columns must be of equal length") - old_ncols = len(self._data) + old_ncols = len(self) self._data[key] = value - new_ncols = len(self._data) + new_ncols = len(self) self._clear_cache(old_ncols, new_ncols) def _select_by_label_list_like(self, key: tuple) -> Self: @@ -718,12 +718,12 @@ def droplevel(self, level: int) -> None: if level < 0: level += self.nlevels - old_ncols = len(self._data) + old_ncols = len(self) self._data = { _remove_key_level(key, level): value # type: ignore[arg-type] for key, value in self._data.items() } - new_ncols = len(self._data) + new_ncols = len(self) self._level_names = ( self._level_names[:level] + self._level_names[level + 1 :] ) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 58a16a6d504..16b0aa95c35 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -176,7 +176,7 @@ def _can_downcast_to_series(self, df, arg): return False @_performance_tracking - def _downcast_to_series(self, df, arg): + def _downcast_to_series(self, df: DataFrame, arg): """ "Downcast" from a DataFrame to a Series based on Pandas indexing rules @@ -203,16 +203,16 @@ def _downcast_to_series(self, df, arg): # take series along the axis: if axis == 1: - return df[df._data.names[0]] + return df[df._column_names[0]] else: if df._num_columns > 0: dtypes = df.dtypes.values.tolist() normalized_dtype = np.result_type(*dtypes) - for name, col in df._data.items(): + for name, col in df._column_labels_and_values: df[name] = col.astype(normalized_dtype) sr = df.T - return sr[sr._data.names[0]] + return sr[sr._column_names[0]] class _DataFrameLocIndexer(_DataFrameIndexer): @@ -258,7 +258,7 @@ def _getitem_tuple_arg(self, arg): and len(arg) > 1 and is_scalar(arg[1]) ): - return result._data.columns[0].element_indexing(0) + return result._columns[0].element_indexing(0) return result else: if isinstance(arg[0], slice): @@ -310,7 +310,7 @@ def _getitem_tuple_arg(self, arg): else: tmp_col_name = str(uuid4()) cantor_name = "_" + "_".join( - map(str, columns_df._data.names) + map(str, columns_df._column_names) ) if columns_df._data.multiindex: # column names must be appropriate length tuples @@ -1412,7 +1412,7 @@ def __setitem__(self, arg, value): else column.column_empty_like( col, masked=True, newsize=length ) - for key, col in self._data.items() + for key, col in self._column_labels_and_values ) self._data = self._data._from_columns_like_self( new_columns, verify=False @@ -1494,8 +1494,8 @@ def __delitem__(self, name): @_performance_tracking def memory_usage(self, index=True, deep=False) -> cudf.Series: - mem_usage = [col.memory_usage for col in self._data.columns] - names = [str(name) for name in self._data.names] + mem_usage = [col.memory_usage for col in self._columns] + names = [str(name) for name in self._column_names] if index: mem_usage.append(self.index.memory_usage()) names.append("Index") @@ -1725,7 +1725,7 @@ def _concat( [] if are_all_range_index or (ignore_index and not empty_has_index) - else list(f.index._data.columns) + 
else list(f.index._columns) ) + [f._data[name] if name in f._data else None for name in names] for f in objs @@ -1808,7 +1808,7 @@ def _concat( out.index.dtype, cudf.CategoricalDtype ): out = out.set_index(out.index) - for name, col in out._data.items(): + for name, col in out._column_labels_and_values: out._data[name] = col._with_type_metadata( tables[0]._data[name].dtype ) @@ -1831,13 +1831,13 @@ def astype( errors: Literal["raise", "ignore"] = "raise", ): if is_dict_like(dtype): - if len(set(dtype.keys()) - set(self._data.names)) > 0: + if len(set(dtype.keys()) - set(self._column_names)) > 0: raise KeyError( "Only a column name can be used for the " "key in a dtype mappings argument." ) else: - dtype = {cc: dtype for cc in self._data.names} + dtype = {cc: dtype for cc in self._column_names} return super().astype(dtype, copy, errors) def _clean_renderable_dataframe(self, output): @@ -2601,7 +2601,7 @@ def equals(self, other) -> bool: # If all other checks matched, validate names. if ret: for self_name, other_name in zip( - self._data.names, other._data.names + self._column_names, other._column_names ): if self_name != other_name: ret = False @@ -2676,7 +2676,7 @@ def columns(self, columns): ) self._data = ColumnAccessor( - data=dict(zip(pd_columns, self._data.columns)), + data=dict(zip(pd_columns, self._columns)), multiindex=multiindex, level_names=level_names, label_dtype=label_dtype, @@ -2698,7 +2698,7 @@ def _set_columns_like(self, other: ColumnAccessor) -> None: f"got {len(self)} elements" ) self._data = ColumnAccessor( - data=dict(zip(other.names, self._data.columns)), + data=dict(zip(other.names, self._columns)), multiindex=other.multiindex, rangeindex=other.rangeindex, level_names=other.level_names, @@ -2983,7 +2983,7 @@ def set_index( elif isinstance(col, (MultiIndex, pd.MultiIndex)): if isinstance(col, pd.MultiIndex): col = MultiIndex.from_pandas(col) - data_to_add.extend(col._data.columns) + data_to_add.extend(col._columns) names.extend(col.names) elif isinstance( col, (cudf.Series, cudf.Index, pd.Series, pd.Index) @@ -3110,7 +3110,9 @@ def where(self, cond, other=None, inplace=False, axis=None, level=None): ) out = [] - for (name, col), other_col in zip(self._data.items(), other_cols): + for (name, col), other_col in zip( + self._column_labels_and_values, other_cols + ): source_col, other_col = _check_and_cast_columns_with_other( source_col=col, other=other_col, @@ -3314,7 +3316,7 @@ def _insert(self, loc, name, value, nan_as_null=None, ignore_index=True): column.column_empty_like( col_data, masked=True, newsize=length ) - for col_data in self._data.values() + for col_data in self._columns ), verify=False, ) @@ -3664,7 +3666,7 @@ def rename( name: col.find_and_replace( to_replace, vals, is_all_na ) - for name, col in self.index._data.items() + for name, col in self.index._column_labels_and_values } ) except OverflowError: @@ -3686,9 +3688,7 @@ def add_prefix(self, prefix, axis=None): raise NotImplementedError("axis is currently not implemented.") # TODO: Change to deep=False when copy-on-write is default out = self.copy(deep=True) - out.columns = [ - prefix + col_name for col_name in list(self._data.keys()) - ] + out.columns = [prefix + col_name for col_name in self._column_names] return out @_performance_tracking @@ -3697,9 +3697,7 @@ def add_suffix(self, suffix, axis=None): raise NotImplementedError("axis is currently not implemented.") # TODO: Change to deep=False when copy-on-write is default out = self.copy(deep=True) - out.columns = [ - col_name + suffix for col_name in 
list(self._data.keys()) - ] + out.columns = [col_name + suffix for col_name in self._column_names] return out @_performance_tracking @@ -4805,7 +4803,7 @@ def _func(x): # pragma: no cover # TODO: naive implementation # this could be written as a single kernel result = {} - for name, col in self._data.items(): + for name, col in self._column_labels_and_values: apply_sr = Series._from_column(col) result[name] = apply_sr.apply(_func)._column @@ -5444,7 +5442,7 @@ def to_pandas( out_index = self.index.to_pandas() out_data = { i: col.to_pandas(nullable=nullable, arrow_type=arrow_type) - for i, col in enumerate(self._data.columns) + for i, col in enumerate(self._columns) } out_df = pd.DataFrame(out_data, index=out_index) @@ -5665,14 +5663,16 @@ def to_arrow(self, preserve_index=None) -> pa.Table: index = index._as_int_index() index.name = "__index_level_0__" if isinstance(index, MultiIndex): - index_descr = list(index._data.names) + index_descr = index._column_names index_levels = index.levels else: index_descr = ( index.names if index.name is not None else ("index",) ) data = data.copy(deep=False) - for gen_name, col_name in zip(index_descr, index._data.names): + for gen_name, col_name in zip( + index_descr, index._column_names + ): data._insert( data.shape[1], gen_name, @@ -5681,7 +5681,7 @@ def to_arrow(self, preserve_index=None) -> pa.Table: out = super(DataFrame, data).to_arrow() metadata = pa.pandas_compat.construct_metadata( - columns_to_convert=[self[col] for col in self._data.names], + columns_to_convert=[self[col] for col in self._column_names], df=self, column_names=out.schema.names, index_levels=index_levels, @@ -5724,12 +5724,12 @@ def to_records(self, index=True, column_dtypes=None, index_dtypes=None): "column_dtypes is currently not supported." ) members = [("index", self.index.dtype)] if index else [] - members += [(col, self[col].dtype) for col in self._data.names] + members += list(self._dtypes) dtype = np.dtype(members) ret = np.recarray(len(self), dtype=dtype) if index: ret["index"] = self.index.to_numpy() - for col in self._data.names: + for col in self._column_names: ret[col] = self[col].to_numpy() return ret @@ -6059,7 +6059,7 @@ def quantile( ) if columns is None: - columns = data_df._data.names + columns = set(data_df._column_names) if isinstance(q, numbers.Number): q_is_number = True @@ -6084,7 +6084,7 @@ def quantile( # Ensure that qs is non-scalar so that we always get a column back. 
interpolation = interpolation or "linear" result = {} - for k in data_df._data.names: + for k in data_df._column_names: if k in columns: ser = data_df[k] res = ser.quantile( @@ -6198,7 +6198,7 @@ def make_false_column_like_self(): if isinstance(values, DataFrame) else {name: values._column for name in self._data} ) - for col, self_col in self._data.items(): + for col, self_col in self._column_labels_and_values: if col in other_cols: other_col = other_cols[col] self_is_cat = isinstance(self_col, CategoricalColumn) @@ -6231,13 +6231,13 @@ def make_false_column_like_self(): else: result[col] = make_false_column_like_self() elif is_dict_like(values): - for name, col in self._data.items(): + for name, col in self._column_labels_and_values: if name in values: result[name] = col.isin(values[name]) else: result[name] = make_false_column_like_self() elif is_list_like(values): - for name, col in self._data.items(): + for name, col in self._column_labels_and_values: result[name] = col.isin(values) else: raise TypeError( @@ -6292,7 +6292,7 @@ def _prepare_for_rowwise_op(self, method, skipna, numeric_only): name: filtered._data[name]._get_mask_as_column() if filtered._data[name].nullable else as_column(True, length=len(filtered._data[name])) - for name in filtered._data.names + for name in filtered._column_names } ) mask = mask.all(axis=1) @@ -6342,7 +6342,7 @@ def count(self, axis=0, numeric_only=False): length = len(self) return Series._from_column( as_column([length - col.null_count for col in self._columns]), - index=cudf.Index(self._data.names), + index=cudf.Index(self._column_names), ) _SUPPORT_AXIS_LOOKUP = { @@ -6409,7 +6409,7 @@ def _reduce( return source._apply_cupy_method_axis_1(op, **kwargs) else: axis_0_results = [] - for col_label, col in source._data.items(): + for col_label, col in source._column_labels_and_values: try: axis_0_results.append(getattr(col, op)(**kwargs)) except AttributeError as err: @@ -6634,7 +6634,7 @@ def _apply_cupy_method_axis_1(self, method, *args, **kwargs): prepared, mask, common_dtype = self._prepare_for_rowwise_op( method, skipna, numeric_only ) - for col in prepared._data.names: + for col in prepared._column_names: if prepared._data[col].nullable: prepared._data[col] = ( prepared._data[col] @@ -6820,7 +6820,7 @@ def select_dtypes(self, include=None, exclude=None): # remove all exclude types inclusion = inclusion - exclude_subtypes - for k, col in self._data.items(): + for k, col in self._column_labels_and_values: infered_type = cudf_dtype_from_pydata_dtype(col.dtype) if infered_type in inclusion: df._insert(len(df._data), k, col) @@ -6840,7 +6840,7 @@ def to_parquet( statistics="ROWGROUP", metadata_file_path=None, int96_timestamps=False, - row_group_size_bytes=ioutils._ROW_GROUP_SIZE_BYTES_DEFAULT, + row_group_size_bytes=None, row_group_size_rows=None, max_page_size_bytes=None, max_page_size_rows=None, @@ -7192,7 +7192,7 @@ def stack(self, level=-1, dropna=no_default, future_stack=False): # Compute the column indices that serves as the input for # `interleave_columns` column_idx_df = pd.DataFrame( - data=range(len(self._data)), index=named_levels + data=range(self._num_columns), index=named_levels ) column_indices: list[list[int]] = [] @@ -7392,17 +7392,17 @@ def to_struct(self, name=None): ----- Note: a copy of the columns is made. """ - if not all(isinstance(name, str) for name in self._data.names): + if not all(isinstance(name, str) for name in self._column_names): warnings.warn( "DataFrame contains non-string column name(s). 
Struct column " "requires field name to be string. Non-string column names " "will be casted to string as the field name." ) - fields = {str(name): col.dtype for name, col in self._data.items()} + fields = {str(name): dtype for name, dtype in self._dtypes} col = StructColumn( data=None, dtype=cudf.StructDtype(fields=fields), - children=tuple(col.copy(deep=True) for col in self._data.columns), + children=tuple(col.copy(deep=True) for col in self._columns), size=len(self), offset=0, ) @@ -7984,7 +7984,7 @@ def value_counts( diff = set(subset) - set(self._data) if len(diff) != 0: raise KeyError(f"columns {diff} do not exist") - columns = list(self._data.names) if subset is None else subset + columns = list(self._column_names) if subset is None else subset result = ( self.groupby( by=columns, @@ -8105,7 +8105,7 @@ def func(left, right, output): right._column_names ) elif _is_scalar_or_zero_d_array(right): - for name, col in output._data.items(): + for name, col in output._column_labels_and_values: output._data[name] = col.fillna(value) return output else: @@ -8387,7 +8387,7 @@ def extract_col(df, col): and col not in df.index._data and not isinstance(df.index, MultiIndex) ): - return df.index._data.columns[0] + return df.index._column return df.index._data[col] diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 7b2bc85b13b..98af006f6e5 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -75,8 +75,15 @@ def _columns(self) -> tuple[ColumnBase, ...]: return self._data.columns @property - def _dtypes(self) -> abc.Iterable: - return zip(self._data.names, (col.dtype for col in self._data.columns)) + def _column_labels_and_values( + self, + ) -> abc.Iterable[tuple[abc.Hashable, ColumnBase]]: + return zip(self._column_names, self._columns) + + @property + def _dtypes(self) -> abc.Generator[tuple[abc.Hashable, Dtype], None, None]: + for label, col in self._column_labels_and_values: + yield label, col.dtype @property def ndim(self) -> int: @@ -87,7 +94,7 @@ def serialize(self): # TODO: See if self._data can be serialized outright header = { "type-serialized": pickle.dumps(type(self)), - "column_names": pickle.dumps(tuple(self._data.names)), + "column_names": pickle.dumps(self._column_names), "column_rangeindex": pickle.dumps(self._data.rangeindex), "column_multiindex": pickle.dumps(self._data.multiindex), "column_label_dtype": pickle.dumps(self._data.label_dtype), @@ -156,7 +163,7 @@ def _mimic_inplace( self, result: Self, inplace: bool = False ) -> Self | None: if inplace: - for col in self._data: + for col in self._column_names: if col in result._data: self._data[col]._mimic_inplace( result._data[col], inplace=True @@ -267,7 +274,7 @@ def __len__(self) -> int: def astype(self, dtype: dict[Any, Dtype], copy: bool = False) -> Self: casted = ( col.astype(dtype.get(col_name, col.dtype), copy=copy) - for col_name, col in self._data.items() + for col_name, col in self._column_labels_and_values ) ca = self._data._from_columns_like_self(casted, verify=False) return self._from_data_like_self(ca) @@ -338,9 +345,7 @@ def equals(self, other) -> bool: return all( self_col.equals(other_col, check_dtypes=True) - for self_col, other_col in zip( - self._data.values(), other._data.values() - ) + for self_col, other_col in zip(self._columns, other._columns) ) @_performance_tracking @@ -434,11 +439,9 @@ def to_array( if dtype is None: if ncol == 1: - dtype = next(iter(self._data.values())).dtype + dtype = next(self._dtypes)[1] else: - dtype = 
find_common_type( - [col.dtype for col in self._data.values()] - ) + dtype = find_common_type([dtype for _, dtype in self._dtypes]) if not isinstance(dtype, numpy.dtype): raise NotImplementedError( @@ -446,12 +449,12 @@ def to_array( ) if self.ndim == 1: - return to_array(self._data.columns[0], dtype) + return to_array(self._columns[0], dtype) else: matrix = module.empty( shape=(len(self), ncol), dtype=dtype, order="F" ) - for i, col in enumerate(self._data.values()): + for i, col in enumerate(self._columns): # TODO: col.values may fail if there is nullable data or an # unsupported dtype. We may want to catch and provide a more # suitable error. @@ -751,7 +754,7 @@ def fillna( filled_columns = [ col.fillna(value[name], method) if name in value else col.copy() - for name, col in self._data.items() + for name, col in self._column_labels_and_values ] return self._mimic_inplace( @@ -988,7 +991,10 @@ def to_arrow(self): index: [[1,2,3]] """ return pa.Table.from_pydict( - {str(name): col.to_arrow() for name, col in self._data.items()} + { + str(name): col.to_arrow() + for name, col in self._column_labels_and_values + } ) @_performance_tracking @@ -1012,7 +1018,9 @@ def _copy_type_metadata(self: Self, other: Self) -> Self: See `ColumnBase._with_type_metadata` for more information. """ - for (name, col), (_, dtype) in zip(self._data.items(), other._dtypes): + for (name, col), (_, dtype) in zip( + self._column_labels_and_values, other._dtypes + ): self._data.set_by_label(name, col._with_type_metadata(dtype)) return self @@ -1422,7 +1430,7 @@ def _split(self, splits): """ return [ self._from_columns_like_self( - libcudf.copying.columns_split([*self._data.columns], splits)[ + libcudf.copying.columns_split(list(self._columns), splits)[ split_idx ], self._column_names, @@ -1432,7 +1440,7 @@ def _split(self, splits): @_performance_tracking def _encode(self): - columns, indices = libcudf.transform.table_encode([*self._columns]) + columns, indices = libcudf.transform.table_encode(list(self._columns)) keys = self._from_columns_like_self(columns) return keys, indices @@ -1578,7 +1586,7 @@ def __neg__(self): col.unary_operator("not") if col.dtype.kind == "b" else -1 * col - for col in self._data.columns + for col in self._columns ) ) ) @@ -1840,9 +1848,7 @@ def __copy__(self): def __invert__(self): """Bitwise invert (~) for integral dtypes, logical NOT for bools.""" return self._from_data_like_self( - self._data._from_columns_like_self( - (~col for col in self._data.columns) - ) + self._data._from_columns_like_self((~col for col in self._columns)) ) @_performance_tracking diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 6424c8af877..cb8cd0cd28b 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -751,10 +751,8 @@ def agg(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs): ) and not libgroupby._is_all_scan_aggregate(normalized_aggs): # Even with `sort=False`, pandas guarantees that # groupby preserves the order of rows within each group. - left_cols = list( - self.grouping.keys.drop_duplicates()._data.columns - ) - right_cols = list(result_index._data.columns) + left_cols = list(self.grouping.keys.drop_duplicates()._columns) + right_cols = list(result_index._columns) join_keys = [ _match_join_keys(lcol, rcol, "left") for lcol, rcol in zip(left_cols, right_cols) @@ -1483,7 +1481,7 @@ def _post_process_chunk_results( # the column name should be, especially if we applied # a nameless UDF. 
result = result.to_frame( - name=grouped_values._data.names[0] + name=grouped_values._column_names[0] ) else: index_data = group_keys._data.copy(deep=True) @@ -1632,7 +1630,7 @@ def mult(df): if func in {"sum", "product"}: # For `sum` & `product`, boolean types # will need to result in `int64` type. - for name, col in res._data.items(): + for name, col in res._column_labels_and_values: if col.dtype.kind == "b": res._data[name] = col.astype("int") return res @@ -2715,11 +2713,8 @@ class DataFrameGroupBy(GroupBy, GetAttrGetItemMixin): def _reduce_numeric_only(self, op: str): columns = list( name - for name in self.obj._data.names - if ( - is_numeric_dtype(self.obj._data[name].dtype) - and name not in self.grouping.names - ) + for name, dtype in self.obj._dtypes + if (is_numeric_dtype(dtype) and name not in self.grouping.names) ) return self[columns].agg(op) @@ -3209,7 +3204,7 @@ def values(self) -> cudf.core.frame.Frame: """ # If the key columns are in `obj`, filter them out value_column_names = [ - x for x in self._obj._data.names if x not in self._named_columns + x for x in self._obj._column_names if x not in self._named_columns ] value_columns = self._obj._data.select_by_label(value_column_names) return self._obj.__class__._from_data(value_columns) @@ -3224,8 +3219,8 @@ def _handle_series(self, by): self.names.append(by.name) def _handle_index(self, by): - self._key_columns.extend(by._data.columns) - self.names.extend(by._data.names) + self._key_columns.extend(by._columns) + self.names.extend(by._column_names) def _handle_mapping(self, by): by = cudf.Series(by.values(), index=by.keys()) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index b2bd20c4982..cd07c58c5d9 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -122,13 +122,13 @@ def _lexsorted_equal_range( sort_inds = None sort_vals = idx lower_bound = search_sorted( - [*sort_vals._data.columns], + list(sort_vals._columns), keys, side="left", ascending=sort_vals.is_monotonic_increasing, ).element_indexing(0) upper_bound = search_sorted( - [*sort_vals._data.columns], + list(sort_vals._columns), keys, side="right", ascending=sort_vals.is_monotonic_increasing, @@ -286,6 +286,20 @@ def name(self): def name(self, value): self._name = value + @property + @_performance_tracking + def _column_names(self) -> tuple[Any]: + return (self.name,) + + @property + @_performance_tracking + def _columns(self) -> tuple[ColumnBase]: + return (self._values,) + + @property + def _column_labels_and_values(self) -> Iterable: + return zip(self._column_names, self._columns) + @property # type: ignore @_performance_tracking def start(self) -> int: @@ -1068,7 +1082,7 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): else: inputs = { name: (col, None, False, None) - for name, col in self._data.items() + for name, col in self._column_labels_and_values } data = self._apply_cupy_ufunc_to_operands( diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index fd6bf37f0e6..810d4ad74e7 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -294,7 +294,7 @@ def _num_rows(self) -> int: @property def _index_names(self) -> tuple[Any, ...]: # TODO: Tuple[str]? 
- return self.index._data.names + return self.index._column_names @classmethod def _from_data( @@ -307,6 +307,7 @@ def _from_data( raise ValueError( f"index must be None or a cudf.Index not {type(index).__name__}" ) + # out._num_rows requires .index to be defined out._index = RangeIndex(out._data.nrows) if index is None else index return out @@ -882,7 +883,7 @@ def replace( columns_dtype_map=dict(self._dtypes), ) copy_data = [] - for name, col in self._data.items(): + for name, col in self._column_labels_and_values: try: replaced = col.find_and_replace( to_replace_per_column[name], @@ -2703,11 +2704,11 @@ def sort_index( by.extend( filter( lambda n: n not in handled, - self.index._data.names, + self.index._column_names, ) ) else: - by = list(idx._data.names) + by = list(idx._column_names) inds = idx._get_sorted_inds( by=by, ascending=ascending, na_position=na_position @@ -3013,7 +3014,7 @@ def _slice(self, arg: slice, keep_index: bool = True) -> Self: columns_to_slice = [ *( - self.index._data.columns + self.index._columns if keep_index and not has_range_index else [] ), @@ -3210,7 +3211,7 @@ def _empty_like(self, keep_index=True) -> Self: result = self._from_columns_like_self( libcudf.copying.columns_empty_like( [ - *(self.index._data.columns if keep_index else ()), + *(self.index._columns if keep_index else ()), *self._columns, ] ), @@ -3227,7 +3228,7 @@ def _split(self, splits, keep_index=True): columns_split = libcudf.copying.columns_split( [ - *(self.index._data.columns if keep_index else []), + *(self.index._columns if keep_index else []), *self._columns, ], splits, @@ -3763,8 +3764,8 @@ def _reindex( idx_dtype_match = (df.index.nlevels == index.nlevels) and all( _is_same_dtype(left_dtype, right_dtype) for left_dtype, right_dtype in zip( - (col.dtype for col in df.index._data.columns), - (col.dtype for col in index._data.columns), + (dtype for _, dtype in df.index._dtypes), + (dtype for _, dtype in index._dtypes), ) ) @@ -3783,7 +3784,7 @@ def _reindex( (name or 0) if isinstance(self, cudf.Series) else name: col - for name, col in df._data.items() + for name, col in df._column_labels_and_values }, index=df.index, ) @@ -3794,7 +3795,7 @@ def _reindex( index = index if index is not None else df.index if column_names is None: - names = list(df._data.names) + names = list(df._column_names) level_names = self._data.level_names multiindex = self._data.multiindex rangeindex = self._data.rangeindex @@ -3948,7 +3949,7 @@ def round(self, decimals=0, how="half_even"): col.round(decimals[name], how=how) if name in decimals and col.dtype.kind in "fiu" else col.copy(deep=True) - for name, col in self._data.items() + for name, col in self._column_labels_and_values ) return self._from_data_like_self( self._data._from_columns_like_self(cols) @@ -4270,7 +4271,7 @@ def _drop_na_columns(self, how="any", subset=None, thresh=None): else: thresh = len(df) - for name, col in df._data.items(): + for name, col in df._column_labels_and_values: check_col = col.nans_to_nulls() no_threshold_valid_count = ( len(col) - check_col.null_count @@ -4305,7 +4306,7 @@ def _drop_na_rows(self, how="any", subset=None, thresh=None): return self._from_columns_like_self( libcudf.stream_compaction.drop_nulls( - [*self.index._data.columns, *data_columns], + [*self.index._columns, *data_columns], how=how, keys=self._positions_from_column_names( subset, offset_by_index_columns=True @@ -4853,7 +4854,7 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): # This works for Index too inputs = { name: (col, None, False, None) 
- for name, col in self._data.items() + for name, col in self._column_labels_and_values } index = self.index @@ -4933,7 +4934,7 @@ def repeat(self, repeats, axis=None): """ res = self._from_columns_like_self( Frame._repeat( - [*self.index._data.columns, *self._columns], repeats, axis + [*self.index._columns, *self._columns], repeats, axis ), self._column_names, self._index_names, @@ -6224,7 +6225,7 @@ def _preprocess_subset(self, subset): not np.iterable(subset) or isinstance(subset, str) or isinstance(subset, tuple) - and subset in self._data.names + and subset in self._column_names ): subset = (subset,) diff = set(subset) - set(self._data) @@ -6306,8 +6307,8 @@ def rank( ) numeric_cols = ( name - for name in self._data.names - if _is_non_decimal_numeric_dtype(self._data[name]) + for name, dtype in self._dtypes + if _is_non_decimal_numeric_dtype(dtype) ) source = self._get_columns_by_label(numeric_cols) if source.empty: diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py index b65bc7af832..cfeaca00888 100644 --- a/python/cudf/cudf/core/join/join.py +++ b/python/cudf/cudf/core/join/join.py @@ -140,11 +140,15 @@ def __init__( # right_on. self._using_left_index = bool(left_index) left_on = ( - lhs.index._data.names if left_index else left_on if left_on else on + lhs.index._column_names + if left_index + else left_on + if left_on + else on ) self._using_right_index = bool(right_index) right_on = ( - rhs.index._data.names + rhs.index._column_names if right_index else right_on if right_on @@ -334,18 +338,18 @@ def _merge_results( # All columns from the left table make it into the output. Non-key # columns that share a name with a column in the right table are # suffixed with the provided suffix. - common_names = set(left_result._data.names) & set( - right_result._data.names + common_names = set(left_result._column_names) & set( + right_result._column_names ) cols_to_suffix = common_names - self._key_columns_with_same_name data = { (f"{name}{self.lsuffix}" if name in cols_to_suffix else name): col - for name, col in left_result._data.items() + for name, col in left_result._column_labels_and_values } # The right table follows the same rule as the left table except that # key columns from the right table are removed. - for name, col in right_result._data.items(): + for name, col in right_result._column_labels_and_values: if name in common_names: if name not in self._key_columns_with_same_name: data[f"{name}{self.rsuffix}"] = col @@ -399,7 +403,7 @@ def _sort_result(self, result: cudf.DataFrame) -> cudf.DataFrame: # producing the input result. by: list[Any] = [] if self._using_left_index and self._using_right_index: - by.extend(result.index._data.columns) + by.extend(result.index._columns) if not self._using_left_index: by.extend([result._data[col.name] for col in self._left_keys]) if not self._using_right_index: diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index e00890ac5c3..6de3981ba66 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -36,7 +36,7 @@ from cudf.utils.utils import NotIterable, _external_only_api, _is_same_name if TYPE_CHECKING: - from collections.abc import Generator + from collections.abc import Generator, Hashable from typing_extensions import Self @@ -233,8 +233,8 @@ def names(self, value): # to unexpected behavior in some cases. This is # definitely buggy, but we can't disallow non-unique # names either... 
- self._data = self._data.__class__( - dict(zip(value, self._data.values())), + self._data = type(self._data)( + dict(zip(value, self._columns)), level_names=self._data.level_names, verify=False, ) @@ -693,19 +693,25 @@ def where(self, cond, other=None, inplace=False): @_performance_tracking def _compute_validity_mask(self, index, row_tuple, max_length): """Computes the valid set of indices of values in the lookup""" - lookup = cudf.DataFrame() + lookup_dict = {} for i, row in enumerate(row_tuple): if isinstance(row, slice) and row == slice(None): continue - lookup[i] = cudf.Series(row) - frame = cudf.DataFrame(dict(enumerate(index._data.columns))) + lookup_dict[i] = row + lookup = cudf.DataFrame(lookup_dict) + frame = cudf.DataFrame._from_data( + ColumnAccessor(dict(enumerate(index._columns)), verify=False) + ) with warnings.catch_warnings(): warnings.simplefilter("ignore", FutureWarning) data_table = cudf.concat( [ frame, cudf.DataFrame._from_data( - {"idx": column.as_column(range(len(frame)))} + ColumnAccessor( + {"idx": column.as_column(range(len(frame)))}, + verify=False, + ) ), ], axis=1, @@ -716,7 +722,7 @@ def _compute_validity_mask(self, index, row_tuple, max_length): # TODO: Remove this after merge/join # obtain deterministic ordering. if cudf.get_option("mode.pandas_compatible"): - lookup_order = "_" + "_".join(map(str, lookup._data.names)) + lookup_order = "_" + "_".join(map(str, lookup._column_names)) lookup[lookup_order] = column.as_column(range(len(lookup))) postprocess = operator.methodcaller( "sort_values", by=[lookup_order, "idx"] ) @@ -784,7 +790,7 @@ def _index_and_downcast(self, result, index, index_key): out_index.insert( out_index._num_columns, k, - cudf.Series._from_column(index._data.columns[k]), + cudf.Series._from_column(index._columns[k]), ) # determine if we should downcast from a DataFrame to a Series @@ -800,19 +806,19 @@ ) if need_downcast: result = result.T - return result[result._data.names[0]] + return result[result._column_names[0]] if len(result) == 0 and not slice_access: # Pandas returns an empty Series with a tuple as name # the one expected result column result = cudf.Series._from_data( - {}, name=tuple(col[0] for col in index._data.columns) + {}, name=tuple(col[0] for col in index._columns) ) elif out_index._num_columns == 1: # If there's only one column remaining in the output index, convert # it into an Index and name the final index values according # to that column's name. - *_, last_column = index._data.columns + last_column = index._columns[-1] out_index = cudf.Index._from_column( last_column, name=index.names[-1] ) @@ -894,7 +900,7 @@ def __eq__(self, other): [ self_col.equals(other_col) for self_col, other_col in zip( - self._data.values(), other._data.values() + self._columns, other._columns ) ] ) @@ -1041,9 +1047,11 @@ def to_frame( ) @_performance_tracking - def get_level_values(self, level) -> cudf.Index: + def _level_to_ca_label(self, level) -> tuple[Hashable, int]: """ - Return the values at the requested level + Convert a level to a ColumnAccessor label and an integer position. + + Useful if self._column_names != self.names. Parameters ----------
+ tuple[Hashable, int] + (ColumnAccessor label corresponding to level, integer position of the level) """ - colnames = self._data.names - if level not in colnames: + colnames = self._column_names + try: + level_idx = colnames.index(level) + except ValueError: if isinstance(level, int): if level < 0: level = level + len(colnames) @@ -1067,8 +1078,22 @@ def get_level_values(self, level) -> cudf.Index: level = colnames[level_idx] else: raise KeyError(f"Level not found: '{level}'") - else: - level_idx = colnames.index(level) + return level, level_idx + + @_performance_tracking + def get_level_values(self, level) -> cudf.Index: + """ + Return the values at the requested level + + Parameters + ---------- + level : int or label + + Returns + ------- + An Index containing the values at the requested level. + """ + level, level_idx = self._level_to_ca_label(level) level_values = cudf.Index._from_column( self._data[level], name=self.names[level_idx] ) @@ -1420,57 +1445,6 @@ def from_arrays( codes=codes, levels=levels, sortorder=sortorder, names=names ) - @_performance_tracking - def _poplevels(self, level) -> None | MultiIndex | cudf.Index: - """ - Remove and return the specified levels from self. - - Parameters - ---------- - level : level name or index, list - One or more levels to remove - - Returns - ------- - Index composed of the removed levels. If only a single level - is removed, a flat index is returned. If no levels are specified - (empty list), None is returned. - """ - if not pd.api.types.is_list_like(level): - level = (level,) - - ilevels = sorted(self._level_index_from_level(lev) for lev in level) - - if not ilevels: - return None - - popped_data = {} - popped_names = [] - names = list(self.names) - - # build the popped data and names - for i in ilevels: - n = self._data.names[i] - popped_data[n] = self._data[n] - popped_names.append(self.names[i]) - - # pop the levels out from self - # this must be done iterating backwards - for i in reversed(ilevels): - n = self._data.names[i] - names.pop(i) - popped_data[n] = self._data.pop(n) - - # construct the popped result - popped = cudf.core.index._index_from_data(popped_data) - popped.names = popped_names - - # update self - self.names = names - self._levels, self._codes = _compute_levels_and_codes(self._data) - - return popped - @_performance_tracking def swaplevel(self, i=-2, j=-1) -> Self: """ @@ -1507,10 +1481,10 @@ def swaplevel(self, i=-2, j=-1) -> Self: ('aa', 'b')], ) """ - name_i = self._data.names[i] if isinstance(i, int) else i - name_j = self._data.names[j] if isinstance(j, int) else j + name_i = self._column_names[i] if isinstance(i, int) else i + name_j = self._column_names[j] if isinstance(j, int) else j new_data = {} - for k, v in self._data.items(): + for k, v in self._column_labels_and_values: if k not in (name_i, name_j): new_data[k] = v elif k == name_i: @@ -1523,7 +1497,7 @@ def swaplevel(self, i=-2, j=-1) -> Self: return midx @_performance_tracking - def droplevel(self, level=-1) -> MultiIndex | cudf.Index: + def droplevel(self, level=-1) -> Self | cudf.Index: """ Removes the specified levels from the MultiIndex. 
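A quick sketch of the user-visible behavior the surrounding multiindex.py hunks preserve. This is illustrative only: the index contents are made up, and only public cudf calls (from_tuples, droplevel, get_level_values) plus the behavior described in the hunks are assumed.

import cudf

# Toy two-level index; the labels and names are illustrative.
mi = cudf.MultiIndex.from_tuples(
    [("a", 1), ("a", 2), ("b", 1)], names=["letter", "number"]
)

# Dropping all but one level collapses the MultiIndex to a flat Index,
# matching the single-remaining-column branch in the rewritten droplevel.
flat = mi.droplevel("letter")
assert flat.name == "number"

# get_level_values accepts a level name or an integer position; per the
# hunk above, it now resolves the label via the _level_to_ca_label helper.
assert mi.get_level_values(0).name == "letter"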
@@ -1578,11 +1552,24 @@ def droplevel(self, level=-1) -> MultiIndex | cudf.Index: >>> idx.droplevel(["first", "second"]) Index([0, 1, 2, 0, 1, 2], dtype='int64', name='third') """ - mi = self.copy(deep=False) - mi._poplevels(level) - if mi.nlevels == 1: - return mi.get_level_values(mi.names[0]) + if is_scalar(level): + level = (level,) + elif len(level) == 0: + return self + + new_names = list(self.names) + new_data = self._data.copy(deep=False) + for i in sorted( + (self._level_index_from_level(lev) for lev in level), reverse=True + ): + new_names.pop(i) + new_data.pop(self._data.names[i]) + + if len(new_data) == 1: + return cudf.core.index._index_from_data(new_data) else: + mi = MultiIndex._from_data(new_data) + mi.names = new_names return mi @_performance_tracking @@ -1886,7 +1873,7 @@ def __array_function__(self, func, types, args, kwargs): else: return NotImplemented - def _level_index_from_level(self, level): + def _level_index_from_level(self, level) -> int: """ Return level index from given level name or index """ @@ -1935,7 +1922,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): join_keys = [ _match_join_keys(lcol, rcol, "inner") - for lcol, rcol in zip(target._data.columns, self._data.columns) + for lcol, rcol in zip(target._columns, self._columns) ] join_keys = map(list, zip(*join_keys)) scatter_map, indices = libcudf.join.join( @@ -2132,7 +2119,7 @@ def _split_columns_by_levels( lv if isinstance(lv, int) else level_names.index(lv) for lv in levels } - for i, (name, col) in enumerate(zip(self.names, self._data.columns)): + for i, (name, col) in enumerate(zip(self.names, self._columns)): if in_levels and i in level_indices: name = f"level_{i}" if name is None else name yield name, col @@ -2173,9 +2160,7 @@ def _columns_for_reset_index( ) -> Generator[tuple[Any, column.ColumnBase], None, None]: """Return the columns and column names for .reset_index""" if levels is None: - for i, (col, name) in enumerate( - zip(self._data.columns, self.names) - ): + for i, (col, name) in enumerate(zip(self._columns, self.names)): yield f"level_{i}" if name is None else name, col else: yield from self._split_columns_by_levels(levels, in_levels=True) diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index c026579b8b5..401fef67ee6 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -12,6 +12,7 @@ from cudf._lib.transform import one_hot_encode from cudf._lib.types import size_type_dtype from cudf.api.extensions import no_default +from cudf.api.types import is_scalar from cudf.core._compat import PANDAS_LT_300 from cudf.core.column import ColumnBase, as_column, column_empty_like from cudf.core.column_accessor import ColumnAccessor @@ -409,7 +410,7 @@ def concat( result_columns = None if keys_objs is None: for o in objs: - for name, col in o._data.items(): + for name, col in o._column_labels_and_values: if name in result_data: raise NotImplementedError( f"A Column with duplicate name found: {name}, cuDF " @@ -437,7 +438,7 @@ def concat( else: # All levels in the multiindex label must have the same type has_multiple_level_types = ( - len({type(name) for o in objs for name in o._data.keys()}) > 1 + len({type(name) for o in objs for name in o._column_names}) > 1 ) if has_multiple_level_types: raise NotImplementedError( @@ -446,7 +447,7 @@ def concat( "the labels to the same type." 
) for k, o in zip(keys_objs, objs): - for name, col in o._data.items(): + for name, col in o._column_labels_and_values: # if only series, then only keep keys_objs as column labels # if the existing column is multiindex, prepend it # to handle cases where dfs and srs are concatenated @@ -842,7 +843,7 @@ def get_dummies( else: result_data = { col_name: col - for col_name, col in data._data.items() + for col_name, col in data._column_labels_and_values if col_name not in columns } @@ -942,7 +943,7 @@ def _merge_sorted( columns = [ [ - *(obj.index._data.columns if not ignore_index else ()), + *(obj.index._columns if not ignore_index else ()), *obj._columns, ] for obj in objs @@ -984,7 +985,7 @@ def as_tuple(x): return x if isinstance(x, tuple) else (x,) nrows = len(index_labels) - for col_label, col in df._data.items(): + for col_label, col in df._column_labels_and_values: names = [ as_tuple(col_label) + as_tuple(name) for name in column_labels ] @@ -1008,7 +1009,7 @@ def as_tuple(x): ca = ColumnAccessor( result, multiindex=True, - level_names=(None,) + columns._data.names, + level_names=(None,) + columns._column_names, verify=False, ) return cudf.DataFrame._from_data( @@ -1086,11 +1087,7 @@ def pivot(data, columns=None, index=no_default, values=no_default): # Create a DataFrame composed of columns from both # columns and index ca = ColumnAccessor( - dict( - enumerate( - itertools.chain(index._data.columns, columns._data.columns) - ) - ), + dict(enumerate(itertools.chain(index._columns, columns._columns))), verify=False, ) columns_index = cudf.DataFrame._from_data(ca) @@ -1227,13 +1224,24 @@ def unstack(df, level, fill_value=None, sort: bool = True): ) return res else: - df = df.copy(deep=False) - columns = df.index._poplevels(level) - index = df.index - result = _pivot(df, index, columns) - if result.index.nlevels == 1: - result.index = result.index.get_level_values(result.index.names[0]) - return result + index = df.index.droplevel(level) + if is_scalar(level): + columns = df.index.get_level_values(level) + else: + new_names = [] + ca_data = {} + for lev in level: + ca_level, level_idx = df.index._level_to_ca_label(lev) + new_names.append(df.index.names[level_idx]) + ca_data[ca_level] = df.index._data[ca_level] + columns = type(df.index)._from_data( + ColumnAccessor(ca_data, verify=False) + ) + columns.names = new_names + result = _pivot(df, index, columns) + if result.index.nlevels == 1: + result.index = result.index.get_level_values(result.index.names[0]) + return result def _get_unique(column: ColumnBase, dummy_na: bool) -> ColumnBase: @@ -1548,7 +1556,7 @@ def pivot_table( if values_passed and not values_multi and table._data.multiindex: column_names = table._data.level_names[1:] table_columns = tuple( - map(lambda column: column[1:], table._data.names) + map(lambda column: column[1:], table._column_names) ) table.columns = pd.MultiIndex.from_tuples( tuples=table_columns, names=column_names diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index 7197560b5a4..68f34fa28ff 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -186,7 +186,7 @@ def to_datetime( if isinstance(arg, cudf.DataFrame): # we require at least Ymd required = ["year", "month", "day"] - req = list(set(required) - set(arg._data.names)) + req = list(set(required) - set(arg._column_names)) if len(req): err_req = ",".join(req) raise ValueError( @@ -196,7 +196,7 @@ def to_datetime( ) # replace passed column name with values in 
_unit_map - got_units = {k: get_units(k) for k in arg._data.names} + got_units = {k: get_units(k) for k in arg._column_names} unit_rev = {v: k for k, v in got_units.items()} # keys we don't recognize diff --git a/python/cudf/cudf/core/udf/groupby_utils.py b/python/cudf/cudf/core/udf/groupby_utils.py index 265b87350ae..3af662b62ea 100644 --- a/python/cudf/cudf/core/udf/groupby_utils.py +++ b/python/cudf/cudf/core/udf/groupby_utils.py @@ -210,7 +210,7 @@ def _can_be_jitted(frame, func, args): # See https://github.com/numba/numba/issues/4587 return False - if any(col.has_nulls() for col in frame._data.values()): + if any(col.has_nulls() for col in frame._columns): return False np_field_types = np.dtype( list( diff --git a/python/cudf/cudf/core/udf/utils.py b/python/cudf/cudf/core/udf/utils.py index 6d7362952c9..bfe716f0afc 100644 --- a/python/cudf/cudf/core/udf/utils.py +++ b/python/cudf/cudf/core/udf/utils.py @@ -126,25 +126,23 @@ def _get_udf_return_type(argty, func: Callable, args=()): def _all_dtypes_from_frame(frame, supported_types=JIT_SUPPORTED_TYPES): return { - colname: col.dtype - if str(col.dtype) in supported_types - else np.dtype("O") - for colname, col in frame._data.items() + colname: dtype if str(dtype) in supported_types else np.dtype("O") + for colname, dtype in frame._dtypes } def _supported_dtypes_from_frame(frame, supported_types=JIT_SUPPORTED_TYPES): return { - colname: col.dtype - for colname, col in frame._data.items() - if str(col.dtype) in supported_types + colname: dtype + for colname, dtype in frame._dtypes + if str(dtype) in supported_types } def _supported_cols_from_frame(frame, supported_types=JIT_SUPPORTED_TYPES): return { colname: col - for colname, col in frame._data.items() + for colname, col in frame._column_labels_and_values if str(col.dtype) in supported_types } @@ -232,8 +230,8 @@ def _generate_cache_key(frame, func: Callable, args, suffix="__APPLY_UDF"): *cudautils.make_cache_key( func, tuple(_all_dtypes_from_frame(frame).values()) ), - *(col.mask is None for col in frame._data.values()), - *frame._data.keys(), + *(col.mask is None for col in frame._columns), + *frame._column_names, scalar_argtypes, suffix, ) diff --git a/python/cudf/cudf/io/csv.py b/python/cudf/cudf/io/csv.py index a9c20150930..3dc8915bfd1 100644 --- a/python/cudf/cudf/io/csv.py +++ b/python/cudf/cudf/io/csv.py @@ -186,13 +186,13 @@ def to_csv( "Dataframe doesn't have the labels provided in columns" ) - for col in df._data.columns: - if isinstance(col, cudf.core.column.ListColumn): + for _, dtype in df._dtypes: + if isinstance(dtype, cudf.ListDtype): raise NotImplementedError( "Writing to csv format is not yet supported with " "list columns." ) - elif isinstance(col, cudf.core.column.StructColumn): + elif isinstance(dtype, cudf.StructDtype): raise NotImplementedError( "Writing to csv format is not yet supported with " "Struct columns." 
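The csv.py hunk above switches from isinstance checks on column objects to checks on dtypes, which pairs with the new _dtypes (label, dtype) generator added in frame.py. A minimal sketch of that pattern follows; the frame is illustrative, _dtypes is the private helper introduced in this diff, and cudf.ListDtype and cudf.StructDtype are public cudf dtypes.

import cudf

# Illustrative frame: "y" is a list column, which to_csv cannot serialize.
df = cudf.DataFrame({"x": [1, 2], "y": [[1], [2]]})

# Walk (label, dtype) pairs without materializing column objects,
# mirroring the dtype-based guard in the to_csv change above.
unsupported = [
    name
    for name, dtype in df._dtypes
    if isinstance(dtype, (cudf.ListDtype, cudf.StructDtype))
]
print(unsupported)  # ['y'] -- these would trigger NotImplementedError in to_csv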
@@ -203,12 +203,11 @@ def to_csv( # workaround once following issue is fixed: # https://github.com/rapidsai/cudf/issues/6661 if any( - isinstance(col, cudf.core.column.CategoricalColumn) - for col in df._data.columns + isinstance(dtype, cudf.CategoricalDtype) for _, dtype in df._dtypes ) or isinstance(df.index, cudf.CategoricalIndex): df = df.copy(deep=False) - for col_name, col in df._data.items(): - if isinstance(col, cudf.core.column.CategoricalColumn): + for col_name, col in df._column_labels_and_values: + if isinstance(col.dtype, cudf.CategoricalDtype): df._data[col_name] = col.astype(col.categories.dtype) if isinstance(df.index, cudf.CategoricalIndex): diff --git a/python/cudf/cudf/io/dlpack.py b/python/cudf/cudf/io/dlpack.py index 1347b2cc38f..fe8e446f9c0 100644 --- a/python/cudf/cudf/io/dlpack.py +++ b/python/cudf/cudf/io/dlpack.py @@ -79,13 +79,13 @@ def to_dlpack(cudf_obj): ) if any( - not cudf.api.types._is_non_decimal_numeric_dtype(col.dtype) - for col in gdf._data.columns + not cudf.api.types._is_non_decimal_numeric_dtype(dtype) + for _, dtype in gdf._dtypes ): raise TypeError("non-numeric data not yet supported") dtype = cudf.utils.dtypes.find_common_type( - [col.dtype for col in gdf._data.columns] + [dtype for _, dtype in gdf._dtypes] ) gdf = gdf.astype(dtype) diff --git a/python/cudf/cudf/io/orc.py b/python/cudf/cudf/io/orc.py index fd246c6215f..c54293badbe 100644 --- a/python/cudf/cudf/io/orc.py +++ b/python/cudf/cudf/io/orc.py @@ -396,8 +396,8 @@ def to_orc( ): """{docstring}""" - for col in df._data.columns: - if isinstance(col, cudf.core.column.CategoricalColumn): + for _, dtype in df._dtypes: + if isinstance(dtype, cudf.CategoricalDtype): raise NotImplementedError( "Writing to ORC format is not yet supported with " "Categorical columns." diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index 62be7378e9e..ce99f98b559 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -64,7 +64,7 @@ def _write_parquet( statistics="ROWGROUP", metadata_file_path=None, int96_timestamps=False, - row_group_size_bytes=ioutils._ROW_GROUP_SIZE_BYTES_DEFAULT, + row_group_size_bytes=None, row_group_size_rows=None, max_page_size_bytes=None, max_page_size_rows=None, @@ -149,7 +149,7 @@ def write_to_dataset( return_metadata=False, statistics="ROWGROUP", int96_timestamps=False, - row_group_size_bytes=ioutils._ROW_GROUP_SIZE_BYTES_DEFAULT, + row_group_size_bytes=None, row_group_size_rows=None, max_page_size_bytes=None, max_page_size_rows=None, @@ -205,7 +205,7 @@ def write_to_dataset( If ``False``, timestamps will not be altered. row_group_size_bytes: integer or None, default None Maximum size of each stripe of the output. - If None, 134217728 (128MB) will be used. + If None, no limit on row group stripe size will be used. row_group_size_rows: integer or None, default None Maximum number of rows of each stripe of the output. If None, 1000000 will be used. 
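A practical note on the row_group_size_bytes change above: the parquet writer no longer applies a 128 MB row-group cap by default. Callers who relied on the old default can opt back in explicitly, as in this sketch (the data and file names are illustrative):

import cudf

df = cudf.DataFrame({"a": range(1_000_000)})

# New default: no byte-size cap is applied to row groups.
df.to_parquet("out.parquet")

# Restore the previous 128 MB cap by passing it explicitly.
df.to_parquet("out_capped.parquet", row_group_size_bytes=128 * 1024 * 1024)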
@@ -980,7 +980,7 @@ def to_parquet( statistics="ROWGROUP", metadata_file_path=None, int96_timestamps=False, - row_group_size_bytes=ioutils._ROW_GROUP_SIZE_BYTES_DEFAULT, + row_group_size_bytes=None, row_group_size_rows=None, max_page_size_bytes=None, max_page_size_rows=None, diff --git a/python/cudf/cudf/pandas/__main__.py b/python/cudf/cudf/pandas/__main__.py index 3a82829eb7a..e0d3d9101a9 100644 --- a/python/cudf/cudf/pandas/__main__.py +++ b/python/cudf/cudf/pandas/__main__.py @@ -10,6 +10,7 @@ """ import argparse +import code import runpy import sys import tempfile @@ -21,6 +22,8 @@ @contextmanager def profile(function_profile, line_profile, fn): + if fn is None and (line_profile or function_profile): + raise RuntimeError("Enabling the profiler requires a script name.") if line_profile: with open(fn) as f: lines = f.readlines() @@ -54,6 +57,11 @@ def main(): dest="module", nargs=1, ) + parser.add_argument( + "-c", + dest="cmd", + nargs=1, + ) parser.add_argument( "--profile", action="store_true", @@ -72,9 +80,18 @@ def main(): args = parser.parse_args() + if args.cmd: + f = tempfile.NamedTemporaryFile(mode="w+b", suffix=".py") + f.write(args.cmd[0].encode()) + f.seek(0) + args.args.insert(0, f.name) + install() - with profile(args.profile, args.line_profile, args.args[0]) as fn: - args.args[0] = fn + + script_name = args.args[0] if len(args.args) > 0 else None + with profile(args.profile, args.line_profile, script_name) as fn: + if script_name is not None: + args.args[0] = fn if args.module: (module,) = args.module # run the module passing the remaining arguments @@ -85,6 +102,21 @@ def main(): # Remove ourself from argv and continue sys.argv[:] = args.args runpy.run_path(args.args[0], run_name="__main__") + else: + if sys.stdin.isatty(): + banner = f"Python {sys.version} on {sys.platform}" + site_import = not sys.flags.no_site + if site_import: + cprt = 'Type "help", "copyright", "credits" or "license" for more information.' + banner += "\n" + cprt + else: + # Don't show prompts or banners if stdin is not a TTY + sys.ps1 = "" + sys.ps2 = "" + banner = "" + + # Launch an interactive interpreter + code.interact(banner=banner, exitmsg="") if __name__ == "__main__": diff --git a/python/cudf/cudf/pandas/fast_slow_proxy.py b/python/cudf/cudf/pandas/fast_slow_proxy.py index afa1ce5f86c..0c1cda8810b 100644 --- a/python/cudf/cudf/pandas/fast_slow_proxy.py +++ b/python/cudf/cudf/pandas/fast_slow_proxy.py @@ -881,6 +881,26 @@ def _assert_fast_slow_eq(left, right): assert_eq(left, right) +class ProxyFallbackError(Exception): + """Raised when fallback occurs""" + + pass + + +def _fast_function_call(): + """ + Placeholder fast function for pytest profiling purposes. + """ + return None + + +def _slow_function_call(): + """ + Placeholder slow function for pytest profiling purposes. + """ + return None + + def _fast_slow_function_call( func: Callable, /, @@ -910,6 +930,7 @@ def _fast_slow_function_call( # try slow path raise Exception() fast = True + _fast_function_call() if _env_get_bool("CUDF_PANDAS_DEBUGGING", False): try: with nvtx.annotate( @@ -942,6 +963,10 @@ def _fast_slow_function_call( f"The exception was {e}." ) except Exception as err: + if _env_get_bool("CUDF_PANDAS_FAIL_ON_FALLBACK", False): + raise ProxyFallbackError( + f"The operation failed with cuDF, the reason was {type(err)}: {err}." 
+ ) from err with nvtx.annotate( "EXECUTE_SLOW", color=_CUDF_PANDAS_NVTX_COLORS["EXECUTE_SLOW"], @@ -952,6 +977,7 @@ def _fast_slow_function_call( from ._logger import log_fallback log_fallback(slow_args, slow_kwargs, err) + _slow_function_call() with disable_module_accelerator(): result = func(*slow_args, **slow_kwargs) return _maybe_wrap_result(result, func, *args, **kwargs), fast diff --git a/python/cudf/cudf/pandas/scripts/conftest-patch.py b/python/cudf/cudf/pandas/scripts/conftest-patch.py index 505a40b0bfa..d12d2697729 100644 --- a/python/cudf/cudf/pandas/scripts/conftest-patch.py +++ b/python/cudf/cudf/pandas/scripts/conftest-patch.py @@ -1,10 +1,13 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. # All rights reserved. # SPDX-License-Identifier: Apache-2.0 import contextlib +import json import os import sys +import traceback +from collections import defaultdict from functools import wraps import pytest @@ -36,4 +39,58 @@ def patch_testing_functions(): pytest.raises = replace_kwargs({"match": None})(pytest.raises) +# Dictionary to store function call counts +function_call_counts = {} # type: ignore + +# The specific functions to track +FUNCTION_NAME = {"_slow_function_call", "_fast_function_call"} + + +def find_pytest_file(frame): + stack = traceback.extract_stack() + absolute_paths = [frame.filename for frame in stack] + for file in absolute_paths: + if "pandas-testing/pandas-tests/tests" in file and file.rsplit("/", 1)[ + -1 + ].startswith("test_"): + return str(file).rsplit("pandas-tests/", 1)[-1] + return None + + +def trace_calls(frame, event, arg): + if event != "call": + return + code = frame.f_code + func_name = code.co_name + + if func_name in FUNCTION_NAME: + filename = find_pytest_file(frame) + if filename is None: + return + if filename not in function_call_counts: + function_call_counts[filename] = defaultdict(int) + function_call_counts[filename][func_name] += 1 + + +def pytest_sessionstart(session): + # Set the profile function to trace calls + sys.setprofile(trace_calls) + + +def pytest_sessionfinish(session, exitstatus): + # Remove the profile function + sys.setprofile(None) + + +@pytest.hookimpl(trylast=True) +def pytest_unconfigure(config): + if hasattr(config, "workerinput"): + # Running in xdist worker, write the counts before exiting + worker_id = config.workerinput["workerid"] + output_file = f"function_call_counts_worker_{worker_id}.json" + with open(output_file, "w") as f: + json.dump(function_call_counts, f, indent=4) + print(f"Function call counts have been written to {output_file}") + + sys.path.append(os.path.dirname(__file__)) diff --git a/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh b/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh index 9c65b74d081..9b9ce026571 100755 --- a/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh +++ b/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh @@ -64,8 +64,6 @@ markers = [ "skip_ubsan: Tests known to fail UBSAN check", ] EOF - # append the contents of patch-confest.py to conftest.py - cat ../python/cudf/cudf/pandas/scripts/conftest-patch.py >> pandas-tests/conftest.py # Substitute `pandas.tests` with a relative import. 
# This will depend on the location of the test module relative to @@ -137,7 +135,7 @@ and not test_eof_states \ and not test_array_tz" # TODO: Remove "not db" once a postgres & mysql container is set up on the CI -PANDAS_CI="1" timeout 60m python -m pytest -p cudf.pandas \ +PANDAS_CI="1" timeout 90m python -m pytest -p cudf.pandas \ -v -m "not single_cpu and not db" \ -k "$TEST_THAT_NEED_MOTO_SERVER and $TEST_THAT_CRASH_PYTEST_WORKERS and not test_groupby_raises_category_on_category and not test_constructor_no_pandas_array and not test_is_monotonic_na and not test_index_contains and not test_index_contains and not test_frame_op_subclass_nonclass_constructor and not test_round_trip_current" \ --import-mode=importlib \ @@ -146,5 +144,4 @@ PANDAS_CI="1" timeout 60m python -m pytest -p cudf.pandas \ mv *.json .. cd .. - rm -rf pandas-testing/pandas-tests/ diff --git a/python/cudf/cudf/pandas/scripts/summarize-test-results.py b/python/cudf/cudf/pandas/scripts/summarize-test-results.py index ffd2abb960d..4ea0b3b4413 100644 --- a/python/cudf/cudf/pandas/scripts/summarize-test-results.py +++ b/python/cudf/cudf/pandas/scripts/summarize-test-results.py @@ -12,7 +12,9 @@ """ import argparse +import glob import json +import os from rich.console import Console from rich.table import Table @@ -57,6 +59,44 @@ def get_per_module_results(log_file_name): per_module_results[module_name].setdefault(outcome, 0) per_module_results[module_name]["total"] += 1 per_module_results[module_name][outcome] += 1 + + directory = os.path.dirname(log_file_name) + pattern = os.path.join(directory, "function_call_counts_worker_*.json") + matching_files = glob.glob(pattern) + function_call_counts = {} + + for file in matching_files: + with open(file) as f: + function_call_count = json.load(f) + if not function_call_counts: + function_call_counts.update(function_call_count) + else: + for key, value in function_call_count.items(): + if key not in function_call_counts: + function_call_counts[key] = value + else: + if "_slow_function_call" not in function_call_counts[key]: + function_call_counts[key]["_slow_function_call"] = 0 + if "_fast_function_call" not in function_call_counts[key]: + function_call_counts[key]["_fast_function_call"] = 0 + function_call_counts[key]["_slow_function_call"] += ( + value.get("_slow_function_call", 0) + ) + function_call_counts[key]["_fast_function_call"] += ( + value.get("_fast_function_call", 0) + ) + + for key, value in per_module_results.items(): + if key in function_call_counts: + per_module_results[key]["_slow_function_call"] = ( + function_call_counts[key].get("_slow_function_call", 0) + ) + per_module_results[key]["_fast_function_call"] = ( + function_call_counts[key].get("_fast_function_call", 0) + ) + else: + per_module_results[key]["_slow_function_call"] = 0 + per_module_results[key]["_fast_function_call"] = 0 return per_module_results diff --git a/python/cudf/cudf/testing/testing.py b/python/cudf/cudf/testing/testing.py index 31ad24a4664..668e7a77454 100644 --- a/python/cudf/cudf/testing/testing.py +++ b/python/cudf/cudf/testing/testing.py @@ -676,7 +676,7 @@ def assert_frame_equal( if check_like: left, right = left.reindex(index=right.index), right - right = right[list(left._data.names)] + right = right[list(left._column_names)] # index comparison assert_index_equal( diff --git a/python/cudf/cudf/tests/pytest.ini b/python/cudf/cudf/tests/pytest.ini index 2136bca0e28..d05ba9aaacc 100644 --- a/python/cudf/cudf/tests/pytest.ini +++ b/python/cudf/cudf/tests/pytest.ini @@ -14,3 +14,6 @@ 
filterwarnings = ignore:Passing a BlockManager to DataFrame is deprecated:DeprecationWarning # PerformanceWarning from cupy warming up the JIT cache ignore:Jitify is performing a one-time only warm-up to populate the persistent cache:cupy._util.PerformanceWarning + # Ignore numba PEP 456 warning specific to arm machines + ignore:FNV hashing is not implemented in Numba.*:UserWarning +addopts = --tb=native diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py index b1e095e8853..c41be3e4428 100644 --- a/python/cudf/cudf/tests/test_multiindex.py +++ b/python/cudf/cudf/tests/test_multiindex.py @@ -813,8 +813,8 @@ def test_multiindex_copy_deep(data, copy_on_write, deep): mi1 = gdf.groupby(["Date", "Symbol"]).mean().index mi2 = mi1.copy(deep=deep) - lchildren = [col.children for _, col in mi1._data.items()] - rchildren = [col.children for _, col in mi2._data.items()] + lchildren = [col.children for col in mi1._columns] + rchildren = [col.children for col in mi2._columns] # Flatten lchildren = reduce(operator.add, lchildren) @@ -849,12 +849,8 @@ def test_multiindex_copy_deep(data, copy_on_write, deep): assert all((x == y) == same_ref for x, y in zip(lptrs, rptrs)) # Assert ._data identity - lptrs = [ - d.base_data.get_ptr(mode="read") for _, d in mi1._data.items() - ] - rptrs = [ - d.base_data.get_ptr(mode="read") for _, d in mi2._data.items() - ] + lptrs = [d.base_data.get_ptr(mode="read") for d in mi1._columns] + rptrs = [d.base_data.get_ptr(mode="read") for d in mi2._columns] assert all((x == y) == same_ref for x, y in zip(lptrs, rptrs)) cudf.set_option("copy_on_write", original_cow_setting) diff --git a/python/cudf/cudf/tests/text/test_text_methods.py b/python/cudf/cudf/tests/text/test_text_methods.py index 52179f55da3..997ca357986 100644 --- a/python/cudf/cudf/tests/text/test_text_methods.py +++ b/python/cudf/cudf/tests/text/test_text_methods.py @@ -946,6 +946,66 @@ def test_minhash(): strings.str.minhash64(seeds=seeds) +def test_word_minhash(): + ls = cudf.Series([["this", "is", "my"], ["favorite", "book"]]) + + expected = cudf.Series( + [ + cudf.Series([21141582], dtype=np.uint32), + cudf.Series([962346254], dtype=np.uint32), + ] + ) + actual = ls.str.word_minhash() + assert_eq(expected, actual) + seeds = cudf.Series([0, 1, 2], dtype=np.uint32) + expected = cudf.Series( + [ + cudf.Series([21141582, 1232889953, 1268336794], dtype=np.uint32), + cudf.Series([962346254, 2321233602, 1354839212], dtype=np.uint32), + ] + ) + actual = ls.str.word_minhash(seeds=seeds) + assert_eq(expected, actual) + + expected = cudf.Series( + [ + cudf.Series([2603139454418834912], dtype=np.uint64), + cudf.Series([5240044617220523711], dtype=np.uint64), + ] + ) + actual = ls.str.word_minhash64() + assert_eq(expected, actual) + seeds = cudf.Series([0, 1, 2], dtype=np.uint64) + expected = cudf.Series( + [ + cudf.Series( + [ + 2603139454418834912, + 8644371945174847701, + 5541030711534384340, + ], + dtype=np.uint64, + ), + cudf.Series( + [5240044617220523711, 5847101123925041457, 153762819128779913], + dtype=np.uint64, + ), + ] + ) + actual = ls.str.word_minhash64(seeds=seeds) + assert_eq(expected, actual) + + # test wrong seed types + with pytest.raises(ValueError): + ls.str.word_minhash(seeds="a") + with pytest.raises(ValueError): + seeds = cudf.Series([0, 1, 2], dtype=np.int32) + ls.str.word_minhash(seeds=seeds) + with pytest.raises(ValueError): + seeds = cudf.Series([0, 1, 2], dtype=np.uint32) + ls.str.word_minhash64(seeds=seeds) + + def test_jaccard_index(): str1 = 
cudf.Series(["the brown dog", "jumped about"]) str2 = cudf.Series(["the black cat", "jumped around"]) diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index 1627107b57d..1180da321e6 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -27,7 +27,7 @@ fsspec_parquet = None _BYTES_PER_THREAD_DEFAULT = 256 * 1024 * 1024 -_ROW_GROUP_SIZE_BYTES_DEFAULT = 128 * 1024 * 1024 +_ROW_GROUP_SIZE_BYTES_DEFAULT = np.iinfo(np.uint64).max _docstring_remote_sources = """ - cuDF supports local and remote data stores. See configuration details for @@ -275,10 +275,9 @@ timestamp[us] to the int96 format, which is the number of Julian days and the number of nanoseconds since midnight of 1970-01-01. If ``False``, timestamps will not be altered. -row_group_size_bytes: integer, default {row_group_size_bytes_val} +row_group_size_bytes: integer, default None Maximum size of each stripe of the output. - If None, {row_group_size_bytes_val} - ({row_group_size_bytes_val_in_mb} MB) will be used. + If None, no limit on row group stripe size will be used. row_group_size_rows: integer or None, default None Maximum number of rows of each stripe of the output. If None, 1000000 will be used. @@ -346,10 +345,7 @@ See Also -------- cudf.read_parquet -""".format( - row_group_size_bytes_val=_ROW_GROUP_SIZE_BYTES_DEFAULT, - row_group_size_bytes_val_in_mb=_ROW_GROUP_SIZE_BYTES_DEFAULT / 1024 / 1024, -) +""" doc_to_parquet = docfmt_partial(docstring=_docstring_to_parquet) _docstring_merge_parquet_filemetadata = """ diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py index c4ab4b0a853..2bbed40e34e 100644 --- a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py +++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py @@ -26,7 +26,11 @@ from cudf.core._compat import PANDAS_GE_220 from cudf.pandas import LOADED, Profiler -from cudf.pandas.fast_slow_proxy import _Unusable, is_proxy_object +from cudf.pandas.fast_slow_proxy import ( + ProxyFallbackError, + _Unusable, + is_proxy_object, +) from cudf.testing import assert_eq if not LOADED: @@ -1738,3 +1742,13 @@ def add_one_ufunc(a): return a + 1 assert_eq(cp.asarray(add_one_ufunc(arr1)), cp.asarray(add_one_ufunc(arr2))) + + +@pytest.mark.xfail( + reason="Fallback expected because casting to object is not supported", +) +def test_fallback_raises_error(monkeypatch): + with monkeypatch.context() as monkeycontext: + monkeycontext.setenv("CUDF_PANDAS_FAIL_ON_FALLBACK", "True") + with pytest.raises(ProxyFallbackError): + pd.Series(range(2)).astype(object) diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas_no_fallback.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas_no_fallback.py new file mode 100644 index 00000000000..896256bf6d7 --- /dev/null +++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas_no_fallback.py @@ -0,0 +1,100 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. +# All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +import pytest + +from cudf.pandas import LOADED + +if not LOADED: + raise ImportError("These tests must be run with cudf.pandas loaded") + +import numpy as np +import pandas as pd + + +@pytest.fixture(autouse=True) +def fail_on_fallback(monkeypatch): + monkeypatch.setenv("CUDF_PANDAS_FAIL_ON_FALLBACK", "True") + + +@pytest.fixture +def dataframe(): + df = pd.DataFrame( + { + "a": [1, 1, 1, 2, 3], + "b": [1, 2, 3, 4, 5], + "c": [1.2, 1.3, 1.5, 1.7, 1.11], + } + ) + return df + + +@pytest.fixture +def series(dataframe): + return dataframe["a"] + + +@pytest.fixture +def array(series): + return series.values + + +@pytest.mark.parametrize( + "op", + [ + "sum", + "min", + "max", + "mean", + "std", + "var", + "prod", + "median", + ], +) +def test_no_fallback_in_reduction_ops(series, op): + s = series + getattr(s, op)() + + +def test_groupby(dataframe): + df = dataframe + df.groupby("a", sort=True).max() + + +def test_no_fallback_in_binops(dataframe): + df = dataframe + df + df + df - df + df * df + df**df + df[["a", "b"]] & df[["a", "b"]] + df <= df + + +def test_no_fallback_in_groupby_rolling_sum(dataframe): + df = dataframe + df.groupby("a").rolling(2).sum() + + +def test_no_fallback_in_concat(dataframe): + df = dataframe + pd.concat([df, df]) + + +def test_no_fallback_in_get_shape(dataframe): + df = dataframe + df.shape + + +def test_no_fallback_in_array_ufunc_op(array): + np.add(array, array) + + +def test_no_fallback_in_merge(dataframe): + df = dataframe + pd.merge(df * df, df + df, how="inner") + pd.merge(df * df, df + df, how="outer") + pd.merge(df * df, df + df, how="left") + pd.merge(df * df, df + df, how="right") diff --git a/python/cudf/cudf_pandas_tests/test_main.py b/python/cudf/cudf_pandas_tests/test_main.py new file mode 100644 index 00000000000..326224c8fc0 --- /dev/null +++ b/python/cudf/cudf_pandas_tests/test_main.py @@ -0,0 +1,100 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +import subprocess +import tempfile +import textwrap + + +def _run_python(*, cudf_pandas, command): + executable = "python " + if cudf_pandas: + executable += "-m cudf.pandas " + return subprocess.run( + executable + command, + shell=True, + capture_output=True, + check=True, + text=True, + ) + + +def test_run_cudf_pandas_with_script(): + with tempfile.NamedTemporaryFile(mode="w", suffix=".py", delete=True) as f: + code = textwrap.dedent( + """ + import pandas as pd + df = pd.DataFrame({'a': [1, 2, 3]}) + print(df['a'].sum()) + """ + ) + f.write(code) + f.flush() + + res = _run_python(cudf_pandas=True, command=f.name) + expect = _run_python(cudf_pandas=False, command=f.name) + + assert res.stdout != "" + assert res.stdout == expect.stdout + + +def test_run_cudf_pandas_with_script_with_cmd_args(): + input_args_and_code = """-c 'import pandas as pd; df = pd.DataFrame({"a": [1, 2, 3]}); print(df["a"].sum())'""" + + res = _run_python(cudf_pandas=True, command=input_args_and_code) + expect = _run_python(cudf_pandas=False, command=input_args_and_code) + + assert res.stdout != "" + assert res.stdout == expect.stdout + + +def test_run_cudf_pandas_with_script_with_cmd_args_check_cudf(): + """Verify that cudf is active with -m cudf.pandas.""" + input_args_and_code = """-c 'import pandas as pd; print(pd)'""" + + res = _run_python(cudf_pandas=True, command=input_args_and_code) + expect = _run_python(cudf_pandas=False, command=input_args_and_code) + + assert "cudf" in res.stdout + assert "cudf" not in expect.stdout + + +def test_cudf_pandas_script_repl(): + def start_repl_process(cmd): + return subprocess.Popen( + cmd.split(), + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + text=True, + ) + + def get_repl_output(process, commands): + for command in commands: + process.stdin.write(command) + process.stdin.flush() + return process.communicate() + + p1 = start_repl_process("python -m cudf.pandas") + p2 = start_repl_process("python") + commands = [ + "import pandas as pd\n", + "print(pd.Series(range(2)).sum())\n", + "print(pd.Series(range(5)).sum())\n", + "import sys\n", + "print(pd.Series(list('abcd')), file=sys.stderr)\n", + ] + + res = get_repl_output(p1, commands) + expect = get_repl_output(p2, commands) + + # Check stdout + assert res[0] != "" + assert res[0] == expect[0] + + # Check stderr + assert res[1] != "" + assert res[1] == expect[1] + + p1.kill() + p2.kill() diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml b/python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml index f742f46c7ed..84b731e6c51 100644 --- a/python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml +++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml @@ -182,7 +182,7 @@ dependencies: common: - output_types: conda packages: - - cudf==24.10.*,>=0.0.0a0 + - cudf==24.12.*,>=0.0.0a0 - pandas - pytest - pytest-xdist @@ -248,13 +248,13 @@ dependencies: common: - output_types: conda packages: - - cuml==24.10.*,>=0.0.0a0 + - cuml==24.12.*,>=0.0.0a0 - scikit-learn test_cugraph: common: - output_types: conda packages: - - cugraph==24.10.*,>=0.0.0a0 + - cugraph==24.12.*,>=0.0.0a0 - networkx test_ibis: common: diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/pytest.ini b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/pytest.ini index 817d98e6ba2..98459035298 100644 ---
a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/pytest.ini +++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/pytest.ini @@ -1,3 +1,5 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + [pytest] xfail_strict=true markers= @@ -5,3 +7,4 @@ markers= xfail_gold: this test is expected to fail in the gold pass xfail_cudf_pandas: this test is expected to fail in the cudf_pandas pass xfail_compare: this test is expected to fail in the comparison pass +addopts = --tb=native diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index 5833ee43c07..f90cb96e189 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -23,7 +23,7 @@ dependencies = [ "cuda-python>=11.7.1,<12.0a0", "cupy-cuda11x>=12.0.0", "fsspec>=0.6.0", - "libcudf==24.10.*,>=0.0.0a0", + "libcudf==24.12.*,>=0.0.0a0", "numba>=0.57", "numpy>=1.23,<3.0a0", "nvtx>=0.2.1", @@ -31,9 +31,9 @@ dependencies = [ "pandas>=2.0,<2.2.3dev0", "ptxcompiler", "pyarrow>=14.0.0,<18.0.0a0", - "pylibcudf==24.10.*,>=0.0.0a0", + "pylibcudf==24.12.*,>=0.0.0a0", "rich", - "rmm==24.10.*,>=0.0.0a0", + "rmm==24.12.*,>=0.0.0a0", "typing_extensions>=4.0.0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ @@ -131,11 +131,11 @@ matrix-entry = "cuda_suffixed=true" requires = [ "cmake>=3.26.4,!=3.30.0", "cython>=3.0.3", - "libcudf==24.10.*,>=0.0.0a0", - "librmm==24.10.*,>=0.0.0a0", + "libcudf==24.12.*,>=0.0.0a0", + "librmm==24.12.*,>=0.0.0a0", "ninja", - "pylibcudf==24.10.*,>=0.0.0a0", - "rmm==24.10.*,>=0.0.0a0", + "pylibcudf==24.12.*,>=0.0.0a0", + "rmm==24.12.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. [tool.scikit-build] diff --git a/python/cudf_kafka/cudf_kafka/tests/pytest.ini b/python/cudf_kafka/cudf_kafka/tests/pytest.ini new file mode 100644 index 00000000000..7b0a9f29fb1 --- /dev/null +++ b/python/cudf_kafka/cudf_kafka/tests/pytest.ini @@ -0,0 +1,4 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +[pytest] +addopts = --tb=native diff --git a/python/cudf_kafka/pyproject.toml b/python/cudf_kafka/pyproject.toml index 6ca798bb11c..a1a3ec37842 100644 --- a/python/cudf_kafka/pyproject.toml +++ b/python/cudf_kafka/pyproject.toml @@ -18,7 +18,7 @@ authors = [ license = { text = "Apache 2.0" } requires-python = ">=3.10" dependencies = [ - "cudf==24.10.*,>=0.0.0a0", + "cudf==24.12.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
[project.optional-dependencies] diff --git a/python/cudf_polars/cudf_polars/__init__.py b/python/cudf_polars/cudf_polars/__init__.py index 41d06f8631b..c1317e8f467 100644 --- a/python/cudf_polars/cudf_polars/__init__.py +++ b/python/cudf_polars/cudf_polars/__init__.py @@ -10,10 +10,14 @@ from __future__ import annotations +# Check we have a supported polars version +import cudf_polars.utils.versions as v from cudf_polars._version import __git_commit__, __version__ from cudf_polars.callback import execute_with_cudf from cudf_polars.dsl.translate import translate_ir +del v + __all__: list[str] = [ "execute_with_cudf", "translate_ir", diff --git a/python/cudf_polars/cudf_polars/callback.py b/python/cudf_polars/cudf_polars/callback.py index f31193aa938..76816ee0a61 100644 --- a/python/cudf_polars/cudf_polars/callback.py +++ b/python/cudf_polars/cudf_polars/callback.py @@ -5,19 +5,26 @@ from __future__ import annotations +import contextlib import os import warnings -from functools import partial +from functools import cache, partial from typing import TYPE_CHECKING import nvtx -from polars.exceptions import PerformanceWarning +from polars.exceptions import ComputeError, PerformanceWarning + +import rmm +from rmm._cuda import gpu from cudf_polars.dsl.translate import translate_ir if TYPE_CHECKING: + from collections.abc import Generator + import polars as pl + from polars import GPUEngine from cudf_polars.dsl.ir import IR from cudf_polars.typing import NodeTraverser @@ -25,23 +32,126 @@ __all__: list[str] = ["execute_with_cudf"] +@cache +def default_memory_resource(device: int) -> rmm.mr.DeviceMemoryResource: + """ + Return the default memory resource for cudf-polars. + + Parameters + ---------- + device + Disambiguating device id when selecting the device. Must be + the active device when this function is called. + + Returns + ------- + rmm.mr.DeviceMemoryResource + The default memory resource that cudf-polars uses. Currently + an async pool resource. + """ + try: + return rmm.mr.CudaAsyncMemoryResource() + except RuntimeError as e: # pragma: no cover + msg, *_ = e.args + if ( + msg.startswith("RMM failure") + and msg.find("not supported with this CUDA driver/runtime version") > -1 + ): + raise ComputeError( + "GPU engine requested, but incorrect cudf-polars package installed. " + "If your system has a CUDA 11 driver, please uninstall `cudf-polars-cu12` " + "and install `cudf-polars-cu11`" + ) from None + else: + raise + + +@contextlib.contextmanager +def set_memory_resource( + mr: rmm.mr.DeviceMemoryResource | None, +) -> Generator[rmm.mr.DeviceMemoryResource, None, None]: + """ + Set the current memory resource for an execution block. + + Parameters + ---------- + mr + Memory resource to use. If `None`, calls :func:`default_memory_resource` + to obtain an mr on the currently active device. + + Returns + ------- + Memory resource used. + + Notes + ----- + At exit, the memory resource is restored to whatever was current + at entry. If a memory resource is provided, it must be valid to + use with the currently active device. + """ + if mr is None: + device: int = gpu.getDevice() + mr = default_memory_resource(device) + previous = rmm.mr.get_current_device_resource() + rmm.mr.set_current_device_resource(mr) + try: + yield mr + finally: + rmm.mr.set_current_device_resource(previous) + + +@contextlib.contextmanager +def set_device(device: int | None) -> Generator[int, None, None]: + """ + Set the device the query is executed on. + + Parameters + ---------- + device + Device to use. 
If `None`, uses the current device.
+
+    Returns
+    -------
+    Device active for the execution of the block.
+
+    Notes
+    -----
+    At exit, the device is restored to whatever was current at entry.
+    """
+    previous: int = gpu.getDevice()
+    if device is not None:
+        gpu.setDevice(device)
+    try:
+        yield previous
+    finally:
+        gpu.setDevice(previous)
+
+
 def _callback(
     ir: IR,
     with_columns: list[str] | None,
     pyarrow_predicate: str | None,
     n_rows: int | None,
+    *,
+    device: int | None,
+    memory_resource: rmm.mr.DeviceMemoryResource | None,
 ) -> pl.DataFrame:
     assert with_columns is None
     assert pyarrow_predicate is None
     assert n_rows is None
-    with nvtx.annotate(message="ExecuteIR", domain="cudf_polars"):
+    with (
+        nvtx.annotate(message="ExecuteIR", domain="cudf_polars"),
+        # Device must be set before memory resource is obtained.
+        set_device(device),
+        set_memory_resource(memory_resource),
+    ):
         return ir.evaluate(cache={}).to_polars()


 def execute_with_cudf(
     nt: NodeTraverser,
     *,
-    raise_on_fail: bool = False,
+    config: GPUEngine,
     exception: type[Exception] | tuple[type[Exception], ...] = Exception,
 ) -> None:
     """
@@ -52,9 +162,8 @@ def execute_with_cudf(
     nt
         NodeTraverser

-    raise_on_fail
-        Should conversion raise an exception rather than continuing
-        without setting a callback.
+    config
+        GPUEngine configuration object

     exception
         Optional exception, or tuple of exceptions, to catch during
@@ -62,9 +171,23 @@
     The NodeTraverser is mutated if the libcudf executor can handle the plan.
     """
+    device = config.device
+    memory_resource = config.memory_resource
+    raise_on_fail = config.config.get("raise_on_fail", False)
+    if unsupported := (config.config.keys() - {"raise_on_fail"}):
+        raise ValueError(
+            f"Engine configuration contains unsupported settings {unsupported}"
+        )
     try:
         with nvtx.annotate(message="ConvertIR", domain="cudf_polars"):
-            nt.set_udf(partial(_callback, translate_ir(nt)))
+            nt.set_udf(
+                partial(
+                    _callback,
+                    translate_ir(nt),
+                    device=device,
+                    memory_resource=memory_resource,
+                )
+            )
     except exception as e:
         if bool(int(os.environ.get("POLARS_VERBOSE", 0))):
             warnings.warn(
diff --git a/python/cudf_polars/cudf_polars/containers/column.py b/python/cudf_polars/cudf_polars/containers/column.py
index dd3b771e305..3fe3e5557cb 100644
--- a/python/cudf_polars/cudf_polars/containers/column.py
+++ b/python/cudf_polars/cudf_polars/containers/column.py
@@ -84,6 +84,34 @@ def sorted_like(self, like: Column, /) -> Self:
             is_sorted=like.is_sorted, order=like.order, null_order=like.null_order
         )

+    # TODO: Return Column once #16272 is fixed.
+    def astype(self, dtype: plc.DataType) -> plc.Column:
+        """
+        Return the backing column as the requested dtype.
+
+        Parameters
+        ----------
+        dtype
+            Datatype to cast to.
+
+        Returns
+        -------
+        Column of requested type.
+
+        Raises
+        ------
+        RuntimeError
+            If the cast is unsupported.
+
+        Notes
+        -----
+        This only produces a copy if the requested dtype doesn't match
+        the current one.
+        """
+        if self.obj.type() != dtype:
+            return plc.unary.cast(self.obj, dtype)
+        return self.obj
+
     def copy_metadata(self, from_: pl.Series, /) -> Self:
         """
         Copy metadata from a host series onto self.
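For context, a minimal usage sketch of the new configuration path (the frame contents and device ordinal are illustrative assumptions, not part of this change): execute_with_cudf now receives the whole GPUEngine object, pulls device and memory_resource from it, and accepts raise_on_fail as the only extra config key, so callers drive everything through the engine.

import polars as pl

import rmm

q = pl.LazyFrame({"a": [1, 2, 3, None]}).select(pl.col("a").sum())

# Both settings are forwarded to _callback, which applies them via
# set_device()/set_memory_resource() around IR evaluation.
engine = pl.GPUEngine(
    device=0,  # illustrative; must be a valid CUDA device ordinal
    memory_resource=rmm.mr.CudaAsyncMemoryResource(),
    raise_on_fail=True,  # any other config key now raises ValueError
)
result = q.collect(engine=engine)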
diff --git a/python/cudf_polars/cudf_polars/containers/dataframe.py b/python/cudf_polars/cudf_polars/containers/dataframe.py index a5c99e2bc11..f3e3862d0cc 100644 --- a/python/cudf_polars/cudf_polars/containers/dataframe.py +++ b/python/cudf_polars/cudf_polars/containers/dataframe.py @@ -7,7 +7,7 @@ import itertools from functools import cached_property -from typing import TYPE_CHECKING, cast +from typing import TYPE_CHECKING import pyarrow as pa import pylibcudf as plc @@ -45,11 +45,19 @@ def copy(self) -> Self: def to_polars(self) -> pl.DataFrame: """Convert to a polars DataFrame.""" + # If the arrow table has empty names, from_arrow produces + # column_$i. But here we know there is only one such column + # (by construction) and it should have an empty name. + # https://github.com/pola-rs/polars/issues/11632 + # To guarantee we produce correct names, we therefore + # serialise with names we control and rename with that map. + name_map = {f"column_{i}": c.name for i, c in enumerate(self.columns)} table: pa.Table = plc.interop.to_arrow( self.table, - [plc.interop.ColumnMetadata(name=c.name) for c in self.columns], + [plc.interop.ColumnMetadata(name=name) for name in name_map], ) - return cast(pl.DataFrame, pl.from_arrow(table)).with_columns( + df: pl.DataFrame = pl.from_arrow(table) + return df.rename(name_map).with_columns( *( pl.col(c.name).set_sorted( descending=c.order == plc.types.Order.DESCENDING diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index e1b4d30b76b..c401e5a2f17 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -21,8 +21,10 @@ from typing import TYPE_CHECKING, Any, ClassVar, NamedTuple import pyarrow as pa +import pyarrow.compute as pc import pylibcudf as plc +from polars.exceptions import InvalidOperationError from polars.polars import _expr_nodes as pl_expr from cudf_polars.containers import Column, NamedColumn @@ -477,12 +479,6 @@ def __init__( self.options = options self.name = name self.children = children - if ( - self.name in (pl_expr.BooleanFunction.Any, pl_expr.BooleanFunction.All) - and not self.options[0] - ): - # With ignore_nulls == False, polars uses Kleene logic - raise NotImplementedError(f"Kleene logic for {self.name}") if self.name == pl_expr.BooleanFunction.IsIn and not all( c.dtype == self.children[0].dtype for c in self.children ): @@ -577,20 +573,31 @@ def do_evaluate( child.evaluate(df, context=context, mapping=mapping) for child in self.children ] - if self.name == pl_expr.BooleanFunction.Any: + # Kleene logic for Any (OR) and All (AND) if ignore_nulls is + # False + if self.name in (pl_expr.BooleanFunction.Any, pl_expr.BooleanFunction.All): + (ignore_nulls,) = self.options (column,) = columns - return Column( - plc.Column.from_scalar( - plc.reduce.reduce(column.obj, plc.aggregation.any(), self.dtype), 1 - ) - ) - elif self.name == pl_expr.BooleanFunction.All: - (column,) = columns - return Column( - plc.Column.from_scalar( - plc.reduce.reduce(column.obj, plc.aggregation.all(), self.dtype), 1 - ) - ) + is_any = self.name == pl_expr.BooleanFunction.Any + agg = plc.aggregation.any() if is_any else plc.aggregation.all() + result = plc.reduce.reduce(column.obj, agg, self.dtype) + if not ignore_nulls and column.obj.null_count() > 0: + # Truth tables + # Any All + # | F U T | F U T + # --+------ --+------ + # F | F U T F | F F F + # U | U U T U | F U U + # T | T T T T | F U T + # + # If the input null count was non-zero, we must + # post-process 
the result to insert the correct value. + h_result = plc.interop.to_arrow(result).as_py() + if is_any and not h_result or not is_any and h_result: + # Any All + # False || Null => Null True && Null => Null + return Column(plc.Column.all_null_like(column.obj, 1)) + return Column(plc.Column.from_scalar(result, 1)) if self.name == pl_expr.BooleanFunction.IsNull: (column,) = columns return Column(plc.unary.is_null(column.obj)) @@ -598,13 +605,19 @@ def do_evaluate( (column,) = columns return Column(plc.unary.is_valid(column.obj)) elif self.name == pl_expr.BooleanFunction.IsNan: - # TODO: copy over null mask since is_nan(null) => null in polars (column,) = columns - return Column(plc.unary.is_nan(column.obj)) + return Column( + plc.unary.is_nan(column.obj).with_mask( + column.obj.null_mask(), column.obj.null_count() + ) + ) elif self.name == pl_expr.BooleanFunction.IsNotNan: - # TODO: copy over null mask since is_not_nan(null) => null in polars (column,) = columns - return Column(plc.unary.is_not_nan(column.obj)) + return Column( + plc.unary.is_not_nan(column.obj).with_mask( + column.obj.null_mask(), column.obj.null_count() + ) + ) elif self.name == pl_expr.BooleanFunction.IsFirstDistinct: (column,) = columns return self._distinct( @@ -654,26 +667,22 @@ def do_evaluate( ), ) elif self.name == pl_expr.BooleanFunction.AllHorizontal: - if any(c.obj.null_count() > 0 for c in columns): - raise NotImplementedError("Kleene logic for all_horizontal") return Column( reduce( partial( plc.binaryop.binary_operation, - op=plc.binaryop.BinaryOperator.BITWISE_AND, + op=plc.binaryop.BinaryOperator.NULL_LOGICAL_AND, output_type=self.dtype, ), (c.obj for c in columns), ) ) elif self.name == pl_expr.BooleanFunction.AnyHorizontal: - if any(c.obj.null_count() > 0 for c in columns): - raise NotImplementedError("Kleene logic for any_horizontal") return Column( reduce( partial( plc.binaryop.binary_operation, - op=plc.binaryop.BinaryOperator.BITWISE_OR, + op=plc.binaryop.BinaryOperator.NULL_LOGICAL_OR, output_type=self.dtype, ), (c.obj for c in columns), @@ -694,7 +703,7 @@ def do_evaluate( class StringFunction(Expr): - __slots__ = ("name", "options", "children") + __slots__ = ("name", "options", "children", "_regex_program") _non_child = ("dtype", "name", "options") children: tuple[Expr, ...] @@ -713,12 +722,18 @@ def __init__( def _validate_input(self): if self.name not in ( - pl_expr.StringFunction.Lowercase, - pl_expr.StringFunction.Uppercase, - pl_expr.StringFunction.EndsWith, - pl_expr.StringFunction.StartsWith, pl_expr.StringFunction.Contains, + pl_expr.StringFunction.EndsWith, + pl_expr.StringFunction.Lowercase, + pl_expr.StringFunction.Replace, + pl_expr.StringFunction.ReplaceMany, pl_expr.StringFunction.Slice, + pl_expr.StringFunction.Strptime, + pl_expr.StringFunction.StartsWith, + pl_expr.StringFunction.StripChars, + pl_expr.StringFunction.StripCharsStart, + pl_expr.StringFunction.StripCharsEnd, + pl_expr.StringFunction.Uppercase, ): raise NotImplementedError(f"String function {self.name}") if self.name == pl_expr.StringFunction.Contains: @@ -732,11 +747,65 @@ def _validate_input(self): raise NotImplementedError( "Regex contains only supports a scalar pattern" ) + pattern = self.children[1].value.as_py() + try: + self._regex_program = plc.strings.regex_program.RegexProgram.create( + pattern, + flags=plc.strings.regex_flags.RegexFlags.DEFAULT, + ) + except RuntimeError as e: + raise NotImplementedError( + f"Unsupported regex {pattern} for GPU engine." 
+ ) from e + elif self.name == pl_expr.StringFunction.Replace: + _, literal = self.options + if not literal: + raise NotImplementedError("literal=False is not supported for replace") + if not all(isinstance(expr, Literal) for expr in self.children[1:]): + raise NotImplementedError("replace only supports scalar target") + target = self.children[1] + if target.value == pa.scalar("", type=pa.string()): + raise NotImplementedError( + "libcudf replace does not support empty strings" + ) + elif self.name == pl_expr.StringFunction.ReplaceMany: + (ascii_case_insensitive,) = self.options + if ascii_case_insensitive: + raise NotImplementedError( + "ascii_case_insensitive not implemented for replace_many" + ) + if not all( + isinstance(expr, (LiteralColumn, Literal)) for expr in self.children[1:] + ): + raise NotImplementedError("replace_many only supports literal inputs") + target = self.children[1] + if pc.any(pc.equal(target.value, "")).as_py(): + raise NotImplementedError( + "libcudf replace_many is implemented differently from polars " + "for empty strings" + ) elif self.name == pl_expr.StringFunction.Slice: if not all(isinstance(child, Literal) for child in self.children[1:]): raise NotImplementedError( "Slice only supports literal start and stop values" ) + elif self.name == pl_expr.StringFunction.Strptime: + format, _, exact, cache = self.options + if cache: + raise NotImplementedError("Strptime cache is a CPU feature") + if format is None: + raise NotImplementedError("Strptime format is required") + if not exact: + raise NotImplementedError("Strptime does not support exact=False") + elif self.name in { + pl_expr.StringFunction.StripChars, + pl_expr.StringFunction.StripCharsStart, + pl_expr.StringFunction.StripCharsEnd, + }: + if not isinstance(self.children[1], Literal): + raise NotImplementedError( + "strip operations only support scalar patterns" + ) def do_evaluate( self, @@ -759,12 +828,10 @@ def do_evaluate( else pat.obj ) return Column(plc.strings.find.contains(column.obj, pattern)) - assert isinstance(arg, Literal) - prog = plc.strings.regex_program.RegexProgram.create( - arg.value.as_py(), - flags=plc.strings.regex_flags.RegexFlags.DEFAULT, - ) - return Column(plc.strings.contains.contains_re(column.obj, prog)) + else: + return Column( + plc.strings.contains.contains_re(column.obj, self._regex_program) + ) elif self.name == pl_expr.StringFunction.Slice: child, expr_offset, expr_length = self.children assert isinstance(expr_offset, Literal) @@ -795,6 +862,22 @@ def do_evaluate( plc.interop.from_arrow(pa.scalar(stop, type=pa.int32())), ) ) + elif self.name in { + pl_expr.StringFunction.StripChars, + pl_expr.StringFunction.StripCharsStart, + pl_expr.StringFunction.StripCharsEnd, + }: + column, chars = ( + c.evaluate(df, context=context, mapping=mapping) for c in self.children + ) + if self.name == pl_expr.StringFunction.StripCharsStart: + side = plc.strings.SideType.LEFT + elif self.name == pl_expr.StringFunction.StripCharsEnd: + side = plc.strings.SideType.RIGHT + else: + side = plc.strings.SideType.BOTH + return Column(plc.strings.strip.strip(column.obj, side, chars.obj_scalar)) + columns = [ child.evaluate(df, context=context, mapping=mapping) for child in self.children @@ -825,6 +908,51 @@ def do_evaluate( else prefix.obj, ) ) + elif self.name == pl_expr.StringFunction.Strptime: + # TODO: ignores ambiguous + format, strict, exact, cache = self.options + col = self.children[0].evaluate(df, context=context, mapping=mapping) + + is_timestamps = 
plc.strings.convert.convert_datetime.is_timestamp( + col.obj, format.encode() + ) + + if strict: + if not plc.interop.to_arrow( + plc.reduce.reduce( + is_timestamps, + plc.aggregation.all(), + plc.DataType(plc.TypeId.BOOL8), + ) + ).as_py(): + raise InvalidOperationError("conversion from `str` failed.") + else: + not_timestamps = plc.unary.unary_operation( + is_timestamps, plc.unary.UnaryOperator.NOT + ) + + null = plc.interop.from_arrow(pa.scalar(None, type=pa.string())) + res = plc.copying.boolean_mask_scatter( + [null], plc.Table([col.obj]), not_timestamps + ) + return Column( + plc.strings.convert.convert_datetime.to_timestamps( + res.columns()[0], self.dtype, format.encode() + ) + ) + elif self.name == pl_expr.StringFunction.Replace: + column, target, repl = columns + n, _ = self.options + return Column( + plc.strings.replace.replace( + column.obj, target.obj_scalar, repl.obj_scalar, maxrepl=n + ) + ) + elif self.name == pl_expr.StringFunction.ReplaceMany: + column, target, repl = columns + return Column( + plc.strings.replace.replace_multiple(column.obj, target.obj, repl.obj) + ) raise NotImplementedError( f"StringFunction {self.name}" ) # pragma: no cover; handled by init raising @@ -832,6 +960,18 @@ def do_evaluate( class TemporalFunction(Expr): __slots__ = ("name", "options", "children") + _COMPONENT_MAP: ClassVar[dict[pl_expr.TemporalFunction, str]] = { + pl_expr.TemporalFunction.Year: "year", + pl_expr.TemporalFunction.Month: "month", + pl_expr.TemporalFunction.Day: "day", + pl_expr.TemporalFunction.WeekDay: "weekday", + pl_expr.TemporalFunction.Hour: "hour", + pl_expr.TemporalFunction.Minute: "minute", + pl_expr.TemporalFunction.Second: "second", + pl_expr.TemporalFunction.Millisecond: "millisecond", + pl_expr.TemporalFunction.Microsecond: "microsecond", + pl_expr.TemporalFunction.Nanosecond: "nanosecond", + } _non_child = ("dtype", "name", "options") children: tuple[Expr, ...] 
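Taken together with the rewritten do_evaluate in the next hunk, this map widens GPU support from Year alone to every component listed. The Microsecond and Nanosecond branches below have to compose libcudf's separate sub-second fields: polars defines .dt.microsecond() as the whole sub-second value in microseconds, so for 12:34:56.123456789 the libcudf fields are millisecond=123, microsecond=456 and nanosecond=789, giving 123 * 1_000 + 456 == 123_456 microseconds (and 123_456_789 nanoseconds). A hedged polars-level sketch of what now runs on the GPU engine (the data is illustrative, not from this patch):

from datetime import datetime

import polars as pl

q = pl.LazyFrame({"ts": [datetime(2024, 1, 1, 12, 34, 56, 123456)]}).select(
    pl.col("ts").dt.year(),
    pl.col("ts").dt.weekday(),
    pl.col("ts").dt.millisecond(),  # 123
    pl.col("ts").dt.microsecond(),  # 123456 == 123 * 1_000 + 456
)
q.collect(engine="gpu")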
@@ -846,8 +986,8 @@ def __init__( self.options = options self.name = name self.children = children - if self.name != pl_expr.TemporalFunction.Year: - raise NotImplementedError(f"String function {self.name}") + if self.name not in self._COMPONENT_MAP: + raise NotImplementedError(f"Temporal function {self.name}") def do_evaluate( self, @@ -861,12 +1001,59 @@ def do_evaluate( child.evaluate(df, context=context, mapping=mapping) for child in self.children ] - if self.name == pl_expr.TemporalFunction.Year: - (column,) = columns - return Column(plc.datetime.extract_year(column.obj)) - raise NotImplementedError( - f"TemporalFunction {self.name}" - ) # pragma: no cover; init trips first + (column,) = columns + if self.name == pl_expr.TemporalFunction.Microsecond: + millis = plc.datetime.extract_datetime_component(column.obj, "millisecond") + micros = plc.datetime.extract_datetime_component(column.obj, "microsecond") + millis_as_micros = plc.binaryop.binary_operation( + millis, + plc.interop.from_arrow(pa.scalar(1_000, type=pa.int32())), + plc.binaryop.BinaryOperator.MUL, + plc.DataType(plc.TypeId.INT32), + ) + total_micros = plc.binaryop.binary_operation( + micros, + millis_as_micros, + plc.binaryop.BinaryOperator.ADD, + plc.types.DataType(plc.types.TypeId.INT32), + ) + return Column(total_micros) + elif self.name == pl_expr.TemporalFunction.Nanosecond: + millis = plc.datetime.extract_datetime_component(column.obj, "millisecond") + micros = plc.datetime.extract_datetime_component(column.obj, "microsecond") + nanos = plc.datetime.extract_datetime_component(column.obj, "nanosecond") + millis_as_nanos = plc.binaryop.binary_operation( + millis, + plc.interop.from_arrow(pa.scalar(1_000_000, type=pa.int32())), + plc.binaryop.BinaryOperator.MUL, + plc.types.DataType(plc.types.TypeId.INT32), + ) + micros_as_nanos = plc.binaryop.binary_operation( + micros, + plc.interop.from_arrow(pa.scalar(1_000, type=pa.int32())), + plc.binaryop.BinaryOperator.MUL, + plc.types.DataType(plc.types.TypeId.INT32), + ) + total_nanos = plc.binaryop.binary_operation( + nanos, + millis_as_nanos, + plc.binaryop.BinaryOperator.ADD, + plc.types.DataType(plc.types.TypeId.INT32), + ) + total_nanos = plc.binaryop.binary_operation( + total_nanos, + micros_as_nanos, + plc.binaryop.BinaryOperator.ADD, + plc.types.DataType(plc.types.TypeId.INT32), + ) + return Column(total_nanos) + + return Column( + plc.datetime.extract_datetime_component( + column.obj, + self._COMPONENT_MAP[self.name], + ) + ) class UnaryFunction(Expr): @@ -874,6 +1061,51 @@ class UnaryFunction(Expr): _non_child = ("dtype", "name", "options") children: tuple[Expr, ...] 
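+    # Direct mappings to libcudf unary operations. For example,
+    # pl.col("x").sqrt() evaluates as
+    # plc.unary.unary_operation(column, UnaryOperator.SQRT), casting the
+    # input to the expression dtype first when the two differ.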
+ # Note: log, and pow are handled via translation to binops + _OP_MAPPING: ClassVar[dict[str, plc.unary.UnaryOperator]] = { + "sin": plc.unary.UnaryOperator.SIN, + "cos": plc.unary.UnaryOperator.COS, + "tan": plc.unary.UnaryOperator.TAN, + "arcsin": plc.unary.UnaryOperator.ARCSIN, + "arccos": plc.unary.UnaryOperator.ARCCOS, + "arctan": plc.unary.UnaryOperator.ARCTAN, + "sinh": plc.unary.UnaryOperator.SINH, + "cosh": plc.unary.UnaryOperator.COSH, + "tanh": plc.unary.UnaryOperator.TANH, + "arcsinh": plc.unary.UnaryOperator.ARCSINH, + "arccosh": plc.unary.UnaryOperator.ARCCOSH, + "arctanh": plc.unary.UnaryOperator.ARCTANH, + "exp": plc.unary.UnaryOperator.EXP, + "sqrt": plc.unary.UnaryOperator.SQRT, + "cbrt": plc.unary.UnaryOperator.CBRT, + "ceil": plc.unary.UnaryOperator.CEIL, + "floor": plc.unary.UnaryOperator.FLOOR, + "abs": plc.unary.UnaryOperator.ABS, + "bit_invert": plc.unary.UnaryOperator.BIT_INVERT, + "not": plc.unary.UnaryOperator.NOT, + } + _supported_misc_fns = frozenset( + { + "drop_nulls", + "fill_null", + "mask_nans", + "round", + "set_sorted", + "unique", + } + ) + _supported_cum_aggs = frozenset( + { + "cum_min", + "cum_max", + "cum_prod", + "cum_sum", + } + ) + _supported_fns = frozenset().union( + _supported_misc_fns, _supported_cum_aggs, _OP_MAPPING.keys() + ) + def __init__( self, dtype: plc.DataType, name: str, options: tuple[Any, ...], *children: Expr ) -> None: @@ -881,15 +1113,15 @@ def __init__( self.name = name self.options = options self.children = children - if self.name not in ( - "mask_nans", - "round", - "setsorted", - "unique", - "dropnull", - "fill_null", - ): + + if self.name not in UnaryFunction._supported_fns: raise NotImplementedError(f"Unary function {name=}") + if self.name in UnaryFunction._supported_cum_aggs: + (reverse,) = self.options + if reverse: + raise NotImplementedError( + "reverse=True is not supported for cumulative aggregations" + ) def do_evaluate( self, @@ -947,7 +1179,7 @@ def do_evaluate( if maintain_order: return Column(column).sorted_like(values) return Column(column) - elif self.name == "setsorted": + elif self.name == "set_sorted": (column,) = ( child.evaluate(df, context=context, mapping=mapping) for child in self.children @@ -974,7 +1206,7 @@ def do_evaluate( order=order, null_order=null_order, ) - elif self.name == "dropnull": + elif self.name == "drop_nulls": (column,) = ( child.evaluate(df, context=context, mapping=mapping) for child in self.children @@ -994,13 +1226,65 @@ def do_evaluate( ) arg = evaluated.obj_scalar if evaluated.is_scalar else evaluated.obj return Column(plc.replace.replace_nulls(column.obj, arg)) - + elif self.name in self._OP_MAPPING: + column = self.children[0].evaluate(df, context=context, mapping=mapping) + if column.obj.type().id() != self.dtype.id(): + arg = plc.unary.cast(column.obj, self.dtype) + else: + arg = column.obj + return Column(plc.unary.unary_operation(arg, self._OP_MAPPING[self.name])) + elif self.name in UnaryFunction._supported_cum_aggs: + column = self.children[0].evaluate(df, context=context, mapping=mapping) + plc_col = column.obj + col_type = column.obj.type() + # cum_sum casts + # Int8, UInt8, Int16, UInt16 -> Int64 for overflow prevention + # Bool -> UInt32 + # cum_prod casts integer dtypes < int64 and bool to int64 + # See: + # https://github.com/pola-rs/polars/blob/main/crates/polars-ops/src/series/ops/cum_agg.rs + if ( + self.name == "cum_sum" + and col_type.id() + in { + plc.types.TypeId.INT8, + plc.types.TypeId.UINT8, + plc.types.TypeId.INT16, + plc.types.TypeId.UINT16, + } + 
) or ( + self.name == "cum_prod" + and plc.traits.is_integral(col_type) + and plc.types.size_of(col_type) <= 4 + ): + plc_col = plc.unary.cast( + plc_col, plc.types.DataType(plc.types.TypeId.INT64) + ) + elif ( + self.name == "cum_sum" + and column.obj.type().id() == plc.types.TypeId.BOOL8 + ): + plc_col = plc.unary.cast( + plc_col, plc.types.DataType(plc.types.TypeId.UINT32) + ) + if self.name == "cum_sum": + agg = plc.aggregation.sum() + elif self.name == "cum_prod": + agg = plc.aggregation.product() + elif self.name == "cum_min": + agg = plc.aggregation.min() + elif self.name == "cum_max": + agg = plc.aggregation.max() + + return Column(plc.reduce.scan(plc_col, agg, plc.reduce.ScanType.INCLUSIVE)) raise NotImplementedError( f"Unimplemented unary function {self.name=}" ) # pragma: no cover; init trips first def collect_agg(self, *, depth: int) -> AggInfo: """Collect information about aggregations in groupbys.""" + if self.name in {"unique", "drop_nulls"} | self._supported_cum_aggs: + raise NotImplementedError(f"{self.name} in groupby") if depth == 1: # inside aggregation, need to pre-evaluate, groupby # construction has checked that we don't have nested aggs, @@ -1187,11 +1471,7 @@ class Cast(Expr): def __init__(self, dtype: plc.DataType, value: Expr) -> None: super().__init__(dtype) self.children = (value,) - if not ( - plc.traits.is_fixed_width(self.dtype) - and plc.traits.is_fixed_width(value.dtype) - and plc.unary.is_supported_cast(value.dtype, self.dtype) - ): + if not dtypes.can_cast(value.dtype, self.dtype): raise NotImplementedError( f"Can't cast {self.dtype.id().name} to {value.dtype.id().name}" ) @@ -1255,6 +1535,13 @@ def __init__( req = plc.aggregation.variance(ddof=options) elif name == "count": req = plc.aggregation.count(null_handling=plc.types.NullPolicy.EXCLUDE) + elif name == "quantile": + _, quantile = self.children + if not isinstance(quantile, Literal): + raise NotImplementedError("Only support literal quantile values") + req = plc.aggregation.quantile( + quantiles=[quantile.value.as_py()], interp=Agg.interp_mapping[options] + ) else: raise NotImplementedError( f"Unreachable, {name=} is incorrectly listed in _SUPPORTED" @@ -1286,9 +1573,18 @@ def __init__( "count", "std", "var", + "quantile", ] ) + interp_mapping: ClassVar[dict[str, plc.types.Interpolation]] = { + "nearest": plc.types.Interpolation.NEAREST, + "higher": plc.types.Interpolation.HIGHER, + "lower": plc.types.Interpolation.LOWER, + "midpoint": plc.types.Interpolation.MIDPOINT, + "linear": plc.types.Interpolation.LINEAR, + } + def collect_agg(self, *, depth: int) -> AggInfo: """Collect information about aggregations in groupbys.""" if depth >= 1: @@ -1299,7 +1595,19 @@ def collect_agg(self, *, depth: int) -> AggInfo: raise NotImplementedError("Nan propagation in groupby for min/max") (child,) = self.children ((expr, _, _),) = child.collect_agg(depth=depth + 1).requests - if self.request is None: + request = self.request + # These are handled specially here because we don't set up the + # request for the whole-frame agg because we can avoid a + # reduce for these. 
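+        # For example, pl.col("v").first() in a group_by becomes
+        # nth_element(0) with NullPolicy.INCLUDE, so a null in a group's
+        # first row is returned as null rather than being skipped,
+        # matching polars semantics.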
+ if self.name == "first": + request = plc.aggregation.nth_element( + 0, null_handling=plc.types.NullPolicy.INCLUDE + ) + elif self.name == "last": + request = plc.aggregation.nth_element( + -1, null_handling=plc.types.NullPolicy.INCLUDE + ) + if request is None: raise NotImplementedError( f"Aggregation {self.name} in groupby" ) # pragma: no cover; __init__ trips first @@ -1308,7 +1616,7 @@ def collect_agg(self, *, depth: int) -> AggInfo: # Ignore nans in these groupby aggs, do this by masking # nans in the input expr = UnaryFunction(self.dtype, "mask_nans", (), expr) - return AggInfo([(expr, self.request, self)]) + return AggInfo([(expr, request, self)]) def _reduce( self, column: Column, *, request: plc.aggregation.Aggregation @@ -1380,7 +1688,10 @@ def do_evaluate( raise NotImplementedError( f"Agg in context {context}" ) # pragma: no cover; unreachable - (child,) = self.children + + # Aggregations like quantiles may have additional children that were + # preprocessed into pylibcudf requests. + child = self.children[0] return self.op(child.evaluate(df, context=context, mapping=mapping)) @@ -1425,6 +1736,11 @@ def __init__( right: Expr, ) -> None: super().__init__(dtype) + if plc.traits.is_boolean(self.dtype): + # For boolean output types, bitand and bitor implement + # boolean logic, so translate. bitxor also does, but the + # default behaviour is correct. + op = BinOp._BOOL_KLEENE_MAPPING.get(op, op) self.op = op self.children = (left, right) if not plc.binaryop.is_supported_operation( @@ -1436,6 +1752,15 @@ def __init__( f"with output type {self.dtype.id().name}" ) + _BOOL_KLEENE_MAPPING: ClassVar[ + dict[plc.binaryop.BinaryOperator, plc.binaryop.BinaryOperator] + ] = { + plc.binaryop.BinaryOperator.BITWISE_AND: plc.binaryop.BinaryOperator.NULL_LOGICAL_AND, + plc.binaryop.BinaryOperator.BITWISE_OR: plc.binaryop.BinaryOperator.NULL_LOGICAL_OR, + plc.binaryop.BinaryOperator.LOGICAL_AND: plc.binaryop.BinaryOperator.NULL_LOGICAL_AND, + plc.binaryop.BinaryOperator.LOGICAL_OR: plc.binaryop.BinaryOperator.NULL_LOGICAL_OR, + } + _MAPPING: ClassVar[dict[pl_expr.Operator, plc.binaryop.BinaryOperator]] = { pl_expr.Operator.Eq: plc.binaryop.BinaryOperator.EQUAL, pl_expr.Operator.EqValidity: plc.binaryop.BinaryOperator.NULL_EQUALS, diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index e334e6f5cc5..8cd56c8ee3a 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -15,7 +15,6 @@ import dataclasses import itertools -import types from functools import cache from pathlib import Path from typing import TYPE_CHECKING, Any, ClassVar @@ -28,7 +27,7 @@ import cudf_polars.dsl.expr as expr from cudf_polars.containers import DataFrame, NamedColumn -from cudf_polars.utils import sorting +from cudf_polars.utils import dtypes, sorting if TYPE_CHECKING: from collections.abc import Callable, MutableMapping @@ -133,8 +132,7 @@ class IR: def __post_init__(self): """Validate preconditions.""" - if any(dtype.id() == plc.TypeId.EMPTY for dtype in self.schema.values()): - raise NotImplementedError("Cannot make empty columns.") + pass # noqa: PIE790 def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """ @@ -189,32 +187,42 @@ class Scan(IR): """Cloud-related authentication options, currently ignored.""" paths: list[str] """List of paths to read from.""" - file_options: Any - """Options for reading the file. - - Attributes are: - - ``with_columns: list[str]`` of projected columns to return. 
- - ``n_rows: int``: Number of rows to read. - - ``row_index: tuple[name, offset] | None``: Add an integer index - column with given name. - """ + with_columns: list[str] + """Projected columns to return.""" + skip_rows: int + """Rows to skip at the start when reading.""" + n_rows: int + """Number of rows to read after skipping.""" + row_index: tuple[str, int] | None + """If not None add an integer index column of the given name.""" predicate: expr.NamedExpr | None """Mask to apply to the read dataframe.""" def __post_init__(self) -> None: """Validate preconditions.""" + super().__post_init__() if self.typ not in ("csv", "parquet", "ndjson"): # pragma: no cover # This line is unhittable ATM since IPC/Anonymous scan raise # on the polars side raise NotImplementedError(f"Unhandled scan type: {self.typ}") - if self.typ == "ndjson" and self.file_options.n_rows is not None: - raise NotImplementedError("row limit in scan") + if self.typ == "ndjson" and (self.n_rows != -1 or self.skip_rows != 0): + raise NotImplementedError("row limit in scan for json reader") + if self.skip_rows < 0: + # TODO: polars has this implemented for parquet, + # maybe we can do this too? + raise NotImplementedError("slice pushdown for negative slices") + if self.typ == "csv" and self.skip_rows != 0: # pragma: no cover + # This comes from slice pushdown, but that + # optimization doesn't happen right now + raise NotImplementedError("skipping rows in CSV reader") if self.cloud_options is not None and any( self.cloud_options.get(k) is not None for k in ("aws", "azure", "gcp") ): raise NotImplementedError( "Read from cloud storage" ) # pragma: no cover; no test yet + if any(p.startswith("https://") for p in self.paths): + raise NotImplementedError("Read from https") if self.typ == "csv": if self.reader_options["skip_rows_after_header"] != 0: raise NotImplementedError("Skipping rows after header in CSV reader") @@ -242,13 +250,21 @@ def __post_init__(self) -> None: raise NotImplementedError( "ignore_errors is not supported in the JSON reader" ) + elif ( + self.typ == "parquet" + and self.row_index is not None + and self.with_columns is not None + and len(self.with_columns) == 0 + ): + raise NotImplementedError( + "Reading only parquet metadata to produce row index." 
+ ) def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" - options = self.file_options - with_columns = options.with_columns - row_index = options.row_index - nrows = self.file_options.n_rows if self.file_options.n_rows is not None else -1 + with_columns = self.with_columns + row_index = self.row_index + n_rows = self.n_rows if self.typ == "csv": parse_options = self.reader_options["parse_options"] sep = chr(parse_options["separator"]) @@ -256,7 +272,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: eol = chr(parse_options["eol_char"]) if self.reader_options["schema"] is not None: # Reader schema provides names - column_names = list(self.reader_options["schema"]["inner"].keys()) + column_names = list(self.reader_options["schema"]["fields"].keys()) else: # file provides column names column_names = None @@ -282,6 +298,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: # polars skips blank lines at the beginning of the file pieces = [] + read_partial = n_rows != -1 for p in self.paths: skiprows = self.reader_options["skip_rows"] path = Path(p) @@ -303,9 +320,13 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: comment=comment, decimal=decimal, dtypes=self.schema, - nrows=nrows, + nrows=n_rows, ) pieces.append(tbl_w_meta) + if read_partial: + n_rows -= tbl_w_meta.tbl.num_rows() + if n_rows <= 0: + break tables, colnames = zip( *( (piece.tbl, piece.column_names(include_children=False)) @@ -321,7 +342,8 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: tbl_w_meta = plc.io.parquet.read_parquet( plc.io.SourceInfo(self.paths), columns=with_columns, - nrows=nrows, + nrows=n_rows, + skip_rows=self.skip_rows, ) df = DataFrame.from_table( tbl_w_meta.tbl, @@ -354,12 +376,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: raise NotImplementedError( f"Unhandled scan type: {self.typ}" ) # pragma: no cover; post init trips first - if ( - row_index is not None - # TODO: remove condition when dropping support for polars 1.0 - # https://github.com/pola-rs/polars/pull/17363 - and row_index[0] in self.schema - ): + if row_index is not None: name, offset = row_index dtype = self.schema[name] step = plc.interop.from_arrow( @@ -481,36 +498,6 @@ def evaluate( return DataFrame(columns) -def placeholder_column(n: int) -> plc.Column: - """ - Produce a placeholder pylibcudf column with NO BACKING DATA. - - Parameters - ---------- - n - Number of rows the column will advertise - - Returns - ------- - pylibcudf Column that is almost unusable. DO NOT ACCESS THE DATA BUFFER. - - Notes - ----- - This is used to avoid allocating data for count aggregations. 
- """ - return plc.Column( - plc.DataType(plc.TypeId.INT8), - n, - plc.gpumemoryview( - types.SimpleNamespace(__cuda_array_interface__={"data": (1, True)}) - ), - None, - 0, - 0, - [], - ) - - @dataclasses.dataclass class GroupBy(IR): """Perform a groupby.""" @@ -557,8 +544,7 @@ def check_agg(agg: expr.Expr) -> int: def __post_init__(self) -> None: """Check whether all the aggregations are implemented.""" - if self.options.rolling is None and self.maintain_order: - raise NotImplementedError("Maintaining order in groupby") + super().__post_init__() if self.options.rolling: raise NotImplementedError( "rolling window/groupby" @@ -566,6 +552,8 @@ def __post_init__(self) -> None: if any(GroupBy.check_agg(a.value) > 1 for a in self.agg_requests): raise NotImplementedError("Nested aggregations in groupby") self.agg_infos = [req.collect_agg(depth=0) for req in self.agg_requests] + if len(self.keys) == 0: + raise NotImplementedError("dynamic groupby") def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" @@ -591,7 +579,10 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: for info in self.agg_infos: for pre_eval, req, rep in info.requests: if pre_eval is None: - col = placeholder_column(df.num_rows) + # A count aggregation, doesn't touch the column, + # but we need to have one. Rather than evaluating + # one, just use one of the key columns. + col = keys[0].obj else: col = pre_eval.evaluate(df).obj requests.append(plc.groupby.GroupByRequest(col, [req])) @@ -611,7 +602,34 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: results = [ req.evaluate(result_subs, mapping=mapping) for req in self.agg_requests ] - return DataFrame(broadcast(*result_keys, *results)).slice(self.options.slice) + broadcasted = broadcast(*result_keys, *results) + result_keys = broadcasted[: len(result_keys)] + results = broadcasted[len(result_keys) :] + # Handle order preservation of groups + # like cudf classic does + # https://github.com/rapidsai/cudf/blob/5780c4d8fb5afac2e04988a2ff5531f94c22d3a3/python/cudf/cudf/core/groupby/groupby.py#L723-L743 + if self.maintain_order and not sorted: + left = plc.stream_compaction.stable_distinct( + plc.Table([k.obj for k in keys]), + list(range(group_keys.num_columns())), + plc.stream_compaction.DuplicateKeepOption.KEEP_FIRST, + plc.types.NullEquality.EQUAL, + plc.types.NanEquality.ALL_EQUAL, + ) + right = plc.Table([key.obj for key in result_keys]) + _, indices = plc.join.left_join(left, right, plc.types.NullEquality.EQUAL) + ordered_table = plc.copying.gather( + plc.Table([col.obj for col in broadcasted]), + indices, + plc.copying.OutOfBoundsPolicy.DONT_CHECK, + ) + broadcasted = [ + NamedColumn(reordered, b.name) + for reordered, b in zip( + ordered_table.columns(), broadcasted, strict=True + ) + ] + return DataFrame(broadcasted).slice(self.options.slice) @dataclasses.dataclass @@ -627,7 +645,7 @@ class Join(IR): right_on: list[expr.NamedExpr] """List of expressions used as keys in the right frame.""" options: tuple[ - Literal["inner", "left", "full", "leftsemi", "leftanti", "cross"], + Literal["inner", "left", "right", "full", "leftsemi", "leftanti", "cross"], bool, tuple[int, int] | None, str | None, @@ -644,6 +662,7 @@ class Join(IR): def __post_init__(self) -> None: """Validate preconditions.""" + super().__post_init__() if any( isinstance(e.value, expr.Literal) for e in itertools.chain(self.left_on, self.right_on) @@ -653,7 +672,7 @@ def __post_init__(self) -> None: 
@staticmethod @cache def _joiners( - how: Literal["inner", "left", "full", "leftsemi", "leftanti"], + how: Literal["inner", "left", "right", "full", "leftsemi", "leftanti"], ) -> tuple[ Callable, plc.copying.OutOfBoundsPolicy, plc.copying.OutOfBoundsPolicy | None ]: @@ -663,7 +682,7 @@ def _joiners( plc.copying.OutOfBoundsPolicy.DONT_CHECK, plc.copying.OutOfBoundsPolicy.DONT_CHECK, ) - elif how == "left": + elif how == "left" or how == "right": return ( plc.join.left_join, plc.copying.OutOfBoundsPolicy.DONT_CHECK, @@ -687,8 +706,7 @@ def _joiners( plc.copying.OutOfBoundsPolicy.DONT_CHECK, None, ) - else: - assert_never(how) + assert_never(how) def _reorder_maps( self, @@ -786,8 +804,12 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: table = plc.copying.gather(left.table, lg, left_policy) result = DataFrame.from_table(table, left.column_names) else: + if how == "right": + # Right join is a left join with the tables swapped + left, right = right, left + left_on, right_on = right_on, left_on lg, rg = join_fn(left_on.table, right_on.table, null_equality) - if how == "left": + if how == "left" or how == "right": # Order of left table is preserved lg, rg = self._reorder_maps( left.num_rows, lg, left_policy, right.num_rows, rg, right_policy @@ -815,6 +837,9 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: ) ) right = right.discard_columns(right_on.column_names_set) + if how == "right": + # Undo the swap for right join before gluing together. + left, right = right, left right = right.rename_columns( { name: f"{name}{suffix}" @@ -1065,11 +1090,13 @@ class MapFunction(IR): # "merge_sorted", "rename", "explode", + "unpivot", ] ) def __post_init__(self) -> None: """Validate preconditions.""" + super().__post_init__() if self.name not in MapFunction._NAMES: raise NotImplementedError(f"Unhandled map function {self.name}") if self.name == "explode": @@ -1086,6 +1113,22 @@ def __post_init__(self) -> None: set(new) & (set(self.df.schema.keys() - set(old))) ): raise NotImplementedError("Duplicate new names in rename.") + elif self.name == "unpivot": + indices, pivotees, variable_name, value_name = self.options + value_name = "value" if value_name is None else value_name + variable_name = "variable" if variable_name is None else variable_name + if len(pivotees) == 0: + index = frozenset(indices) + pivotees = [name for name in self.df.schema if name not in index] + if not all( + dtypes.can_cast(self.df.schema[p], self.schema[value_name]) + for p in pivotees + ): + raise NotImplementedError( + "Unpivot cannot cast all input columns to " + f"{self.schema[value_name].id()}" + ) + self.options = (indices, pivotees, variable_name, value_name) def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" @@ -1107,6 +1150,41 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: return DataFrame.from_table( plc.lists.explode_outer(df.table, index), df.column_names ).sorted_like(df, subset=subset) + elif self.name == "unpivot": + indices, pivotees, variable_name, value_name = self.options + npiv = len(pivotees) + df = self.df.evaluate(cache=cache) + index_columns = [ + NamedColumn(col, name) + for col, name in zip( + plc.reshape.tile(df.select(indices).table, npiv).columns(), + indices, + strict=True, + ) + ] + (variable_column,) = plc.filling.repeat( + plc.Table( + [ + plc.interop.from_arrow( + pa.array( + pivotees, + type=plc.interop.to_arrow(self.schema[variable_name]), + ), + ) + ] 
+ ), + df.num_rows, + ).columns() + value_column = plc.concatenate.concatenate( + [c.astype(self.schema[value_name]) for c in df.select(pivotees).columns] + ) + return DataFrame( + [ + *index_columns, + NamedColumn(variable_column, variable_name), + NamedColumn(value_column, value_name), + ] + ) else: raise AssertionError("Should never be reached") # pragma: no cover @@ -1122,6 +1200,7 @@ class Union(IR): def __post_init__(self) -> None: """Validate preconditions.""" + super().__post_init__() schema = self.dfs[0].schema if not all(s.schema == schema for s in self.dfs[1:]): raise NotImplementedError("Schema mismatch") diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py index 6dc97c7cb51..45881afe0c8 100644 --- a/python/cudf_polars/cudf_polars/dsl/translate.py +++ b/python/cudf_polars/cudf_polars/dsl/translate.py @@ -75,13 +75,12 @@ def _translate_ir( def _( node: pl_ir.PythonScan, visitor: NodeTraverser, schema: dict[str, plc.DataType] ) -> ir.IR: - return ir.PythonScan( - schema, - node.options, - translate_named_expr(visitor, n=node.predicate) - if node.predicate is not None - else None, + scan_fn, with_columns, source_type, predicate, nrows = node.options + options = (scan_fn, with_columns, source_type, nrows) + predicate = ( + translate_named_expr(visitor, n=predicate) if predicate is not None else None ) + return ir.PythonScan(schema, options, predicate) @_translate_ir.register @@ -94,13 +93,35 @@ def _( cloud_options = None else: reader_options, cloud_options = map(json.loads, options) + if ( + typ == "csv" + and visitor.version()[0] == 1 + and reader_options["schema"] is not None + ): + reader_options["schema"] = { + "fields": reader_options["schema"]["inner"] + } # pragma: no cover; CI tests 1.7 + file_options = node.file_options + with_columns = file_options.with_columns + n_rows = file_options.n_rows + if n_rows is None: + n_rows = -1 # All rows + skip_rows = 0 # Don't skip + else: + # TODO: with versioning, rename on the rust side + skip_rows, n_rows = n_rows + + row_index = file_options.row_index return ir.Scan( schema, typ, reader_options, cloud_options, node.paths, - node.file_options, + with_columns, + skip_rows, + n_rows, + row_index, translate_named_expr(visitor, n=node.predicate) if node.predicate is not None else None, @@ -293,10 +314,28 @@ def translate_ir(visitor: NodeTraverser, *, n: int | None = None) -> ir.IR: ctx: AbstractContextManager[None] = ( set_node(visitor, n) if n is not None else noop_context ) + # IR is versioned with major.minor, minor is bumped for backwards + # compatible changes (e.g. adding new nodes), major is bumped for + # incompatible changes (e.g. renaming nodes). + # Polars 1.7 changes definition of the CSV reader options schema name. + if (version := visitor.version()) >= (3, 0): + raise NotImplementedError( + f"No support for polars IR {version=}" + ) # pragma: no cover; no such version for now. + with ctx: + polars_schema = visitor.get_schema() node = visitor.view_current_node() - schema = {k: dtypes.from_polars(v) for k, v in visitor.get_schema().items()} - return _translate_ir(node, visitor, schema) + schema = {k: dtypes.from_polars(v) for k, v in polars_schema.items()} + result = _translate_ir(node, visitor, schema) + if any( + isinstance(dtype, pl.Null) + for dtype in pl.datatypes.unpack_dtypes(*polars_schema.values()) + ): + raise NotImplementedError( + f"No GPU support for {result} with Null column dtype." 
+ ) + return result def translate_named_expr( @@ -345,6 +384,24 @@ def _(node: pl_expr.Function, visitor: NodeTraverser, dtype: plc.DataType) -> ex name, *options = node.function_data options = tuple(options) if isinstance(name, pl_expr.StringFunction): + if name in { + pl_expr.StringFunction.StripChars, + pl_expr.StringFunction.StripCharsStart, + pl_expr.StringFunction.StripCharsEnd, + }: + column, chars = (translate_expr(visitor, n=n) for n in node.input) + if isinstance(chars, expr.Literal): + if chars.value == pa.scalar(""): + # No-op in polars, but libcudf uses empty string + # as signifier to remove whitespace. + return column + elif chars.value == pa.scalar(None): + # Polars uses None to mean "strip all whitespace" + chars = expr.Literal( + column.dtype, + pa.scalar("", type=plc.interop.to_arrow(column.dtype)), + ) + return expr.StringFunction(dtype, name, options, column, chars) return expr.StringFunction( dtype, name, @@ -369,19 +426,43 @@ def _(node: pl_expr.Function, visitor: NodeTraverser, dtype: plc.DataType) -> ex *(translate_expr(visitor, n=n) for n in node.input), ) elif isinstance(name, pl_expr.TemporalFunction): - return expr.TemporalFunction( + # functions for which evaluation of the expression may not return + # the same dtype as polars, either due to libcudf returning a different + # dtype, or due to our internal processing affecting what libcudf returns + needs_cast = { + pl_expr.TemporalFunction.Year, + pl_expr.TemporalFunction.Month, + pl_expr.TemporalFunction.Day, + pl_expr.TemporalFunction.WeekDay, + pl_expr.TemporalFunction.Hour, + pl_expr.TemporalFunction.Minute, + pl_expr.TemporalFunction.Second, + pl_expr.TemporalFunction.Millisecond, + } + result_expr = expr.TemporalFunction( dtype, name, options, *(translate_expr(visitor, n=n) for n in node.input), ) + if name in needs_cast: + return expr.Cast(dtype, result_expr) + return result_expr + elif isinstance(name, str): - return expr.UnaryFunction( - dtype, - name, - options, - *(translate_expr(visitor, n=n) for n in node.input), - ) + children = (translate_expr(visitor, n=n) for n in node.input) + if name == "log": + (base,) = options + (child,) = children + return expr.BinOp( + dtype, + plc.binaryop.BinaryOperator.LOG_BASE, + child, + expr.Literal(dtype, pa.scalar(base, type=plc.interop.to_arrow(dtype))), + ) + elif name == "pow": + return expr.BinOp(dtype, plc.binaryop.BinaryOperator.POW, *children) + return expr.UnaryFunction(dtype, name, options, *children) raise NotImplementedError( f"No handler for Expr function node with {name=}" ) # pragma: no cover; polars raises on the rust side for now diff --git a/python/cudf_polars/cudf_polars/testing/asserts.py b/python/cudf_polars/cudf_polars/testing/asserts.py index d37c96a15de..a79d45899cd 100644 --- a/python/cudf_polars/cudf_polars/testing/asserts.py +++ b/python/cudf_polars/cudf_polars/testing/asserts.py @@ -5,12 +5,11 @@ from __future__ import annotations -from functools import partial from typing import TYPE_CHECKING +from polars import GPUEngine from polars.testing.asserts import assert_frame_equal -from cudf_polars.callback import execute_with_cudf from cudf_polars.dsl.translate import translate_ir if TYPE_CHECKING: @@ -77,21 +76,13 @@ def assert_gpu_result_equal( NotImplementedError If GPU collection failed in some way. 
""" - if collect_kwargs is None: - collect_kwargs = {} - final_polars_collect_kwargs = collect_kwargs.copy() - final_cudf_collect_kwargs = collect_kwargs.copy() - if polars_collect_kwargs is not None: - final_polars_collect_kwargs.update(polars_collect_kwargs) - if cudf_collect_kwargs is not None: # pragma: no cover - # exclude from coverage since not used ATM - # but this is probably still useful - final_cudf_collect_kwargs.update(cudf_collect_kwargs) - expect = lazydf.collect(**final_polars_collect_kwargs) - got = lazydf.collect( - **final_cudf_collect_kwargs, - post_opt_callback=partial(execute_with_cudf, raise_on_fail=True), + final_polars_collect_kwargs, final_cudf_collect_kwargs = _process_kwargs( + collect_kwargs, polars_collect_kwargs, cudf_collect_kwargs ) + + expect = lazydf.collect(**final_polars_collect_kwargs) + engine = GPUEngine(raise_on_fail=True) + got = lazydf.collect(**final_cudf_collect_kwargs, engine=engine) assert_frame_equal( expect, got, @@ -134,3 +125,94 @@ def assert_ir_translation_raises(q: pl.LazyFrame, *exceptions: type[Exception]) raise AssertionError(f"Translation DID NOT RAISE {exceptions}") from e else: raise AssertionError(f"Translation DID NOT RAISE {exceptions}") + + +def _process_kwargs( + collect_kwargs: dict[OptimizationArgs, bool] | None, + polars_collect_kwargs: dict[OptimizationArgs, bool] | None, + cudf_collect_kwargs: dict[OptimizationArgs, bool] | None, +) -> tuple[dict[OptimizationArgs, bool], dict[OptimizationArgs, bool]]: + if collect_kwargs is None: + collect_kwargs = {} + final_polars_collect_kwargs = collect_kwargs.copy() + final_cudf_collect_kwargs = collect_kwargs.copy() + if polars_collect_kwargs is not None: # pragma: no cover; not currently used + final_polars_collect_kwargs.update(polars_collect_kwargs) + if cudf_collect_kwargs is not None: # pragma: no cover; not currently used + final_cudf_collect_kwargs.update(cudf_collect_kwargs) + return final_polars_collect_kwargs, final_cudf_collect_kwargs + + +def assert_collect_raises( + lazydf: pl.LazyFrame, + *, + polars_except: type[Exception] | tuple[type[Exception], ...], + cudf_except: type[Exception] | tuple[type[Exception], ...], + collect_kwargs: dict[OptimizationArgs, bool] | None = None, + polars_collect_kwargs: dict[OptimizationArgs, bool] | None = None, + cudf_collect_kwargs: dict[OptimizationArgs, bool] | None = None, +): + """ + Assert that collecting the result of a query raises the expected exceptions. + + Parameters + ---------- + lazydf + frame to collect. + collect_kwargs + Common keyword arguments to pass to collect for both polars CPU and + cudf-polars. + Useful for controlling optimization settings. + polars_except + Exception or exceptions polars CPU is expected to raise. + cudf_except + Exception or exceptions polars GPU is expected to raise. + collect_kwargs + Common keyword arguments to pass to collect for both polars CPU and + cudf-polars. + Useful for controlling optimization settings. + polars_collect_kwargs + Keyword arguments to pass to collect for execution on polars CPU. + Overrides kwargs in collect_kwargs. + Useful for controlling optimization settings. + cudf_collect_kwargs + Keyword arguments to pass to collect for execution on cudf-polars. + Overrides kwargs in collect_kwargs. + Useful for controlling optimization settings. + + Returns + ------- + None + If both sides raise the expected exceptions. + + Raises + ------ + AssertionError + If either side did not raise the expected exceptions. 
+ """ + final_polars_collect_kwargs, final_cudf_collect_kwargs = _process_kwargs( + collect_kwargs, polars_collect_kwargs, cudf_collect_kwargs + ) + + try: + lazydf.collect(**final_polars_collect_kwargs) + except polars_except: + pass + except Exception as e: + raise AssertionError( + f"CPU execution RAISED {type(e)}, EXPECTED {polars_except}" + ) from e + else: + raise AssertionError(f"CPU execution DID NOT RAISE {polars_except}") + + engine = GPUEngine(raise_on_fail=True) + try: + lazydf.collect(**final_cudf_collect_kwargs, engine=engine) + except cudf_except: + pass + except Exception as e: + raise AssertionError( + f"GPU execution RAISED {type(e)}, EXPECTED {polars_except}" + ) from e + else: + raise AssertionError(f"GPU execution DID NOT RAISE {polars_except}") diff --git a/python/cudf_polars/cudf_polars/testing/plugin.py b/python/cudf_polars/cudf_polars/testing/plugin.py new file mode 100644 index 00000000000..c40d59e6d33 --- /dev/null +++ b/python/cudf_polars/cudf_polars/testing/plugin.py @@ -0,0 +1,154 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 + +"""Plugin for running polars test suite setting GPU engine as default.""" + +from __future__ import annotations + +from functools import partialmethod +from typing import TYPE_CHECKING + +import pytest + +import polars + +if TYPE_CHECKING: + from collections.abc import Mapping + + +def pytest_addoption(parser: pytest.Parser): + """Add plugin-specific options.""" + group = parser.getgroup( + "cudf-polars", "Plugin to set GPU as default engine for polars tests" + ) + group.addoption( + "--cudf-polars-no-fallback", + action="store_true", + help="Turn off fallback to CPU when running tests (default use fallback)", + ) + + +def pytest_configure(config: pytest.Config): + """Enable use of this module as a pytest plugin to enable GPU collection.""" + no_fallback = config.getoption("--cudf-polars-no-fallback") + collect = polars.LazyFrame.collect + engine = polars.GPUEngine(raise_on_fail=no_fallback) + polars.LazyFrame.collect = partialmethod(collect, engine=engine) + config.addinivalue_line( + "filterwarnings", + "ignore:.*GPU engine does not support streaming or background collection", + ) + config.addinivalue_line( + "filterwarnings", + "ignore:.*Query execution with GPU not supported", + ) + + +EXPECTED_FAILURES: Mapping[str, str] = { + "tests/unit/io/test_csv.py::test_compressed_csv": "Need to determine if file is compressed", + "tests/unit/io/test_csv.py::test_read_csv_only_loads_selected_columns": "Memory usage won't be correct due to GPU", + "tests/unit/io/test_lazy_count_star.py::test_count_compressed_csv_18057": "Need to determine if file is compressed", + "tests/unit/io/test_lazy_csv.py::test_scan_csv_slice_offset_zero": "Integer overflow in sliced read", + "tests/unit/io/test_lazy_parquet.py::test_parquet_is_in_statistics": "Debug output on stderr doesn't match", + "tests/unit/io/test_lazy_parquet.py::test_parquet_statistics": "Debug output on stderr doesn't match", + "tests/unit/io/test_lazy_parquet.py::test_parquet_different_schema[False]": "Needs cudf#16394", + "tests/unit/io/test_lazy_parquet.py::test_parquet_schema_mismatch_panic_17067[False]": "Needs cudf#16394", + "tests/unit/io/test_lazy_parquet.py::test_parquet_slice_pushdown_non_zero_offset[False]": "Thrift data not handled correctly/slice pushdown wrong?", + "tests/unit/io/test_parquet.py::test_read_parquet_only_loads_selected_columns_15098": "Memory usage won't be correct due to GPU", + 
"tests/unit/io/test_scan.py::test_scan[single-csv-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_limit[single-csv-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_filter[single-csv-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_filter_and_limit[single-csv-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_limit_and_filter[single-csv-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_row_index_and_limit[single-csv-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_row_index_and_filter[single-csv-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_row_index_limit_and_filter[single-csv-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan[glob-csv-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_limit[glob-csv-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_filter[glob-csv-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_filter_and_limit[glob-csv-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_limit_and_filter[glob-csv-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_row_index_and_limit[glob-csv-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_row_index_and_filter[glob-csv-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_row_index_limit_and_filter[glob-csv-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan[glob-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_limit[glob-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_filter[glob-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_filter_and_limit[glob-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_limit_and_filter[glob-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_row_index_and_limit[glob-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_row_index_and_filter[glob-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_row_index_limit_and_filter[glob-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_row_index_projected_out[glob-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_row_index_filter_and_limit[glob-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan[single-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_limit[single-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_filter[single-parquet-async]": "Debug output on stderr doesn't match", + 
"tests/unit/io/test_scan.py::test_scan_with_filter_and_limit[single-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_limit_and_filter[single-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_row_index_and_limit[single-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_row_index_and_filter[single-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_row_index_limit_and_filter[single-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_row_index_projected_out[single-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_row_index_filter_and_limit[single-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_include_file_name[False-scan_parquet-write_parquet]": "Need to add include_file_path to IR", + "tests/unit/io/test_scan.py::test_scan_include_file_name[False-scan_csv-write_csv]": "Need to add include_file_path to IR", + "tests/unit/io/test_scan.py::test_scan_include_file_name[False-scan_ndjson-write_ndjson]": "Need to add include_file_path to IR", + "tests/unit/lazyframe/test_engine_selection.py::test_engine_import_error_raises[gpu]": "Expect this to pass because cudf-polars is installed", + "tests/unit/lazyframe/test_engine_selection.py::test_engine_import_error_raises[engine1]": "Expect this to pass because cudf-polars is installed", + "tests/unit/lazyframe/test_lazyframe.py::test_round[dtype1-123.55-1-123.6]": "Rounding midpoints is handled incorrectly", + "tests/unit/lazyframe/test_lazyframe.py::test_cast_frame": "Casting that raises not supported on GPU", + "tests/unit/lazyframe/test_lazyframe.py::test_lazy_cache_hit": "Debug output on stderr doesn't match", + "tests/unit/operations/aggregation/test_aggregations.py::test_duration_function_literal": "Broadcasting inside groupby-agg not supported", + "tests/unit/operations/aggregation/test_aggregations.py::test_sum_empty_and_null_set": "libcudf sums column of all nulls to null, not zero", + "tests/unit/operations/aggregation/test_aggregations.py::test_binary_op_agg_context_no_simplify_expr_12423": "groupby-agg of just literals should not produce collect_list", + "tests/unit/operations/aggregation/test_aggregations.py::test_nan_inf_aggregation": "treatment of nans and nulls together is different in libcudf and polars in groupby-agg context", + "tests/unit/operations/test_abs.py::test_abs_duration": "Need to raise for unsupported uops on timelike values", + "tests/unit/operations/test_group_by.py::test_group_by_mean_by_dtype[input7-expected7-Float32-Float32]": "Mismatching dtypes, needs cudf#15852", + "tests/unit/operations/test_group_by.py::test_group_by_mean_by_dtype[input10-expected10-Date-output_dtype10]": "Unsupported groupby-agg for a particular dtype", + "tests/unit/operations/test_group_by.py::test_group_by_mean_by_dtype[input11-expected11-input_dtype11-output_dtype11]": "Unsupported groupby-agg for a particular dtype", + "tests/unit/operations/test_group_by.py::test_group_by_mean_by_dtype[input12-expected12-input_dtype12-output_dtype12]": "Unsupported groupby-agg for a particular dtype", + "tests/unit/operations/test_group_by.py::test_group_by_mean_by_dtype[input13-expected13-input_dtype13-output_dtype13]": "Unsupported groupby-agg for a particular dtype", + 
"tests/unit/operations/test_group_by.py::test_group_by_median_by_dtype[input7-expected7-Float32-Float32]": "Mismatching dtypes, needs cudf#15852", + "tests/unit/operations/test_group_by.py::test_group_by_median_by_dtype[input10-expected10-Date-output_dtype10]": "Unsupported groupby-agg for a particular dtype", + "tests/unit/operations/test_group_by.py::test_group_by_median_by_dtype[input11-expected11-input_dtype11-output_dtype11]": "Unsupported groupby-agg for a particular dtype", + "tests/unit/operations/test_group_by.py::test_group_by_median_by_dtype[input12-expected12-input_dtype12-output_dtype12]": "Unsupported groupby-agg for a particular dtype", + "tests/unit/operations/test_group_by.py::test_group_by_median_by_dtype[input13-expected13-input_dtype13-output_dtype13]": "Unsupported groupby-agg for a particular dtype", + "tests/unit/operations/test_group_by.py::test_group_by_median_by_dtype[input14-expected14-input_dtype14-output_dtype14]": "Unsupported groupby-agg for a particular dtype", + "tests/unit/operations/test_group_by.py::test_group_by_median_by_dtype[input15-expected15-input_dtype15-output_dtype15]": "Unsupported groupby-agg for a particular dtype", + "tests/unit/operations/test_group_by.py::test_group_by_median_by_dtype[input16-expected16-input_dtype16-output_dtype16]": "Unsupported groupby-agg for a particular dtype", + "tests/unit/operations/test_group_by.py::test_group_by_binary_agg_with_literal": "Incorrect broadcasting of literals in groupby-agg", + "tests/unit/operations/test_group_by.py::test_aggregated_scalar_elementwise_15602": "Unsupported boolean function/dtype combination in groupby-agg", + "tests/unit/operations/test_group_by.py::test_schemas[data1-expr1-expected_select1-expected_gb1]": "Mismatching dtypes, needs cudf#15852", + "tests/unit/operations/test_group_by_dynamic.py::test_group_by_dynamic_by_monday_and_offset_5444": "IR needs to expose groupby-dynamic information", + "tests/unit/operations/test_group_by_dynamic.py::test_group_by_dynamic_label[left-expected0]": "IR needs to expose groupby-dynamic information", + "tests/unit/operations/test_group_by_dynamic.py::test_group_by_dynamic_label[right-expected1]": "IR needs to expose groupby-dynamic information", + "tests/unit/operations/test_group_by_dynamic.py::test_group_by_dynamic_label[datapoint-expected2]": "IR needs to expose groupby-dynamic information", + "tests/unit/operations/test_group_by_dynamic.py::test_rolling_dynamic_sortedness_check": "IR needs to expose groupby-dynamic information", + "tests/unit/operations/test_group_by_dynamic.py::test_group_by_dynamic_validation": "IR needs to expose groupby-dynamic information", + "tests/unit/operations/test_group_by_dynamic.py::test_group_by_dynamic_15225": "IR needs to expose groupby-dynamic information", + "tests/unit/operations/test_join.py::test_cross_join_slice_pushdown": "Need to implement slice pushdown for cross joins", + "tests/unit/sql/test_cast.py::test_cast_errors[values0-values::uint8-conversion from `f64` to `u64` failed]": "Casting that raises not supported on GPU", + "tests/unit/sql/test_cast.py::test_cast_errors[values1-values::uint4-conversion from `i64` to `u32` failed]": "Casting that raises not supported on GPU", + "tests/unit/sql/test_cast.py::test_cast_errors[values2-values::int1-conversion from `i64` to `i8` failed]": "Casting that raises not supported on GPU", + "tests/unit/sql/test_miscellaneous.py::test_read_csv": "Incorrect handling of missing_is_null in read_csv", + 
"tests/unit/sql/test_wildcard_opts.py::test_select_wildcard_errors": "Raises correctly but with different exception", + "tests/unit/streaming/test_streaming_io.py::test_parquet_eq_statistics": "Debug output on stderr doesn't match", + "tests/unit/test_cse.py::test_cse_predicate_self_join": "Debug output on stderr doesn't match", + "tests/unit/test_empty.py::test_empty_9137": "Mismatching dtypes, needs cudf#15852", + # Maybe flaky, order-dependent? + "tests/unit/test_projections.py::test_schema_full_outer_join_projection_pd_13287": "Order-specific result check, query is correct but in different order", + "tests/unit/test_queries.py::test_group_by_agg_equals_zero_3535": "libcudf sums all nulls to null, not zero", +} + + +def pytest_collection_modifyitems( + session: pytest.Session, config: pytest.Config, items: list[pytest.Item] +): + """Mark known failing tests.""" + if config.getoption("--cudf-polars-no-fallback"): + # Don't xfail tests if running without fallback + return + for item in items: + if item.nodeid in EXPECTED_FAILURES: + item.add_marker(pytest.mark.xfail(reason=EXPECTED_FAILURES[item.nodeid])) diff --git a/python/cudf_polars/cudf_polars/typing/__init__.py b/python/cudf_polars/cudf_polars/typing/__init__.py index adab10bdded..240b11bdf59 100644 --- a/python/cudf_polars/cudf_polars/typing/__init__.py +++ b/python/cudf_polars/cudf_polars/typing/__init__.py @@ -84,6 +84,10 @@ def view_expression(self, n: int) -> Expr: """Convert the given expression to python rep.""" ... + def version(self) -> tuple[int, int]: + """The IR version as `(major, minor)`.""" + ... + def set_udf( self, callback: Callable[[list[str] | None, str | None, int | None], pl.DataFrame], diff --git a/python/cudf_polars/cudf_polars/utils/dtypes.py b/python/cudf_polars/cudf_polars/utils/dtypes.py index 7f6ea1edfd9..4154a404e98 100644 --- a/python/cudf_polars/cudf_polars/utils/dtypes.py +++ b/python/cudf_polars/cudf_polars/utils/dtypes.py @@ -13,7 +13,7 @@ import polars as pl -__all__ = ["from_polars", "downcast_arrow_lists"] +__all__ = ["from_polars", "downcast_arrow_lists", "can_cast"] def downcast_arrow_lists(typ: pa.DataType) -> pa.DataType: @@ -45,6 +45,28 @@ def downcast_arrow_lists(typ: pa.DataType) -> pa.DataType: return typ +def can_cast(from_: plc.DataType, to: plc.DataType) -> bool: + """ + Can we cast (via :func:`~.pylibcudf.unary.cast`) between two datatypes. 
+ + Parameters + ---------- + from_ + Source datatype + to + Target datatype + + Returns + ------- + True if casting is supported, False otherwise + """ + return ( + plc.traits.is_fixed_width(to) + and plc.traits.is_fixed_width(from_) + and plc.unary.is_supported_cast(from_, to) + ) + + @cache def from_polars(dtype: pl.DataType) -> plc.DataType: """ diff --git a/python/cudf_polars/cudf_polars/utils/versions.py b/python/cudf_polars/cudf_polars/utils/versions.py index 9807cffb384..2e6efde968c 100644 --- a/python/cudf_polars/cudf_polars/utils/versions.py +++ b/python/cudf_polars/cudf_polars/utils/versions.py @@ -12,18 +12,11 @@ POLARS_VERSION = parse(__version__) -POLARS_VERSION_GE_10 = POLARS_VERSION >= parse("1.0") -POLARS_VERSION_GE_11 = POLARS_VERSION >= parse("1.1") -POLARS_VERSION_GE_12 = POLARS_VERSION >= parse("1.2") -POLARS_VERSION_GE_121 = POLARS_VERSION >= parse("1.2.1") -POLARS_VERSION_GT_10 = POLARS_VERSION > parse("1.0") -POLARS_VERSION_GT_11 = POLARS_VERSION > parse("1.1") -POLARS_VERSION_GT_12 = POLARS_VERSION > parse("1.2") - -POLARS_VERSION_LE_12 = POLARS_VERSION <= parse("1.2") -POLARS_VERSION_LE_11 = POLARS_VERSION <= parse("1.1") -POLARS_VERSION_LT_12 = POLARS_VERSION < parse("1.2") -POLARS_VERSION_LT_11 = POLARS_VERSION < parse("1.1") - -if POLARS_VERSION < parse("1.0"): # pragma: no cover - raise ImportError("cudf_polars requires py-polars v1.0 or greater.") +POLARS_VERSION_GE_16 = POLARS_VERSION >= parse("1.6") +POLARS_VERSION_GT_16 = POLARS_VERSION > parse("1.6") +POLARS_VERSION_LT_16 = POLARS_VERSION < parse("1.6") + +if POLARS_VERSION_LT_16: + raise ImportError( + "cudf_polars requires py-polars v1.6 or greater." + ) # pragma: no cover diff --git a/python/cudf_polars/docs/overview.md b/python/cudf_polars/docs/overview.md index 6cd36136bf8..bff44af1468 100644 --- a/python/cudf_polars/docs/overview.md +++ b/python/cudf_polars/docs/overview.md @@ -8,15 +8,17 @@ You will need: preferred configuration. Or else, use [rustup](https://www.rust-lang.org/tools/install) 2. A [cudf development - environment](https://github.com/rapidsai/cudf/blob/branch-24.10/CONTRIBUTING.md#setting-up-your-build-environment). + environment](https://github.com/rapidsai/cudf/blob/branch-24.12/CONTRIBUTING.md#setting-up-your-build-environment). The combined devcontainer works, or whatever your favourite approach is. > ![NOTE] These instructions will get simpler as we merge code in. ## Installing polars -We will need to build polars from source. Until things settle down, -live at `HEAD`. +`cudf-polars` works with polars >= 1.6, as long as the internal IR +version doesn't get a major version bump. So `pip install polars>=1.6` +should work. For development, if we are making changes on the polars +side, we will need to build polars from source: ```sh git clone https://github.com/pola-rs/polars @@ -59,7 +61,7 @@ The executor for the polars logical plan lives in the cudf repo, in ```sh cd cudf/python/cudf_polars -uv pip install --no-build-isolation --no-deps -e . +pip install --no-build-isolation --no-deps -e . ``` You should now be able to run the tests in the `cudf_polars` package: @@ -69,16 +71,18 @@ pytest -v tests # Executor design The polars `LazyFrame.collect` functionality offers a -"post-optimization" callback that may be used by a third party library -to replace a node (or more, though we only replace a single node) in the -optimized logical plan with a Python callback that is to deliver the -result of evaluating the plan. This splits the execution of the plan
First, a symbolic phase which translates to our -internal representation (IR). Second, an execution phase which executes -using our IR. - -The translation phase receives the a low-level Rust `NodeTraverse` +The polars `LazyFrame.collect` functionality offers configuration of +the engine to use for collection through the `engine` argument. At a +low level, this provides for configuration of a "post-optimization" +callback that may be used by a third party library to replace a node +(or more, though we only replace a single node) in the optimized +logical plan with a Python callback that is to deliver the result of +evaluating the plan. This splits the execution of the plan into two +phases. First, a symbolic phase which translates to our internal +representation (IR). Second, an execution phase which executes using +our IR. + +The translation phase receives a low-level Rust `NodeTraverser` object which delivers Python representations of the plan nodes (and expressions) one at a time. During translation, we endeavour to raise `NotImplementedError` for any unsupported functionality. This way, if @@ -86,33 +90,60 @@ we can't execute something, we just don't modify the logical plan at all: if we can translate the IR, it is assumed that evaluation will later succeed. -The usage of the cudf-based executor is therefore, at present: +The usage of the cudf-based executor is therefore selected with the +GPU engine: ```python -from cudf_polars.callback import execute_with_cudf +import polars as pl -result = q.collect(post_opt_callback=execute_with_cudf) +result = q.collect(engine="gpu") ``` This should either transparently run on the GPU and deliver a polars dataframe, or else fail (but be handled) and just run the normal CPU -execution. +execution. If `POLARS_VERBOSE` is true, then fallback is logged with a +`PerformanceWarning`. -If you want to fail during translation, set the keyword argument -`raise_on_fail` to `True`: +As well as a string argument, the engine can also be specified with a +polars `GPUEngine` object. This allows passing in additional +configuration. Currently, the public properties are `device`, to +select the device, and `memory_resource`, to select the RMM memory +resource used for allocations during the collection phase. +For example: ```python -from functools import partial -from cudf_polars.callback import execute_with_cudf +import polars as pl -result = q.collect( - post_opt_callback=partial(execute_with_cudf, raise_on_fail=True) -) +result = q.collect(engine=pl.GPUEngine(device=1, memory_resource=mr)) +``` + +This uses device 1 and the given memory resource. Note that the +memory resource provided _must_ be valid for allocations on the +specified device; no checking is performed. + +For debugging purposes, we can also pass undocumented keyword +arguments. At the moment, `raise_on_fail` is supported, which raises +during translation rather than falling back: + +```python + +result = q.collect(engine=pl.GPUEngine(raise_on_fail=True)) ``` This is mostly useful when writing tests, since in that case we want any failures to propagate, rather than falling back to the CPU mode. +## IR versioning + +On the polars side, the `NodeTraverser` object advertises an internal +version (via `NodeTraverser.version()` as a `(major, minor)` tuple). +`minor` version bumps are for backwards compatible changes (e.g. +exposing new nodes), whereas `major` bumps are for incompatible +changes.
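For illustration, a translation-time guard might look like the following minimal sketch. This is not part of this patch: the helper name and the set of supported major versions are hypothetical; the only thing assumed from the patch is that `NodeTraverser.version()` returns a `(major, minor)` tuple as described above.

```python
# Hypothetical sketch of an IR version guard; the helper name and the
# supported set are illustrative only, not part of this patch.
SUPPORTED_MAJOR_VERSIONS = frozenset({1})


def check_ir_version(visitor) -> None:
    # NodeTraverser.version() advertises the IR version as (major, minor).
    major, minor = visitor.version()
    if major not in SUPPORTED_MAJOR_VERSIONS:
        # Refuse to translate incompatible IR; collection then falls back
        # to CPU execution (or raises when raise_on_fail=True is set).
        raise NotImplementedError(
            f"No support for polars IR version {major}.{minor}"
        )
```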
We can therefore attempt to detect the IR version +(independently of the polars version) and dispatch, or error +appropriately. This should be done during IR translation in +`translate.py`. + ## Adding a handler for a new plan node Plan node definitions live in `cudf_polars/dsl/ir.py`, these are @@ -175,7 +206,7 @@ around their pylibcudf counterparts. We have four (in 1. `Scalar` (a wrapper around a pylibcudf `Scalar`) 2. `Column` (a wrapper around a pylibcudf `Column`) -3. `NamedColumn` a `Column` with an additional name +3. `NamedColumn` (a `Column` with an additional name) 4. `DataFrame` (a wrapper around a pylibcudf `Table`) The interfaces offered by these are somewhat in flux, but broadly diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml index 984b5487b98..df70dc5dada 100644 --- a/python/cudf_polars/pyproject.toml +++ b/python/cudf_polars/pyproject.toml @@ -19,8 +19,8 @@ authors = [ license = { text = "Apache 2.0" } requires-python = ">=3.10" dependencies = [ - "polars>=1.0,<1.3", - "pylibcudf==24.10.*,>=0.0.0a0", + "polars>=1.6", + "pylibcudf==24.12.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ "Intended Audience :: Developers", @@ -58,6 +58,9 @@ exclude_also = [ "class .*\\bProtocol\\):", "assert_never\\(" ] +# The cudf_polars test suite doesn't exercise the plugin, so we omit +# it from coverage checks. +omit = ["cudf_polars/testing/plugin.py"] [tool.ruff] line-length = 88 diff --git a/python/cudf_polars/tests/containers/test_dataframe.py b/python/cudf_polars/tests/containers/test_dataframe.py index 6b470268084..39fb44d55a5 100644 --- a/python/cudf_polars/tests/containers/test_dataframe.py +++ b/python/cudf_polars/tests/containers/test_dataframe.py @@ -9,6 +9,7 @@ import polars as pl from cudf_polars.containers import DataFrame, NamedColumn +from cudf_polars.testing.asserts import assert_gpu_result_equal def test_select_missing_raises(): @@ -140,3 +141,13 @@ def test_sorted_flags_preserved(with_nulls, nulls_last): assert b.null_order == b_null_order assert c.is_sorted == plc.types.Sorted.NO assert df.flags == gf.to_polars().flags + + +def test_empty_name_roundtrips_overlap(): + df = pl.LazyFrame({"": [1, 2, 3], "column_0": [4, 5, 6]}) + assert_gpu_result_equal(df) + + +def test_empty_name_roundtrips_no_overlap(): + df = pl.LazyFrame({"": [1, 2, 3], "b": [4, 5, 6]}) + assert_gpu_result_equal(df) diff --git a/python/cudf_polars/tests/expressions/test_agg.py b/python/cudf_polars/tests/expressions/test_agg.py index 245bde3acab..56055f4c6c2 100644 --- a/python/cudf_polars/tests/expressions/test_agg.py +++ b/python/cudf_polars/tests/expressions/test_agg.py @@ -7,15 +7,38 @@ import polars as pl from cudf_polars.dsl import expr -from cudf_polars.testing.asserts import assert_gpu_result_equal +from cudf_polars.testing.asserts import ( + assert_gpu_result_equal, + assert_ir_translation_raises, +) -@pytest.fixture(params=sorted(expr.Agg._SUPPORTED)) +@pytest.fixture( + params=[ + # regular aggs from Agg + "min", + "max", + "median", + "n_unique", + "first", + "last", + "mean", + "sum", + "count", + "std", + "var", + # scan aggs from UnaryFunction + "cum_min", + "cum_max", + "cum_prod", + "cum_sum", + ] +) def agg(request): return request.param -@pytest.fixture(params=[pl.Int32, pl.Float32, pl.Int16]) +@pytest.fixture(params=[pl.Int32, pl.Float32, pl.Int16, pl.Int8, pl.UInt16]) def dtype(request): return request.param @@ 
-34,6 +57,11 @@ def df(dtype, with_nulls, is_sorted): if is_sorted: values = sorted(values, key=lambda x: -1000 if x is None else x) + if dtype.is_unsigned_integer(): + values = pl.Series(values).abs() + if is_sorted: + values = values.sort() + df = pl.LazyFrame({"a": values}, schema={"a": dtype}) if is_sorted: return df.set_sorted("a") @@ -52,6 +80,51 @@ def test_agg(df, agg): assert_gpu_result_equal(q, check_dtypes=check_dtypes, check_exact=False) +def test_bool_agg(agg, request): + if agg == "cum_min" or agg == "cum_max": + pytest.skip("Does not apply") + request.applymarker( + pytest.mark.xfail( + condition=agg == "n_unique", + reason="Wrong dtype we get Int32, polars gets UInt32", + ) + ) + df = pl.LazyFrame({"a": [True, False, None, True]}) + expr = getattr(pl.col("a"), agg)() + q = df.select(expr) + + assert_gpu_result_equal(q) + + +@pytest.mark.parametrize("cum_agg", expr.UnaryFunction._supported_cum_aggs) +def test_cum_agg_reverse_unsupported(cum_agg): + df = pl.LazyFrame({"a": [1, 2, 3]}) + expr = getattr(pl.col("a"), cum_agg)(reverse=True) + q = df.select(expr) + + assert_ir_translation_raises(q, NotImplementedError) + + +@pytest.mark.parametrize("q", [0.5, pl.lit(0.5)]) +@pytest.mark.parametrize("interp", ["nearest", "higher", "lower", "midpoint", "linear"]) +def test_quantile(df, q, interp): + expr = pl.col("a").quantile(q, interp) + q = df.select(expr) + + # https://github.com/rapidsai/cudf/issues/15852 + check_dtypes = q.collect_schema()["a"] == pl.Float64 + if not check_dtypes: + with pytest.raises(AssertionError): + assert_gpu_result_equal(q) + assert_gpu_result_equal(q, check_dtypes=check_dtypes, check_exact=False) + + +def test_quantile_invalid_q(df): + expr = pl.col("a").quantile(pl.col("a")) + q = df.select(expr) + assert_ir_translation_raises(q, NotImplementedError) + + @pytest.mark.parametrize( "op", [pl.Expr.min, pl.Expr.nan_min, pl.Expr.max, pl.Expr.nan_max] ) diff --git a/python/cudf_polars/tests/expressions/test_booleanfunction.py b/python/cudf_polars/tests/expressions/test_booleanfunction.py index 97421008669..2347021c40e 100644 --- a/python/cudf_polars/tests/expressions/test_booleanfunction.py +++ b/python/cudf_polars/tests/expressions/test_booleanfunction.py @@ -17,15 +17,11 @@ def has_nulls(request): return request.param -@pytest.mark.parametrize( - "ignore_nulls", - [ - pytest.param( - False, marks=pytest.mark.xfail(reason="No support for Kleene logic") - ), - True, - ], -) +@pytest.fixture(params=[False, True], ids=["include_nulls", "ignore_nulls"]) +def ignore_nulls(request): + return request.param + + def test_booleanfunction_reduction(ignore_nulls): ldf = pl.LazyFrame( { @@ -43,6 +39,25 @@ def test_booleanfunction_reduction(ignore_nulls): assert_gpu_result_equal(query) +@pytest.mark.parametrize("expr", [pl.Expr.any, pl.Expr.all]) +def test_booleanfunction_all_any_kleene(expr, ignore_nulls): + ldf = pl.LazyFrame( + { + "a": [False, None], + "b": [False, False], + "c": [False, True], + "d": [None, False], + "e": pl.Series([None, None], dtype=pl.Boolean()), + "f": [None, True], + "g": [True, False], + "h": [True, None], + "i": [True, True], + } + ) + q = ldf.select(expr(pl.col("*"), ignore_nulls=ignore_nulls)) + assert_gpu_result_equal(q) + + @pytest.mark.parametrize( "expr", [ @@ -54,14 +69,7 @@ def test_booleanfunction_reduction(ignore_nulls): ids=lambda f: f"{f.__name__}()", ) @pytest.mark.parametrize("has_nans", [False, True], ids=["no_nans", "nans"]) -def test_boolean_function_unary(request, expr, has_nans, has_nulls): - if has_nulls and expr in 
(pl.Expr.is_nan, pl.Expr.is_not_nan): - request.applymarker( - pytest.mark.xfail( - reason="Need to copy null mask since is_{not_}nan(null) => null" - ) - ) - +def test_boolean_function_unary(expr, has_nans, has_nulls): values: list[float | None] = [1, 2, 3, 4, 5] if has_nans: values[3] = float("nan") @@ -119,9 +127,7 @@ def test_boolean_isbetween(closed, bounds): "expr", [pl.any_horizontal("*"), pl.all_horizontal("*")], ids=["any", "all"] ) @pytest.mark.parametrize("wide", [False, True], ids=["narrow", "wide"]) -def test_boolean_horizontal(request, expr, has_nulls, wide): - if has_nulls: - request.applymarker(pytest.mark.xfail(reason="No support for Kleene logic")) +def test_boolean_horizontal(expr, has_nulls, wide): ldf = pl.LazyFrame( { "a": [False, False, False, False, False, True], @@ -164,6 +170,18 @@ def test_boolean_is_in(expr): assert_gpu_result_equal(q) +@pytest.mark.parametrize("expr", [pl.Expr.and_, pl.Expr.or_, pl.Expr.xor]) +def test_boolean_kleene_logic(expr): + ldf = pl.LazyFrame( + { + "a": [False, False, False, None, None, None, True, True, True], + "b": [False, None, True, False, None, True, False, None, True], + } + ) + q = ldf.select(expr(pl.col("a"), pl.col("b"))) + assert_gpu_result_equal(q) + + def test_boolean_is_in_raises_unsupported(): ldf = pl.LazyFrame({"a": pl.Series([1, 2, 3], dtype=pl.Int64)}) q = ldf.select(pl.col("a").is_in(pl.lit(1, dtype=pl.Int32()))) diff --git a/python/cudf_polars/tests/expressions/test_datetime_basic.py b/python/cudf_polars/tests/expressions/test_datetime_basic.py index 218101bf87c..c6ea29ddd38 100644 --- a/python/cudf_polars/tests/expressions/test_datetime_basic.py +++ b/python/cudf_polars/tests/expressions/test_datetime_basic.py @@ -9,7 +9,11 @@ import polars as pl -from cudf_polars.testing.asserts import assert_gpu_result_equal +from cudf_polars.dsl.expr import TemporalFunction +from cudf_polars.testing.asserts import ( + assert_gpu_result_equal, + assert_ir_translation_raises, +) @pytest.mark.parametrize( @@ -37,26 +41,97 @@ def test_datetime_dataframe_scan(dtype): assert_gpu_result_equal(query) +datetime_extract_fields = [ + "year", + "month", + "day", + "weekday", + "hour", + "minute", + "second", + "millisecond", + "microsecond", + "nanosecond", +] + + +@pytest.fixture( + ids=datetime_extract_fields, + params=[methodcaller(f) for f in datetime_extract_fields], +) +def field(request): + return request.param + + +def test_datetime_extract(field): + ldf = pl.LazyFrame( + { + "datetimes": pl.datetime_range( + datetime.datetime(2020, 1, 1), + datetime.datetime(2021, 12, 30), + "3mo14h15s11ms33us999ns", + eager=True, + ) + } + ) + + q = ldf.select(field(pl.col("datetimes").dt)) + + assert_gpu_result_equal(q) + + +def test_datetime_extra_unsupported(monkeypatch): + ldf = pl.LazyFrame( + { + "datetimes": pl.datetime_range( + datetime.datetime(2020, 1, 1), + datetime.datetime(2021, 12, 30), + "3mo14h15s11ms33us999ns", + eager=True, + ) + } + ) + + def unsupported_name_setter(self, value): + pass + + def unsupported_name_getter(self): + return "unsupported" + + monkeypatch.setattr( + TemporalFunction, + "name", + property(unsupported_name_getter, unsupported_name_setter), + ) + + q = ldf.select(pl.col("datetimes").dt.nanosecond()) + + assert_ir_translation_raises(q, NotImplementedError) + + @pytest.mark.parametrize( "field", [ methodcaller("year"), - pytest.param( - methodcaller("day"), - marks=pytest.mark.xfail(reason="day extraction not implemented"), - ), + methodcaller("month"), + methodcaller("day"), + methodcaller("weekday"), ], ) 
-def test_datetime_extract(field): +def test_date_extract(field): ldf = pl.LazyFrame( {"dates": [datetime.date(2024, 1, 1), datetime.date(2024, 10, 11)]} ) - q = ldf.select(field(pl.col("dates").dt)) - with pytest.raises(AssertionError): - # polars produces int32, libcudf produces int16 for the year extraction - # libcudf can lose data here. - # https://github.com/rapidsai/cudf/issues/16196 - assert_gpu_result_equal(q) + q = ldf.select(field(pl.col("dates").dt)) - assert_gpu_result_equal(q, check_dtypes=False) + assert_gpu_result_equal(q) diff --git a/python/cudf_polars/tests/expressions/test_gather.py b/python/cudf_polars/tests/expressions/test_gather.py index 6bffa3e252c..f7c5d1bf2cd 100644 --- a/python/cudf_polars/tests/expressions/test_gather.py +++ b/python/cudf_polars/tests/expressions/test_gather.py @@ -6,7 +6,6 @@ import polars as pl -from cudf_polars import execute_with_cudf from cudf_polars.testing.asserts import assert_gpu_result_equal @@ -47,4 +46,4 @@ def test_gather_out_of_bounds(negative): query = ldf.select(pl.col("a").gather(pl.col("b"))) with pytest.raises(pl.exceptions.ComputeError): - query.collect(post_opt_callback=execute_with_cudf) + query.collect(engine="gpu") diff --git a/python/cudf_polars/tests/expressions/test_numeric_unaryops.py b/python/cudf_polars/tests/expressions/test_numeric_unaryops.py new file mode 100644 index 00000000000..ac3aecf88e6 --- /dev/null +++ b/python/cudf_polars/tests/expressions/test_numeric_unaryops.py @@ -0,0 +1,91 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +import numpy as np +import pytest + +import polars as pl + +from cudf_polars.testing.asserts import assert_gpu_result_equal + + +@pytest.fixture( + params=[ + "sin", + "cos", + "tan", + "arcsin", + "arccos", + "arctan", + "sinh", + "cosh", + "tanh", + "arcsinh", + "arccosh", + "arctanh", + "exp", + "sqrt", + "cbrt", + "ceil", + "floor", + "abs", + ] +) +def op(request): + return request.param + + +@pytest.fixture(params=[pl.Int32, pl.Float32]) +def dtype(request): + return request.param + + +@pytest.fixture +def ldf(with_nulls, dtype): + values = [1, 2, 4, 5, -2, -4, 0] + if with_nulls: + values.append(None) + if dtype == pl.Float32: + values.append(-float("inf")) + values.append(float("nan")) + values.append(float("inf")) + elif dtype == pl.Int32: + iinfo = np.iinfo("int32") + values.append(iinfo.min) + values.append(iinfo.max) + return pl.LazyFrame( + { + "a": pl.Series(values, dtype=dtype), + "b": pl.Series([i - 4 for i in range(len(values))], dtype=pl.Float32), + } + ) + + +def test_unary(ldf, op): + expr = getattr(pl.col("a"), op)() + q = ldf.select(expr) + assert_gpu_result_equal(q, check_exact=False) + + +@pytest.mark.parametrize("base_literal", [False, True]) +@pytest.mark.parametrize("exponent_literal", [False, True]) +def test_pow(ldf, base_literal, exponent_literal): + base = pl.lit(2) if base_literal else pl.col("a") + exponent = pl.lit(-3, dtype=pl.Float32) if exponent_literal else pl.col("b") + + q = ldf.select(base.pow(exponent)) + + assert_gpu_result_equal(q, check_exact=False) + + +@pytest.mark.parametrize("natural", [True, False]) +def test_log(ldf, natural): + if natural: + expr = pl.col("a").log() + else: + expr = pl.col("a").log(10) + + q = ldf.select(expr) + + assert_gpu_result_equal(q, check_exact=False) diff --git
a/python/cudf_polars/tests/expressions/test_stringfunction.py b/python/cudf_polars/tests/expressions/test_stringfunction.py index df08e15baa4..4f6850ac977 100644 --- a/python/cudf_polars/tests/expressions/test_stringfunction.py +++ b/python/cudf_polars/tests/expressions/test_stringfunction.py @@ -10,6 +10,7 @@ from cudf_polars import execute_with_cudf from cudf_polars.testing.asserts import ( + assert_collect_raises, assert_gpu_result_equal, assert_ir_translation_raises, ) @@ -152,3 +153,187 @@ def test_slice_column(slice_column_data): else: query = slice_column_data.select(pl.col("a").str.slice(pl.col("start"))) assert_ir_translation_raises(query, NotImplementedError) + + +@pytest.fixture +def to_datetime_data(): + return pl.LazyFrame( + { + "a": [ + "2021-01-01", + "2021-01-02", + "abcd", + ] + } + ) + + +@pytest.mark.parametrize("cache", [True, False], ids=lambda cache: f"{cache=}") +@pytest.mark.parametrize("strict", [True, False], ids=lambda strict: f"{strict=}") +@pytest.mark.parametrize("exact", [True, False], ids=lambda exact: f"{exact=}") +@pytest.mark.parametrize("format", ["%Y-%m-%d", None], ids=lambda format: f"{format=}") +def test_to_datetime(to_datetime_data, cache, strict, format, exact): + query = to_datetime_data.select( + pl.col("a").str.strptime( + pl.Datetime("ns"), format=format, cache=cache, strict=strict, exact=exact + ) + ) + if cache or format is None or not exact: + assert_ir_translation_raises(query, NotImplementedError) + elif strict: + assert_collect_raises( + query, + polars_except=pl.exceptions.InvalidOperationError, + cudf_except=pl.exceptions.ComputeError, + ) + else: + assert_gpu_result_equal(query) + + +@pytest.mark.parametrize( + "target, repl", + [("a", "a"), ("Wı", "☺"), ("FG", ""), ("doesnotexist", "blahblah")], # noqa: RUF001 +) +@pytest.mark.parametrize("n", [0, 3, -1]) +def test_replace_literal(ldf, target, repl, n): + query = ldf.select(pl.col("a").str.replace(target, repl, literal=True, n=n)) + assert_gpu_result_equal(query) + + +@pytest.mark.parametrize("target, repl", [("", ""), ("a", pl.col("a"))]) +def test_replace_literal_unsupported(ldf, target, repl): + query = ldf.select(pl.col("a").str.replace(target, repl, literal=True)) + assert_ir_translation_raises(query, NotImplementedError) + + +def test_replace_re(ldf): + query = ldf.select(pl.col("a").str.replace("A", "a", literal=False)) + assert_ir_translation_raises(query, NotImplementedError) + + +@pytest.mark.parametrize( + "target,repl", + [ + (["A", "de", "kLm", "awef"], "a"), + (["A", "de", "kLm", "awef"], ""), + (["A", "de", "kLm", "awef"], ["a", "b", "c", "d"]), + (["A", "de", "kLm", "awef"], ["a", "b", "c", ""]), + ( + pl.lit(pl.Series(["A", "de", "kLm", "awef"])), + pl.lit(pl.Series(["a", "b", "c", "d"])), + ), + ], +) +def test_replace_many(ldf, target, repl): + query = ldf.select(pl.col("a").str.replace_many(target, repl)) + + assert_gpu_result_equal(query) + + +@pytest.mark.parametrize( + "target,repl", + [(["A", ""], ["a", "b"]), (pl.col("a").drop_nulls(), pl.col("a").drop_nulls())], +) +def test_replace_many_notimplemented(ldf, target, repl): + query = ldf.select(pl.col("a").str.replace_many(target, repl)) + assert_ir_translation_raises(query, NotImplementedError) + + +def test_replace_many_ascii_case(ldf): + query = ldf.select( + pl.col("a").str.replace_many(["a", "b", "c"], "a", ascii_case_insensitive=True) + ) + + assert_ir_translation_raises(query, NotImplementedError) + + +_strip_data = [ + "AbC", + "123abc", + "", + " ", + None, + "aAaaaAAaa", + " ab c ", + "abc123", + " 
", + "\tabc\t", + "\nabc\n", + "\r\nabc\r\n", + "\t\n abc \n\t", + "!@#$%^&*()", + " abc!!! ", + " abc\t\n!!! ", + "__abc__", + "abc\n\n", + "123abc456", + "abcxyzabc", +] + +strip_chars = [ + "a", + "", + " ", + "\t", + "\n", + "\r\n", + "!", + "@#", + "123", + "xyz", + "abc", + "__", + " \t\n", + "abc123", + None, +] + + +@pytest.fixture +def strip_ldf(): + return pl.DataFrame({"a": _strip_data}).lazy() + + +@pytest.fixture(params=strip_chars) +def to_strip(request): + return request.param + + +def test_strip_chars(strip_ldf, to_strip): + q = strip_ldf.select(pl.col("a").str.strip_chars(to_strip)) + assert_gpu_result_equal(q) + + +def test_strip_chars_start(strip_ldf, to_strip): + q = strip_ldf.select(pl.col("a").str.strip_chars_start(to_strip)) + assert_gpu_result_equal(q) + + +def test_strip_chars_end(strip_ldf, to_strip): + q = strip_ldf.select(pl.col("a").str.strip_chars_end(to_strip)) + assert_gpu_result_equal(q) + + +def test_strip_chars_column(strip_ldf): + q = strip_ldf.select(pl.col("a").str.strip_chars(pl.col("a"))) + assert_ir_translation_raises(q, NotImplementedError) + + +def test_invalid_regex_raises(): + df = pl.LazyFrame({"a": ["abc"]}) + + q = df.select(pl.col("a").str.contains(r"ab)", strict=True)) + + assert_collect_raises( + q, + polars_except=pl.exceptions.ComputeError, + cudf_except=pl.exceptions.ComputeError, + ) + + +@pytest.mark.parametrize("pattern", ["a{1000}", "a(?i:B)"]) +def test_unsupported_regex_raises(pattern): + df = pl.LazyFrame({"a": ["abc"]}) + + q = df.select(pl.col("a").str.contains(pattern, strict=True)) + assert_ir_translation_raises(q, NotImplementedError) diff --git a/python/cudf_polars/tests/pytest.ini b/python/cudf_polars/tests/pytest.ini new file mode 100644 index 00000000000..7b0a9f29fb1 --- /dev/null +++ b/python/cudf_polars/tests/pytest.ini @@ -0,0 +1,4 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +[pytest] +addopts = --tb=native diff --git a/python/cudf_polars/tests/test_config.py b/python/cudf_polars/tests/test_config.py index 5b4bba55552..3c3986be19b 100644 --- a/python/cudf_polars/tests/test_config.py +++ b/python/cudf_polars/tests/test_config.py @@ -6,6 +6,9 @@ import pytest import polars as pl +from polars.testing.asserts import assert_frame_equal + +import rmm from cudf_polars.dsl.ir import IR from cudf_polars.testing.asserts import ( @@ -32,3 +35,48 @@ def raise_unimplemented(self): ): # And ensure that collecting issues the correct warning. 
assert_gpu_result_equal(q) + + +def test_unsupported_config_raises(): + q = pl.LazyFrame({}) + + with pytest.raises(pl.exceptions.ComputeError): + q.collect(engine=pl.GPUEngine(unknown_key=True)) + + +@pytest.mark.parametrize("device", [-1, "foo"]) +def test_invalid_device_raises(device): + q = pl.LazyFrame({}) + with pytest.raises(pl.exceptions.ComputeError): + q.collect(engine=pl.GPUEngine(device=device)) + + +@pytest.mark.parametrize("mr", [1, object()]) +def test_invalid_memory_resource_raises(mr): + q = pl.LazyFrame({}) + with pytest.raises(pl.exceptions.ComputeError): + q.collect(engine=pl.GPUEngine(memory_resource=mr)) + + +def test_explicit_device_zero(): + q = pl.LazyFrame({"a": [1, 2, 3]}) + + result = q.collect(engine=pl.GPUEngine(device=0)) + assert_frame_equal(q.collect(), result) + + +def test_explicit_memory_resource(): + upstream = rmm.mr.CudaMemoryResource() + n_allocations = 0 + + def allocate(bytes, stream): + nonlocal n_allocations + n_allocations += 1 + return upstream.allocate(bytes, stream) + + mr = rmm.mr.CallbackMemoryResource(allocate, upstream.deallocate) + + q = pl.LazyFrame({"a": [1, 2, 3]}) + result = q.collect(engine=pl.GPUEngine(memory_resource=mr)) + assert_frame_equal(q.collect(), result) + assert n_allocations > 0 diff --git a/python/cudf_polars/tests/test_groupby.py b/python/cudf_polars/tests/test_groupby.py index a75825ef3d3..6f996e0e0ec 100644 --- a/python/cudf_polars/tests/test_groupby.py +++ b/python/cudf_polars/tests/test_groupby.py @@ -12,7 +12,6 @@ assert_gpu_result_equal, assert_ir_translation_raises, ) -from cudf_polars.utils import versions @pytest.fixture @@ -31,6 +30,7 @@ def df(): params=[ [pl.col("key1")], [pl.col("key2")], + [pl.col("key1"), pl.lit(1)], [pl.col("key1") * pl.col("key2")], [pl.col("key1"), pl.col("key2")], [pl.col("key1") == pl.col("key2")], @@ -52,6 +52,7 @@ def keys(request): [(pl.col("float") - pl.lit(2)).max()], [pl.col("float").sum().round(decimals=1)], [pl.col("float").round(decimals=1).sum()], + [pl.col("int").first(), pl.col("float").last()], ], ids=lambda aggs: "-".join(map(str, aggs)), ) @@ -60,15 +61,7 @@ def exprs(request): @pytest.fixture( - params=[ - False, - pytest.param( - True, - marks=pytest.mark.xfail( - reason="Maintaining order in groupby not implemented" - ), - ), - ], + params=[False, True], ids=["no_maintain_order", "maintain_order"], ) def maintain_order(request): @@ -98,15 +91,10 @@ def test_groupby_sorted_keys(df: pl.LazyFrame, keys, exprs): # Multiple keys don't do sorting qsorted = q.sort(*sort_keys) if len(keys) > 1: - with pytest.raises(AssertionError): - # https://github.com/pola-rs/polars/issues/17556 - assert_gpu_result_equal(q, check_exact=False) - if versions.POLARS_VERSION_LT_12 and schema[sort_keys[1]] == pl.Boolean(): - # https://github.com/pola-rs/polars/issues/17557 - with pytest.raises(AssertionError): - assert_gpu_result_equal(qsorted, check_exact=False) - else: - assert_gpu_result_equal(qsorted, check_exact=False) + # https://github.com/pola-rs/polars/issues/17556 + # Can't assert that the query without post-sorting fails, + # since it _might_ pass. 
+ assert_gpu_result_equal(qsorted, check_exact=False) elif schema[sort_keys[0]] == pl.Boolean(): # Boolean keys don't do sorting, so we get random order assert_gpu_result_equal(qsorted, check_exact=False) @@ -133,6 +121,21 @@ def test_groupby_unsupported(df, expr): assert_ir_translation_raises(q, NotImplementedError) +def test_groupby_null_keys(maintain_order): + df = pl.LazyFrame( + { + "key": pl.Series([1, float("nan"), 2, None, 2, None], dtype=pl.Float64()), + "value": [-1, 2, 1, 2, 3, 4], + } + ) + + q = df.group_by("key", maintain_order=maintain_order).agg(pl.col("value").min()) + if not maintain_order: + q = q.sort("key") + + assert_gpu_result_equal(q) + + @pytest.mark.xfail(reason="https://github.com/pola-rs/polars/issues/17513") def test_groupby_minmax_with_nan(): df = pl.LazyFrame( @@ -159,15 +162,7 @@ def test_groupby_nan_minmax_raises(op): @pytest.mark.parametrize( "key", - [ - pytest.param( - 1, - marks=pytest.mark.xfail( - versions.POLARS_VERSION_GE_121, reason="polars 1.2.1 disallows this" - ), - ), - pl.col("key1"), - ], + [1, pl.col("key1")], ) @pytest.mark.parametrize( "expr", @@ -183,3 +178,12 @@ def test_groupby_literal_in_agg(df, key, expr): # so just sort by the group key q = df.group_by(key).agg(expr).sort(key, maintain_order=True) assert_gpu_result_equal(q) + + +@pytest.mark.parametrize( + "expr", + [pl.col("int").unique(), pl.col("int").drop_nulls(), pl.col("int").cum_max()], +) +def test_groupby_unary_non_pointwise_raises(df, expr): + q = df.group_by("key1").agg(expr) + assert_ir_translation_raises(q, NotImplementedError) diff --git a/python/cudf_polars/tests/test_groupby_dynamic.py b/python/cudf_polars/tests/test_groupby_dynamic.py new file mode 100644 index 00000000000..38b3ce74ac5 --- /dev/null +++ b/python/cudf_polars/tests/test_groupby_dynamic.py @@ -0,0 +1,29 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +from datetime import datetime + +import polars as pl + +from cudf_polars.testing.asserts import assert_ir_translation_raises + + +def test_groupby_dynamic_raises(): + df = pl.LazyFrame( + { + "dt": [ + datetime(2021, 12, 31, 0, 0, 0), + datetime(2022, 1, 1, 0, 0, 1), + datetime(2022, 3, 31, 0, 0, 1), + datetime(2022, 4, 1, 0, 0, 1), + ] + } + ) + + q = ( + df.sort("dt") + .group_by_dynamic("dt", every="1q") + .agg(pl.col("dt").count().alias("num_values")) + ) + assert_ir_translation_raises(q, NotImplementedError) diff --git a/python/cudf_polars/tests/test_join.py b/python/cudf_polars/tests/test_join.py index 1e880cdc6de..7d9ec98db97 100644 --- a/python/cudf_polars/tests/test_join.py +++ b/python/cudf_polars/tests/test_join.py @@ -17,7 +17,7 @@ def join_nulls(request): return request.param -@pytest.fixture(params=["inner", "left", "semi", "anti", "full"]) +@pytest.fixture(params=["inner", "left", "right", "semi", "anti", "full"]) def how(request): return request.param diff --git a/python/cudf_polars/tests/test_mapfunction.py b/python/cudf_polars/tests/test_mapfunction.py index 77032108e6f..e895f27f637 100644 --- a/python/cudf_polars/tests/test_mapfunction.py +++ b/python/cudf_polars/tests/test_mapfunction.py @@ -61,3 +61,48 @@ def test_rename_columns(mapping): q = df.rename(mapping) assert_gpu_result_equal(q) + + +@pytest.mark.parametrize("index", [None, ["a"], ["d", "a"]]) +@pytest.mark.parametrize("variable_name", [None, "names"]) +@pytest.mark.parametrize("value_name", [None, "unpivoted"]) +def test_unpivot(index, variable_name, value_name): + df = pl.LazyFrame( + { + "a": ["x", "y", "z"], + "b": pl.Series([1, 3, 5], dtype=pl.Int16), + "c": pl.Series([2, 4, 6], dtype=pl.Float32), + "d": ["a", "b", "c"], + } + ) + q = df.unpivot( + ["c", "b"], index=index, variable_name=variable_name, value_name=value_name + ) + + assert_gpu_result_equal(q) + + +def test_unpivot_defaults(): + df = pl.LazyFrame( + { + "a": pl.Series([11, 12, 13], dtype=pl.UInt16), + "b": pl.Series([1, 3, 5], dtype=pl.Int16), + "c": pl.Series([2, 4, 6], dtype=pl.Float32), + "d": ["a", "b", "c"], + } + ) + q = df.unpivot(index="d") + assert_gpu_result_equal(q) + + +def test_unpivot_unsupported_cast_raises(): + df = pl.LazyFrame( + { + "a": ["x", "y", "z"], + "b": pl.Series([1, 3, 5], dtype=pl.Int16), + } + ) + + q = df.unpivot(["a", "b"]) + + assert_ir_translation_raises(q, NotImplementedError) diff --git a/python/cudf_polars/tests/test_python_scan.py b/python/cudf_polars/tests/test_python_scan.py index fd8453b77c4..0cda89474a8 100644 --- a/python/cudf_polars/tests/test_python_scan.py +++ b/python/cudf_polars/tests/test_python_scan.py @@ -8,7 +8,9 @@ def test_python_scan(): - def source(with_columns, predicate, nrows): + def source(with_columns, predicate, nrows, *batch_size): + # PythonScan interface changes between 1.3 and 1.4 to add an + # extra batch_size argument return pl.DataFrame({"a": pl.Series([1, 2, 3], dtype=pl.Int8())}) q = pl.LazyFrame._scan_python_function({"a": pl.Int8}, source, pyarrow=False) diff --git a/python/cudf_polars/tests/test_scan.py b/python/cudf_polars/tests/test_scan.py index 64acbb076ed..792b136acd8 100644 --- a/python/cudf_polars/tests/test_scan.py +++ b/python/cudf_polars/tests/test_scan.py @@ -12,7 +12,6 @@ assert_gpu_result_equal, assert_ir_translation_raises, ) -from cudf_polars.utils import versions @pytest.fixture( @@ -58,6 +57,22 @@ def mask(request): return request.param +@pytest.fixture( + params=[ + None, + 
(1, 1), + ], + ids=[ + "no-slice", + "slice-second", + ], +) +def slice(request): + # For use in testing that we handle + # polars slice pushdown correctly + return request.param + + def make_source(df, path, format): """ Writes the passed polars df to a file of @@ -79,7 +94,9 @@ def make_source(df, path, format): ("parquet", pl.scan_parquet), ], ) -def test_scan(tmp_path, df, format, scan_fn, row_index, n_rows, columns, mask, request): +def test_scan( + tmp_path, df, format, scan_fn, row_index, n_rows, columns, mask, slice, request +): name, offset = row_index make_source(df, tmp_path / "file", format) request.applymarker( @@ -94,21 +111,23 @@ def test_scan(tmp_path, df, format, scan_fn, row_index, n_rows, columns, mask, r row_index_offset=offset, n_rows=n_rows, ) + if slice is not None: + q = q.slice(*slice) if mask is not None: q = q.filter(mask) if columns is not None: q = q.select(*columns) - polars_collect_kwargs = {} - if versions.POLARS_VERSION_LT_12: - # https://github.com/pola-rs/polars/issues/17553 - polars_collect_kwargs = {"projection_pushdown": False} - assert_gpu_result_equal( - q, - polars_collect_kwargs=polars_collect_kwargs, - # This doesn't work in polars < 1.2 since the row-index - # is in the wrong order in previous polars releases - check_column_order=versions.POLARS_VERSION_LT_12, - ) + assert_gpu_result_equal(q) + + +def test_negative_slice_pushdown_raises(tmp_path): + df = pl.DataFrame({"a": [1, 2, 3]}) + + df.write_parquet(tmp_path / "df.parquet") + q = pl.scan_parquet(tmp_path / "df.parquet") + # Take the last row + q = q.slice(-1, 1) + assert_ir_translation_raises(q, NotImplementedError) def test_scan_unsupported_raises(tmp_path): @@ -127,10 +146,6 @@ def test_scan_ndjson_nrows_notimplemented(tmp_path, df): assert_ir_translation_raises(q, NotImplementedError) -@pytest.mark.xfail( - versions.POLARS_VERSION_LT_11, - reason="https://github.com/pola-rs/polars/issues/15730", -) def test_scan_row_index_projected_out(tmp_path): df = pl.DataFrame({"a": [1, 2, 3]}) @@ -169,15 +184,25 @@ def test_scan_csv_column_renames_projection_schema(tmp_path): ("test*.csv", False), ], ) -def test_scan_csv_multi(tmp_path, filename, glob): +@pytest.mark.parametrize( + "nrows_skiprows", + [ + (None, 0), + (1, 1), + (3, 0), + (4, 2), + ], +) +def test_scan_csv_multi(tmp_path, filename, glob, nrows_skiprows): + n_rows, skiprows = nrows_skiprows with (tmp_path / "test1.csv").open("w") as f: - f.write("""foo,bar,baz\n1,2\n3,4,5""") + f.write("""foo,bar,baz\n1,2,3\n3,4,5""") with (tmp_path / "test2.csv").open("w") as f: - f.write("""foo,bar,baz\n1,2\n3,4,5""") + f.write("""foo,bar,baz\n1,2,3\n3,4,5""") with (tmp_path / "test*.csv").open("w") as f: - f.write("""foo,bar,baz\n1,2\n3,4,5""") + f.write("""foo,bar,baz\n1,2,3\n3,4,5""") os.chdir(tmp_path) - q = pl.scan_csv(filename, glob=glob) + q = pl.scan_csv(filename, glob=glob, n_rows=n_rows, skip_rows=skiprows) assert_gpu_result_equal(q) @@ -280,3 +305,24 @@ def test_scan_ndjson_unsupported(df, tmp_path): make_source(df, tmp_path / "file", "ndjson") q = pl.scan_ndjson(tmp_path / "file", ignore_errors=True) assert_ir_translation_raises(q, NotImplementedError) + + +def test_scan_parquet_nested_null_raises(tmp_path): + df = pl.DataFrame({"a": pl.Series([None], dtype=pl.List(pl.Null))}) + + df.write_parquet(tmp_path / "file.pq") + + q = pl.scan_parquet(tmp_path / "file.pq") + + assert_ir_translation_raises(q, NotImplementedError) + + +def test_scan_parquet_only_row_index_raises(df, tmp_path): + make_source(df, tmp_path / "file", "parquet") + q = 
pl.scan_parquet(tmp_path / "file", row_index_name="index").select("index") + assert_ir_translation_raises(q, NotImplementedError) + + +def test_scan_hf_url_raises(): + q = pl.scan_csv("hf://datasets/scikit-learn/iris/Iris.csv") + assert_ir_translation_raises(q, NotImplementedError) diff --git a/python/cudf_polars/tests/test_sort.py b/python/cudf_polars/tests/test_sort.py index ecc02efd967..cfa8e5ff9b9 100644 --- a/python/cudf_polars/tests/test_sort.py +++ b/python/cudf_polars/tests/test_sort.py @@ -13,10 +13,7 @@ "sort_keys", [ (pl.col("a"),), - pytest.param( - (pl.col("d").abs(),), - marks=pytest.mark.xfail(reason="abs not yet implemented"), - ), + (pl.col("d").abs(),), (pl.col("a"), pl.col("d")), (pl.col("b"),), ], diff --git a/python/cudf_polars/tests/testing/test_asserts.py b/python/cudf_polars/tests/testing/test_asserts.py index 5bc2fe1efb7..8e7f1a09d9b 100644 --- a/python/cudf_polars/tests/testing/test_asserts.py +++ b/python/cudf_polars/tests/testing/test_asserts.py @@ -7,7 +7,10 @@ import polars as pl +from cudf_polars.containers import DataFrame +from cudf_polars.dsl.ir import Select from cudf_polars.testing.asserts import ( + assert_collect_raises, assert_gpu_result_equal, assert_ir_translation_raises, ) @@ -26,10 +29,62 @@ def test_translation_assert_raises(): class E(Exception): pass - unsupported = df.group_by("a").agg(pl.col("a").cum_max().alias("b")) + unsupported = df.group_by("a").agg(pl.col("a").upper_bound().alias("b")) # Unsupported query should raise NotImplementedError assert_ir_translation_raises(unsupported, NotImplementedError) with pytest.raises(AssertionError): # This should fail, because we can't translate this query, but it doesn't raise E. assert_ir_translation_raises(unsupported, E) + + +def test_collect_assert_raises(monkeypatch): + df = pl.LazyFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]}) + + with pytest.raises(AssertionError): + # This should raise, because polars CPU can run this query + assert_collect_raises( + df, + polars_except=pl.exceptions.InvalidOperationError, + cudf_except=pl.exceptions.InvalidOperationError, + ) + + # Here's an invalid query that gets caught at IR optimisation time. + q = df.select(pl.col("a") * pl.col("b")) + + # This exception is raised in preprocessing, so is the same for + # both CPU and GPU engines. + assert_collect_raises( + q, + polars_except=pl.exceptions.InvalidOperationError, + cudf_except=pl.exceptions.InvalidOperationError, + ) + + with pytest.raises(AssertionError): + # This should raise because the expected GPU error is wrong + assert_collect_raises( + q, + polars_except=pl.exceptions.InvalidOperationError, + cudf_except=NotImplementedError, + ) + + with pytest.raises(AssertionError): + # This should raise because the expected CPU error is wrong + assert_collect_raises( + q, + polars_except=NotImplementedError, + cudf_except=pl.exceptions.InvalidOperationError, + ) + + with monkeypatch.context() as m: + m.setattr(Select, "evaluate", lambda self, cache: DataFrame([])) + # This query should fail, but we monkeypatch a bad + # implementation of Select which "succeeds" to check that our + # assertion notices this case. 
+ q = df.select(pl.col("a") + pl.Series([1, 2])) + with pytest.raises(AssertionError): + assert_collect_raises( + q, + polars_except=pl.exceptions.ComputeError, + cudf_except=pl.exceptions.ComputeError, + ) diff --git a/python/custreamz/custreamz/tests/pytest.ini b/python/custreamz/custreamz/tests/pytest.ini new file mode 100644 index 00000000000..7b0a9f29fb1 --- /dev/null +++ b/python/custreamz/custreamz/tests/pytest.ini @@ -0,0 +1,4 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +[pytest] +addopts = --tb=native diff --git a/python/custreamz/pyproject.toml b/python/custreamz/pyproject.toml index 5aa474e2862..85ab0024bb5 100644 --- a/python/custreamz/pyproject.toml +++ b/python/custreamz/pyproject.toml @@ -20,8 +20,8 @@ license = { text = "Apache 2.0" } requires-python = ">=3.10" dependencies = [ "confluent-kafka>=2.5.0,<2.6.0a0", - "cudf==24.10.*,>=0.0.0a0", - "cudf_kafka==24.10.*,>=0.0.0a0", + "cudf==24.12.*,>=0.0.0a0", + "cudf_kafka==24.12.*,>=0.0.0a0", "streamz", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ diff --git a/python/dask_cudf/README.md b/python/dask_cudf/README.md index 4655d2165f0..69e1524be39 100644 --- a/python/dask_cudf/README.md +++ b/python/dask_cudf/README.md @@ -16,6 +16,7 @@ See the [RAPIDS install page](https://docs.rapids.ai/install) for the most up-to ## Resources - [Dask cuDF documentation](https://docs.rapids.ai/api/dask-cudf/stable/) +- [Best practices](https://docs.rapids.ai/api/dask-cudf/stable/best_practices/) - [cuDF documentation](https://docs.rapids.ai/api/cudf/stable/) - [10 Minutes to cuDF and Dask cuDF](https://docs.rapids.ai/api/cudf/stable/user_guide/10min/) - [Dask-CUDA documentation](https://docs.rapids.ai/api/dask-cuda/stable/) diff --git a/python/dask_cudf/dask_cudf/expr/_collection.py b/python/dask_cudf/dask_cudf/expr/_collection.py index 97e1dffc65b..907abaa2bfc 100644 --- a/python/dask_cudf/dask_cudf/expr/_collection.py +++ b/python/dask_cudf/dask_cudf/expr/_collection.py @@ -15,6 +15,7 @@ from dask import config from dask.dataframe.core import is_dataframe_like +from dask.typing import no_default import cudf @@ -90,6 +91,17 @@ def var( ) ) + def rename_axis( + self, mapper=no_default, index=no_default, columns=no_default, axis=0 + ): + from dask_cudf.expr._expr import RenameAxisCudf + + return new_collection( + RenameAxisCudf( + self, mapper=mapper, index=index, columns=columns, axis=axis + ) + ) + class DataFrame(DXDataFrame, CudfFrameBase): @classmethod @@ -202,27 +214,58 @@ class Index(DXIndex, CudfFrameBase): ## -try: - from dask_expr._backends import create_array_collection - - @get_collection_type.register_lazy("cupy") - def _register_cupy(): - import cupy - - @get_collection_type.register(cupy.ndarray) - def get_collection_type_cupy_array(_): - return create_array_collection - - @get_collection_type.register_lazy("cupyx") - def _register_cupyx(): - # Needed for cuml - from cupyx.scipy.sparse import spmatrix - - @get_collection_type.register(spmatrix) - def get_collection_type_csr_matrix(_): - return create_array_collection - -except ImportError: - # Older version of dask-expr. - # Implicit conversion to array wont work. - pass +def _create_array_collection_with_meta(expr): + # NOTE: This is the GPU compatible version of + # `new_dd_object` for DataFrame -> Array conversion. 
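+    # It wraps the optimized expression graph in a dask Array whose +    # first-axis chunk sizes are unknown (NaN), since row counts are not +    # known until compute.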
+ # This can be removed if dask#11017 is resolved + # (See: https://github.com/dask/dask/issues/11017) + import numpy as np + + import dask.array as da + from dask.blockwise import Blockwise + from dask.highlevelgraph import HighLevelGraph + + result = expr.optimize() + dsk = result.__dask_graph__() + name = result._name + meta = result._meta + divisions = result.divisions + chunks = ((np.nan,) * (len(divisions) - 1),) + tuple( + (d,) for d in meta.shape[1:] + ) + if len(chunks) > 1: + if isinstance(dsk, HighLevelGraph): + layer = dsk.layers[name] + else: + # dask-expr provides a dict only + layer = dsk + if isinstance(layer, Blockwise): + layer.new_axes["j"] = chunks[1][0] + layer.output_indices = layer.output_indices + ("j",) + else: + suffix = (0,) * (len(chunks) - 1) + for i in range(len(chunks[0])): + layer[(name, i) + suffix] = layer.pop((name, i)) + + return da.Array(dsk, name=name, chunks=chunks, meta=meta) + + +@get_collection_type.register_lazy("cupy") +def _register_cupy(): + import cupy + + get_collection_type.register( + cupy.ndarray, + lambda _: _create_array_collection_with_meta, + ) + + +@get_collection_type.register_lazy("cupyx") +def _register_cupyx(): + # Needed for cuml + from cupyx.scipy.sparse import spmatrix + + get_collection_type.register( + spmatrix, + lambda _: _create_array_collection_with_meta, + ) diff --git a/python/dask_cudf/dask_cudf/expr/_expr.py b/python/dask_cudf/dask_cudf/expr/_expr.py index 8a2c50d3fe7..b284ab3774d 100644 --- a/python/dask_cudf/dask_cudf/expr/_expr.py +++ b/python/dask_cudf/dask_cudf/expr/_expr.py @@ -4,11 +4,12 @@ import dask_expr._shuffle as _shuffle_module from dask_expr import new_collection from dask_expr._cumulative import CumulativeBlockwise -from dask_expr._expr import Elemwise, Expr, VarColumns +from dask_expr._expr import Elemwise, Expr, RenameAxis, VarColumns from dask_expr._reductions import Reduction, Var from dask.dataframe.core import is_dataframe_like, make_meta, meta_nonempty from dask.dataframe.dispatch import is_categorical_dtype +from dask.typing import no_default import cudf @@ -17,6 +18,19 @@ ## +class RenameAxisCudf(RenameAxis): + # TODO: Remove this after rename_axis is supported in cudf + # (See: https://github.com/rapidsai/cudf/issues/16895) + @staticmethod + def operation(df, index=no_default, **kwargs): + if index != no_default: + df.index.name = index + return df + raise NotImplementedError( + "Only `index` is supported for the cudf backend" + ) + + class ToCudfBackend(Elemwise): # TODO: Inherit from ToBackend when rapids-dask-dependency # is pinned to dask>=2024.8.1 diff --git a/python/dask_cudf/dask_cudf/io/parquet.py b/python/dask_cudf/dask_cudf/io/parquet.py index e793d4381d1..a781b8242fe 100644 --- a/python/dask_cudf/dask_cudf/io/parquet.py +++ b/python/dask_cudf/dask_cudf/io/parquet.py @@ -23,7 +23,6 @@ from cudf.io import write_to_dataset from cudf.io.parquet import _apply_post_filters, _normalize_filters from cudf.utils.dtypes import cudf_dtype_from_pa_type -from cudf.utils.ioutils import _ROW_GROUP_SIZE_BYTES_DEFAULT class CudfEngine(ArrowDatasetEngine): @@ -341,9 +340,7 @@ def write_partition( return_metadata=return_metadata, statistics=kwargs.get("statistics", "ROWGROUP"), int96_timestamps=kwargs.get("int96_timestamps", False), - row_group_size_bytes=kwargs.get( - "row_group_size_bytes", _ROW_GROUP_SIZE_BYTES_DEFAULT - ), + row_group_size_bytes=kwargs.get("row_group_size_bytes", None), row_group_size_rows=kwargs.get("row_group_size_rows", None), 
max_page_size_bytes=kwargs.get("max_page_size_bytes", None), max_page_size_rows=kwargs.get("max_page_size_rows", None), @@ -365,7 +362,7 @@ def write_partition( statistics=kwargs.get("statistics", "ROWGROUP"), int96_timestamps=kwargs.get("int96_timestamps", False), row_group_size_bytes=kwargs.get( - "row_group_size_bytes", _ROW_GROUP_SIZE_BYTES_DEFAULT + "row_group_size_bytes", None ), row_group_size_rows=kwargs.get( "row_group_size_rows", None diff --git a/python/dask_cudf/dask_cudf/tests/pytest.ini b/python/dask_cudf/dask_cudf/tests/pytest.ini new file mode 100644 index 00000000000..7b0a9f29fb1 --- /dev/null +++ b/python/dask_cudf/dask_cudf/tests/pytest.ini @@ -0,0 +1,4 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +[pytest] +addopts = --tb=native diff --git a/python/dask_cudf/dask_cudf/tests/test_core.py b/python/dask_cudf/dask_cudf/tests/test_core.py index 7aa0f6320f2..5f0fae86691 100644 --- a/python/dask_cudf/dask_cudf/tests/test_core.py +++ b/python/dask_cudf/dask_cudf/tests/test_core.py @@ -16,6 +16,7 @@ import dask_cudf from dask_cudf.tests.utils import ( + QUERY_PLANNING_ON, require_dask_expr, skip_dask_expr, xfail_dask_expr, @@ -950,12 +951,16 @@ def test_implicit_array_conversion_cupy(): def func(x): return x.values - # Need to compute the dask collection for now. - # See: https://github.com/dask/dask/issues/11017 - result = ds.map_partitions(func, meta=s.values).compute() - expect = func(s) + result = ds.map_partitions(func, meta=s.values) - dask.array.assert_eq(result, expect) + if QUERY_PLANNING_ON: + # Check Array and round-tripped DataFrame + dask.array.assert_eq(result, func(s)) + dd.assert_eq(result.to_dask_dataframe(), s, check_index=False) + else: + # Legacy version still carries numpy metadata + # See: https://github.com/dask/dask/issues/11017 + dask.array.assert_eq(result.compute(), func(s)) def test_implicit_array_conversion_cupy_sparse(): @@ -967,8 +972,6 @@ def test_implicit_array_conversion_cupy_sparse(): def func(x): return cupyx.scipy.sparse.csr_matrix(x.values) - # Need to compute the dask collection for now. - # See: https://github.com/dask/dask/issues/11017 result = ds.map_partitions(func, meta=s.values).compute() expect = func(s) @@ -1024,3 +1027,15 @@ def test_cov_corr(op, numeric_only): # (See: https://github.com/rapidsai/cudf/issues/12626) expect = getattr(df.to_pandas(), op)(numeric_only=numeric_only) dd.assert_eq(res, expect) + + +def test_rename_axis_after_join(): + df1 = cudf.DataFrame(index=["a", "b", "c"], data=dict(a=[1, 2, 3])) + df1.index.name = "test" + ddf1 = dd.from_pandas(df1, 2) + + df2 = cudf.DataFrame(index=["a", "b", "d"], data=dict(b=[1, 2, 3])) + ddf2 = dd.from_pandas(df2, 2) + result = ddf1.join(ddf2, how="outer") + expected = df1.join(df2, how="outer") + dd.assert_eq(result, expected, check_index=False) diff --git a/python/dask_cudf/pyproject.toml b/python/dask_cudf/pyproject.toml index 9ac834586a6..c64de06338f 100644 --- a/python/dask_cudf/pyproject.toml +++ b/python/dask_cudf/pyproject.toml @@ -19,12 +19,12 @@ authors = [ license = { text = "Apache 2.0" } requires-python = ">=3.10" dependencies = [ - "cudf==24.10.*,>=0.0.0a0", + "cudf==24.12.*,>=0.0.0a0", "cupy-cuda11x>=12.0.0", "fsspec>=0.6.0", "numpy>=1.23,<3.0a0", "pandas>=2.0,<2.2.3dev0", - "rapids-dask-dependency==24.10.*,>=0.0.0a0", + "rapids-dask-dependency==24.12.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
classifiers = [ "Intended Audience :: Developers", @@ -45,7 +45,7 @@ cudf = "dask_cudf.backends:CudfDXBackendEntrypoint" [project.optional-dependencies] test = [ - "dask-cuda==24.10.*,>=0.0.0a0", + "dask-cuda==24.12.*,>=0.0.0a0", "numba>=0.57", "pytest-cov", "pytest-xdist", diff --git a/python/libcudf/pyproject.toml b/python/libcudf/pyproject.toml index 2c98b97eddf..5bffe9fd96c 100644 --- a/python/libcudf/pyproject.toml +++ b/python/libcudf/pyproject.toml @@ -66,7 +66,7 @@ dependencies-file = "../../dependencies.yaml" matrix-entry = "cuda_suffixed=true" requires = [ "cmake>=3.26.4,!=3.30.0", - "libkvikio==24.10.*,>=0.0.0a0", - "librmm==24.10.*,>=0.0.0a0", + "libkvikio==24.12.*,>=0.0.0a0", + "librmm==24.12.*,>=0.0.0a0", "ninja", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. diff --git a/python/pylibcudf/pylibcudf/datetime.pyx b/python/pylibcudf/pylibcudf/datetime.pyx index 0ddc68bcb9d..e8e0caaf42d 100644 --- a/python/pylibcudf/pylibcudf/datetime.pyx +++ b/python/pylibcudf/pylibcudf/datetime.pyx @@ -2,7 +2,19 @@ from libcpp.memory cimport unique_ptr from libcpp.utility cimport move from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.datetime cimport extract_year as cpp_extract_year +from pylibcudf.libcudf.datetime cimport ( + day_of_year as cpp_day_of_year, + extract_day as cpp_extract_day, + extract_hour as cpp_extract_hour, + extract_microsecond_fraction as cpp_extract_microsecond_fraction, + extract_millisecond_fraction as cpp_extract_millisecond_fraction, + extract_minute as cpp_extract_minute, + extract_month as cpp_extract_month, + extract_nanosecond_fraction as cpp_extract_nanosecond_fraction, + extract_second as cpp_extract_second, + extract_weekday as cpp_extract_weekday, + extract_year as cpp_extract_year, +) from .column cimport Column @@ -28,3 +40,42 @@ cpdef Column extract_year( with nogil: result = move(cpp_extract_year(values.view())) return Column.from_libcudf(move(result)) + + +def extract_datetime_component(Column col, str field): + + cdef unique_ptr[column] c_result + + with nogil: + if field == "year": + c_result = move(cpp_extract_year(col.view())) + elif field == "month": + c_result = move(cpp_extract_month(col.view())) + elif field == "day": + c_result = move(cpp_extract_day(col.view())) + elif field == "weekday": + c_result = move(cpp_extract_weekday(col.view())) + elif field == "hour": + c_result = move(cpp_extract_hour(col.view())) + elif field == "minute": + c_result = move(cpp_extract_minute(col.view())) + elif field == "second": + c_result = move(cpp_extract_second(col.view())) + elif field == "millisecond": + c_result = move( + cpp_extract_millisecond_fraction(col.view()) + ) + elif field == "microsecond": + c_result = move( + cpp_extract_microsecond_fraction(col.view()) + ) + elif field == "nanosecond": + c_result = move( + cpp_extract_nanosecond_fraction(col.view()) + ) + elif field == "day_of_year": + c_result = move(cpp_day_of_year(col.view())) + else: + raise ValueError(f"Invalid datetime field: '{field}'") + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd index 0c352a5068b..f2dd22f43aa 100644 --- a/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd @@ -19,3 +19,13 @@ cdef extern from "nvtext/minhash.hpp" namespace "nvtext" nogil: const 
column_view &seeds, const size_type width, ) except + + + cdef unique_ptr[column] word_minhash( + const column_view &input, + const column_view &seeds + ) except + + + cdef unique_ptr[column] word_minhash64( + const column_view &input, + const column_view &seeds + ) except + diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/CMakeLists.txt b/python/pylibcudf/pylibcudf/libcudf/strings/CMakeLists.txt index bd6e2e0af02..abf4357f862 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/libcudf/strings/CMakeLists.txt @@ -12,7 +12,7 @@ # the License. # ============================================================================= -set(cython_sources char_types.pyx regex_flags.pyx) +set(cython_sources char_types.pyx regex_flags.pyx side_type.pyx) set(linked_libraries cudf::cudf) diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/contains.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/contains.pxd index c2fb5f0dce4..eac0f748257 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/contains.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/contains.pxd @@ -24,4 +24,9 @@ cdef extern from "cudf/strings/contains.hpp" namespace "cudf::strings" nogil: cdef unique_ptr[column] like( column_view source_strings, string_scalar pattern, - string_scalar escape) except + + string_scalar escape_character) except + + + cdef unique_ptr[column] like( + column_view source_strings, + column_view patterns, + string_scalar escape_character) except + diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/extract.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/extract.pxd index 12cd628fc1f..b7166167cfd 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/extract.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/extract.pxd @@ -10,5 +10,9 @@ from pylibcudf.libcudf.table.table cimport table cdef extern from "cudf/strings/extract.hpp" namespace "cudf::strings" nogil: cdef unique_ptr[table] extract( - column_view source_strings, - regex_program) except + + column_view input, + regex_program prog) except + + + cdef unique_ptr[column] extract_all_record( + column_view input, + regex_program prog) except + diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/findall.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/findall.pxd index b25724586e1..e0a8b776465 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/findall.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/findall.pxd @@ -9,5 +9,5 @@ from pylibcudf.libcudf.strings.regex_program cimport regex_program cdef extern from "cudf/strings/findall.hpp" namespace "cudf::strings" nogil: cdef unique_ptr[column] findall( - column_view source_strings, - regex_program) except + + column_view input, + regex_program prog) except + diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/repeat.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/repeat.pxd index 410ff58f299..59262820411 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/repeat.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/repeat.pxd @@ -10,9 +10,9 @@ cdef extern from "cudf/strings/repeat_strings.hpp" namespace "cudf::strings" \ nogil: cdef unique_ptr[column] repeat_strings( - column_view strings, - size_type repeat) except + + column_view input, + size_type repeat_times) except + cdef unique_ptr[column] repeat_strings( - column_view strings, - column_view repeats) except + + column_view input, + column_view repeat_times) except + diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/side_type.pxd 
b/python/pylibcudf/pylibcudf/libcudf/strings/side_type.pxd index 3a89299f11a..019ff3f17ba 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/side_type.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/side_type.pxd @@ -1,10 +1,10 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. from libc.stdint cimport int32_t cdef extern from "cudf/strings/side_type.hpp" namespace "cudf::strings" nogil: - ctypedef enum side_type: + cpdef enum class side_type(int32_t): LEFT 'cudf::strings::side_type::LEFT' RIGHT 'cudf::strings::side_type::RIGHT' BOTH 'cudf::strings::side_type::BOTH' diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/side_type.pyx b/python/pylibcudf/pylibcudf/libcudf/strings/side_type.pyx new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/pylibcudf/pylibcudf/strings/CMakeLists.txt b/python/pylibcudf/pylibcudf/strings/CMakeLists.txt index b499a127541..77f20b0b917 100644 --- a/python/pylibcudf/pylibcudf/strings/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/strings/CMakeLists.txt @@ -12,8 +12,9 @@ # the License. # ============================================================================= -set(cython_sources capitalize.pyx case.pyx char_types.pyx contains.pyx find.pyx regex_flags.pyx - regex_program.pyx replace.pyx slice.pyx +set(cython_sources + capitalize.pyx case.pyx char_types.pyx contains.pyx extract.pyx find.pyx findall.pyx + regex_flags.pyx regex_program.pyx repeat.pyx replace.pyx side_type.pyx slice.pyx strip.pyx ) set(linked_libraries cudf::cudf) @@ -22,3 +23,5 @@ rapids_cython_create_modules( SOURCE_FILES "${cython_sources}" LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_strings_ ASSOCIATED_TARGETS cudf ) + +add_subdirectory(convert) diff --git a/python/pylibcudf/pylibcudf/strings/__init__.pxd b/python/pylibcudf/pylibcudf/strings/__init__.pxd index d1f632d6d8e..91d884b294b 100644 --- a/python/pylibcudf/pylibcudf/strings/__init__.pxd +++ b/python/pylibcudf/pylibcudf/strings/__init__.pxd @@ -5,9 +5,14 @@ from . cimport ( case, char_types, contains, + convert, + extract, find, + findall, regex_flags, regex_program, replace, slice, + strip, ) +from .side_type cimport side_type diff --git a/python/pylibcudf/pylibcudf/strings/__init__.py b/python/pylibcudf/pylibcudf/strings/__init__.py index ef102aff2af..b4856784390 100644 --- a/python/pylibcudf/pylibcudf/strings/__init__.py +++ b/python/pylibcudf/pylibcudf/strings/__init__.py @@ -5,9 +5,15 @@ case, char_types, contains, + convert, + extract, find, + findall, regex_flags, regex_program, + repeat, replace, slice, + strip, ) +from .side_type import SideType diff --git a/python/pylibcudf/pylibcudf/strings/contains.pxd b/python/pylibcudf/pylibcudf/strings/contains.pxd index 2cd4891a0ea..6146a1119d6 100644 --- a/python/pylibcudf/pylibcudf/strings/contains.pxd +++ b/python/pylibcudf/pylibcudf/strings/contains.pxd @@ -1,7 +1,21 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
from pylibcudf.column cimport Column +from pylibcudf.scalar cimport Scalar from pylibcudf.strings.regex_program cimport RegexProgram +ctypedef fused ColumnOrScalar: + Column + Scalar cpdef Column contains_re(Column input, RegexProgram prog) + +cpdef Column count_re(Column input, RegexProgram prog) + +cpdef Column matches_re(Column input, RegexProgram prog) + +cpdef Column like( + Column input, + ColumnOrScalar pattern, + Scalar escape_character = * +) diff --git a/python/pylibcudf/pylibcudf/strings/contains.pyx b/python/pylibcudf/pylibcudf/strings/contains.pyx index 1a2446f6e2c..82bd1fbea32 100644 --- a/python/pylibcudf/pylibcudf/strings/contains.pyx +++ b/python/pylibcudf/pylibcudf/strings/contains.pyx @@ -1,8 +1,14 @@ # Copyright (c) 2024, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr from libcpp.utility cimport move +from cython.operator import dereference + from pylibcudf.column cimport Column from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.scalar.scalar cimport string_scalar +from pylibcudf.libcudf.scalar.scalar_factories cimport ( + make_string_scalar as cpp_make_string_scalar, +) from pylibcudf.libcudf.strings cimport contains as cpp_contains from pylibcudf.strings.regex_program cimport RegexProgram @@ -32,9 +38,131 @@ cpdef Column contains_re( cdef unique_ptr[column] result with nogil: - result = cpp_contains.contains_re( + result = move(cpp_contains.contains_re( + input.view(), + prog.c_obj.get()[0] + )) + + return Column.from_libcudf(move(result)) + + +cpdef Column count_re( + Column input, + RegexProgram prog +): + """Returns the number of times the given regex_program's pattern + matches in each string. + + For details, see :cpp:func:`cudf::strings::count_re`. + + Parameters + ---------- + input : Column + The input strings + prog : RegexProgram + Regex program instance + + Returns + ------- + pylibcudf.Column + New column of match counts for each string + """ + + cdef unique_ptr[column] result + + with nogil: + result = move(cpp_contains.count_re( input.view(), prog.c_obj.get()[0] + )) + + return Column.from_libcudf(move(result)) + + +cpdef Column matches_re( + Column input, + RegexProgram prog +): + """Returns a boolean column identifying rows which + match the given regex_program object, but only at + the beginning of the string. + + For details, see :cpp:func:`cudf::strings::matches_re`. + + Parameters + ---------- + input : Column + The input strings + prog : RegexProgram + Regex program instance + + Returns + ------- + pylibcudf.Column + New column of boolean results for each string + """ + + cdef unique_ptr[column] result + + with nogil: + result = move(cpp_contains.matches_re( + input.view(), + prog.c_obj.get()[0] + )) + + return Column.from_libcudf(move(result)) + + +cpdef Column like(Column input, ColumnOrScalar pattern, Scalar escape_character=None): + """ + Returns a boolean column identifying rows which + match the given like pattern. + + For details, see :cpp:func:`cudf::strings::like`. + + Parameters + ---------- + input : Column + The input strings + pattern : Column or Scalar + Like patterns to match within each string + escape_character : Scalar + Optional character that specifies the escape prefix. + Default is no escape character.
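+        An escape character placed before `%` or `_` makes the +        wildcard match literally; otherwise `%` matches any sequence +        of characters and `_` matches a single character.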
+ + Returns + ------- + pylibcudf.Column + New column of boolean results for each string + """ + cdef unique_ptr[column] result + + if escape_character is None: + escape_character = Scalar.from_libcudf( + cpp_make_string_scalar("".encode()) ) + cdef const string_scalar* c_escape_character = <const string_scalar*>( + escape_character.c_obj.get() + ) + cdef const string_scalar* c_pattern + + if ColumnOrScalar is Column: + with nogil: + result = move(cpp_contains.like( + input.view(), + pattern.view(), + dereference(c_escape_character) + )) + elif ColumnOrScalar is Scalar: + c_pattern = <const string_scalar*>(pattern.c_obj.get()) + with nogil: + result = move(cpp_contains.like( + input.view(), + dereference(c_pattern), + dereference(c_escape_character) + )) + else: + raise ValueError("pattern must be a Column or a Scalar") + return Column.from_libcudf(move(result)) diff --git a/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt b/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt new file mode 100644 index 00000000000..175c9b3738e --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt @@ -0,0 +1,22 @@ +# ============================================================================= +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= + +set(cython_sources convert_durations.pyx convert_datetime.pyx) + +set(linked_libraries cudf::cudf) +rapids_cython_create_modules( + CXX + SOURCE_FILES "${cython_sources}" + LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_strings_ ASSOCIATED_TARGETS cudf +) diff --git a/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd b/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd new file mode 100644 index 00000000000..05324cb49df --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd @@ -0,0 +1,2 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from . cimport convert_datetime, convert_durations diff --git a/python/pylibcudf/pylibcudf/strings/convert/__init__.py b/python/pylibcudf/pylibcudf/strings/convert/__init__.py new file mode 100644 index 00000000000..d803399d53c --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/__init__.py @@ -0,0 +1,2 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from . import convert_datetime, convert_durations diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pxd b/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pxd new file mode 100644 index 00000000000..07c84d263d6 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pxd @@ -0,0 +1,18 @@ +# Copyright (c) 2024, NVIDIA CORPORATION.
+ +from libcpp.string cimport string +from pylibcudf.column cimport Column +from pylibcudf.types cimport DataType + + +cpdef Column to_timestamps( + Column input, + DataType timestamp_type, + const string& format +) + +cpdef Column from_timestamps( + Column input, + const string& format, + Column input_strings_names +) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyx new file mode 100644 index 00000000000..fcacb096f87 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyx @@ -0,0 +1,56 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.string cimport string +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.strings.convert cimport ( + convert_datetime as cpp_convert_datetime, +) + +from pylibcudf.types import DataType + + +cpdef Column to_timestamps( + Column input, + DataType timestamp_type, + const string& format +): + cdef unique_ptr[column] c_result + with nogil: + c_result = cpp_convert_datetime.to_timestamps( + input.view(), + timestamp_type.c_obj, + format + ) + + return Column.from_libcudf(move(c_result)) + +cpdef Column from_timestamps( + Column input, + const string& format, + Column input_strings_names +): + cdef unique_ptr[column] c_result + with nogil: + c_result = cpp_convert_datetime.from_timestamps( + input.view(), + format, + input_strings_names.view() + ) + + return Column.from_libcudf(move(c_result)) + +cpdef Column is_timestamp( + Column input, + const string& format +): + cdef unique_ptr[column] c_result + with nogil: + c_result = cpp_convert_datetime.is_timestamp( + input.view(), + format + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pxd b/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pxd new file mode 100644 index 00000000000..ac11b8959ed --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pxd @@ -0,0 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.string cimport string +from pylibcudf.column cimport Column +from pylibcudf.types cimport DataType + + +cpdef Column to_durations( + Column input, + DataType duration_type, + const string& format +) + +cpdef Column from_durations( + Column input, + const string& format +) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyx new file mode 100644 index 00000000000..f3e0b7c9c8e --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyx @@ -0,0 +1,41 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +from libcpp.memory cimport unique_ptr +from libcpp.string cimport string +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.strings.convert cimport ( + convert_durations as cpp_convert_durations, +) + +from pylibcudf.types import DataType + + +cpdef Column to_durations( + Column input, + DataType duration_type, + const string& format +): + cdef unique_ptr[column] c_result + with nogil: + c_result = cpp_convert_durations.to_durations( + input.view(), + duration_type.c_obj, + format + ) + + return Column.from_libcudf(move(c_result)) + +cpdef Column from_durations( + Column input, + const string& format +): + cdef unique_ptr[column] c_result + with nogil: + c_result = cpp_convert_durations.from_durations( + input.view(), + format + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/extract.pxd b/python/pylibcudf/pylibcudf/strings/extract.pxd new file mode 100644 index 00000000000..3871f5a0e4e --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/extract.pxd @@ -0,0 +1,10 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column +from pylibcudf.strings.regex_program cimport RegexProgram +from pylibcudf.table cimport Table + + +cpdef Table extract(Column input, RegexProgram prog) + +cpdef Column extract_all_record(Column input, RegexProgram prog) diff --git a/python/pylibcudf/pylibcudf/strings/extract.pyx b/python/pylibcudf/pylibcudf/strings/extract.pyx new file mode 100644 index 00000000000..dcb11ca10ce --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/extract.pyx @@ -0,0 +1,76 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.strings cimport extract as cpp_extract +from pylibcudf.libcudf.table.table cimport table +from pylibcudf.strings.regex_program cimport RegexProgram +from pylibcudf.table cimport Table + + +cpdef Table extract(Column input, RegexProgram prog): + """ + Returns a table of strings columns where each column + corresponds to the matching group specified in the given + regex_program object. + + For details, see :cpp:func:`cudf::strings::extract`. + + Parameters + ---------- + input : Column + Strings instance for this operation + prog : RegexProgram + Regex program instance + + Returns + ------- + Table + Columns of strings extracted from the input column. + """ + cdef unique_ptr[table] c_result + + with nogil: + c_result = move( + cpp_extract.extract( + input.view(), + prog.c_obj.get()[0] + ) + ) + + return Table.from_libcudf(move(c_result)) + + +cpdef Column extract_all_record(Column input, RegexProgram prog): + """ + Returns a lists column of strings where each string column + row corresponds to the matching group specified in the given + regex_program object. + + For details, see :cpp:func:`cudf::strings::extract_all_record`.
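+ + For example, the pattern `([ab])(\\d)` applied to `"a1"` produces + the list `["a", "1"]`.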
+ + Parameters + ---------- + input : Column + Strings instance for this operation + prog : RegexProgram + Regex program instance + + Returns + ------- + Column + Lists column containing strings extracted from the input column + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_extract.extract_all_record( + input.view(), + prog.c_obj.get()[0] + ) + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/findall.pxd b/python/pylibcudf/pylibcudf/strings/findall.pxd new file mode 100644 index 00000000000..54afa088141 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/findall.pxd @@ -0,0 +1,7 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column +from pylibcudf.strings.regex_program cimport RegexProgram + + +cpdef Column findall(Column input, RegexProgram pattern) diff --git a/python/pylibcudf/pylibcudf/strings/findall.pyx b/python/pylibcudf/pylibcudf/strings/findall.pyx new file mode 100644 index 00000000000..03ecb13a50e --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/findall.pyx @@ -0,0 +1,40 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.strings cimport findall as cpp_findall +from pylibcudf.strings.regex_program cimport RegexProgram + + +cpdef Column findall(Column input, RegexProgram pattern): + """ + Returns a lists column of strings for each matching occurrence using + the regex_program pattern within each string. + + For details, see :cpp:func:`cudf::strings::findall`. + + Parameters + ---------- + input : Column + Strings instance for this operation + pattern : RegexProgram + Regex pattern + + Returns + ------- + Column + New lists column of strings + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_findall.findall( + input.view(), + pattern.c_obj.get()[0] + ) + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/repeat.pxd b/python/pylibcudf/pylibcudf/strings/repeat.pxd new file mode 100644 index 00000000000..bc70926b6fa --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/repeat.pxd @@ -0,0 +1,10 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column +from pylibcudf.libcudf.types cimport size_type + +ctypedef fused ColumnorSizeType: + Column + size_type + +cpdef Column repeat_strings(Column input, ColumnorSizeType repeat_times) diff --git a/python/pylibcudf/pylibcudf/strings/repeat.pyx b/python/pylibcudf/pylibcudf/strings/repeat.pyx new file mode 100644 index 00000000000..5f627218f6e --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/repeat.pyx @@ -0,0 +1,51 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.strings cimport repeat as cpp_repeat +from pylibcudf.libcudf.types cimport size_type + + +cpdef Column repeat_strings(Column input, ColumnorSizeType repeat_times): + """ + Repeat each string in the given strings column by the numbers + of times given in another numeric column. + + For details, see :cpp:func:`cudf::strings::repeat_strings`. + + Parameters + ---------- + input : Column + The column containing strings to repeat.
+ repeat_times : Column or int + Number(s) of times that the corresponding input strings + for each row are repeated. + + Returns + ------- + Column + New column containing the repeated strings. + """ + cdef unique_ptr[column] c_result + + if ColumnorSizeType is Column: + with nogil: + c_result = move( + cpp_repeat.repeat_strings( + input.view(), + repeat_times.view() + ) + ) + elif ColumnorSizeType is size_type: + with nogil: + c_result = move( + cpp_repeat.repeat_strings( + input.view(), + repeat_times + ) + ) + else: + raise ValueError("repeat_times must be size_type or integer") + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/side_type.pxd b/python/pylibcudf/pylibcudf/strings/side_type.pxd new file mode 100644 index 00000000000..34b7a580380 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/side_type.pxd @@ -0,0 +1,3 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.libcudf.strings.side_type cimport side_type diff --git a/python/pylibcudf/pylibcudf/strings/side_type.pyx b/python/pylibcudf/pylibcudf/strings/side_type.pyx new file mode 100644 index 00000000000..acdc7d6ff1f --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/side_type.pyx @@ -0,0 +1,4 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.libcudf.strings.side_type import \ + side_type as SideType # no-cython-lint diff --git a/python/pylibcudf/pylibcudf/strings/strip.pxd b/python/pylibcudf/pylibcudf/strings/strip.pxd new file mode 100644 index 00000000000..8bbe4753edd --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/strip.pxd @@ -0,0 +1,12 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column +from pylibcudf.scalar cimport Scalar +from pylibcudf.strings.side_type cimport side_type + + +cpdef Column strip( + Column input, + side_type side=*, + Scalar to_strip=* +) diff --git a/python/pylibcudf/pylibcudf/strings/strip.pyx b/python/pylibcudf/pylibcudf/strings/strip.pyx new file mode 100644 index 00000000000..429a23c3cdf --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/strip.pyx @@ -0,0 +1,60 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from cython.operator cimport dereference +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.scalar.scalar cimport string_scalar +from pylibcudf.libcudf.scalar.scalar_factories cimport ( + make_string_scalar as cpp_make_string_scalar, +) +from pylibcudf.libcudf.strings cimport strip as cpp_strip +from pylibcudf.scalar cimport Scalar +from pylibcudf.strings.side_type cimport side_type + + +cpdef Column strip( + Column input, + side_type side=side_type.BOTH, + Scalar to_strip=None +): + """Removes the specified characters from the beginning + or end (or both) of each string. + + For details, see :cpp:func:`cudf::strings::strip`. + + Parameters + ---------- + input : Column + Strings column for this operation + side : SideType, default SideType.BOTH + Indicates characters are to be stripped from the beginning, + end, or both of each string; Default is both + to_strip : Scalar + UTF-8 encoded characters to strip from each string; + Default is empty string which indicates strip whitespace characters + + Returns + ------- + pylibcudf.Column + New strings column. 
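+ + For example, stripping `"a"` from both sides of `"aAaaaAAaa"` + leaves `"AaaaAA"`.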
+ """ + + if to_strip is None: + to_strip = Scalar.from_libcudf( + cpp_make_string_scalar("".encode()) + ) + + cdef unique_ptr[column] c_result + cdef string_scalar* cpp_to_strip + cpp_to_strip = (to_strip.c_obj.get()) + + with nogil: + c_result = cpp_strip.strip( + input.view(), + side, + dereference(cpp_to_strip) + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/tests/pytest.ini b/python/pylibcudf/pylibcudf/tests/pytest.ini index 1761c0f011c..f572f85ca49 100644 --- a/python/pylibcudf/pylibcudf/tests/pytest.ini +++ b/python/pylibcudf/pylibcudf/tests/pytest.ini @@ -6,3 +6,4 @@ filterwarnings = error ignore:::.*xdist.* ignore:::.*pytest.* +addopts = --tb=native diff --git a/python/pylibcudf/pylibcudf/tests/test_datetime.py b/python/pylibcudf/pylibcudf/tests/test_datetime.py index d3aa6101e2d..89c96829e71 100644 --- a/python/pylibcudf/pylibcudf/tests/test_datetime.py +++ b/python/pylibcudf/pylibcudf/tests/test_datetime.py @@ -1,6 +1,7 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import datetime +import functools import pyarrow as pa import pyarrow.compute as pc @@ -10,7 +11,7 @@ @pytest.fixture -def column(has_nulls): +def date_column(has_nulls): values = [ datetime.date(1999, 1, 1), datetime.date(2024, 10, 12), @@ -22,9 +23,41 @@ def column(has_nulls): return plc.interop.from_arrow(pa.array(values, type=pa.date32())) -def test_extract_year(column): - got = plc.datetime.extract_year(column) +@pytest.fixture(scope="module", params=["s", "ms", "us", "ns"]) +def datetime_column(has_nulls, request): + values = [ + datetime.datetime(1999, 1, 1), + datetime.datetime(2024, 10, 12), + datetime.datetime(1970, 1, 1), + datetime.datetime(2260, 1, 1), + datetime.datetime(2024, 2, 29, 3, 14, 15), + datetime.datetime(2024, 2, 29, 3, 14, 15, 999), + ] + if has_nulls: + values[2] = None + return plc.interop.from_arrow( + pa.array(values, type=pa.timestamp(request.param)) + ) + + +@pytest.mark.parametrize( + "component, pc_fun", + [ + ("year", pc.year), + ("month", pc.month), + ("day", pc.day), + ("weekday", functools.partial(pc.day_of_week, count_from_zero=False)), + ("hour", pc.hour), + ("minute", pc.minute), + ("second", pc.second), + ("millisecond", pc.millisecond), + ("microsecond", pc.microsecond), + ("nanosecond", pc.nanosecond), + ], +) +def test_extraction(datetime_column, component, pc_fun): + got = plc.datetime.extract_datetime_component(datetime_column, component) # libcudf produces an int16, arrow produces an int64 - expect = pc.year(plc.interop.to_arrow(column)).cast(pa.int16()) + expect = pc_fun(plc.interop.to_arrow(datetime_column)).cast(pa.int16()) assert_column_eq(expect, got) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_contains.py b/python/pylibcudf/pylibcudf/tests/test_string_contains.py index 4f88e09183f..4e4dd7cbb00 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_contains.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_contains.py @@ -48,3 +48,40 @@ def test_contains_re(target_col, pa_target_scalar, plc_target_pat): pa_target_col, pa_target_scalar.as_py() ) assert_column_eq(got, expected) + + +def test_count_re(): + pattern = "[1-9][a-z]" + arr = pa.array(["A1a2A3a4", "A1A2A3", None]) + result = plc.strings.contains.count_re( + plc.interop.from_arrow(arr), + plc.strings.regex_program.RegexProgram.create( + pattern, plc.strings.regex_flags.RegexFlags.DEFAULT + ), + ) + expected = pc.count_substring_regex(arr, pattern) + assert_column_eq(result, expected) + + +def test_match_re(): + pattern = "[1-9][a-z]" + arr = 
pa.array(["1a2b", "b1a2", None]) + result = plc.strings.contains.matches_re( + plc.interop.from_arrow(arr), + plc.strings.regex_program.RegexProgram.create( + pattern, plc.strings.regex_flags.RegexFlags.DEFAULT + ), + ) + expected = pc.match_substring_regex(arr, f"^{pattern}") + assert_column_eq(result, expected) + + +def test_like(): + pattern = "%a" + arr = pa.array(["1a2aa3aaa"]) + result = plc.strings.contains.like( + plc.interop.from_arrow(arr), + plc.interop.from_arrow(pa.array([pattern])), + ) + expected = pc.match_like(arr, pattern) + assert_column_eq(result, expected) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert.py b/python/pylibcudf/pylibcudf/tests/test_string_convert.py new file mode 100644 index 00000000000..e9e95459d0e --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_convert.py @@ -0,0 +1,85 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from datetime import datetime + +import pyarrow as pa +import pylibcudf as plc +import pytest +from utils import assert_column_eq + + +@pytest.fixture( + scope="module", + params=[ + pa.timestamp("ns"), + pa.timestamp("us"), + pa.timestamp("ms"), + pa.timestamp("s"), + ], +) +def timestamp_type(request): + return request.param + + +@pytest.fixture( + scope="module", + params=[ + pa.duration("ns"), + pa.duration("us"), + pa.duration("ms"), + pa.duration("s"), + ], +) +def duration_type(request): + return request.param + + +@pytest.fixture(scope="module") +def pa_timestamp_col(): + return pa.array(["2011-01-01", "2011-01-02", "2011-01-03"]) + + +@pytest.fixture(scope="module") +def pa_duration_col(): + return pa.array(["05:20:25"]) + + +@pytest.fixture(scope="module") +def plc_timestamp_col(pa_timestamp_col): + return plc.interop.from_arrow(pa_timestamp_col) + + +@pytest.fixture(scope="module") +def plc_duration_col(pa_duration_col): + return plc.interop.from_arrow(pa_duration_col) + + +@pytest.mark.parametrize("format", ["%Y-%m-%d"]) +def test_to_datetime( + pa_timestamp_col, plc_timestamp_col, timestamp_type, format +): + expect = pa.compute.strptime(pa_timestamp_col, format, timestamp_type.unit) + got = plc.strings.convert.convert_datetime.to_timestamps( + plc_timestamp_col, + plc.interop.from_arrow(timestamp_type), + format.encode(), + ) + assert_column_eq(expect, got) + + +@pytest.mark.parametrize("format", ["%H:%M:%S"]) +def test_to_duration(pa_duration_col, plc_duration_col, duration_type, format): + def to_timedelta(duration_str): + date = datetime.strptime(duration_str, format) + return date - datetime(1900, 1, 1) # "%H:%M:%S" zero date + + expect = pa.array([to_timedelta(d.as_py()) for d in pa_duration_col]).cast( + duration_type + ) + + got = plc.strings.convert.convert_durations.to_durations( + plc_duration_col, + plc.interop.from_arrow(duration_type), + format.encode(), + ) + assert_column_eq(expect, got) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_extract.py b/python/pylibcudf/pylibcudf/tests/test_string_extract.py new file mode 100644 index 00000000000..788b86423c4 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_extract.py @@ -0,0 +1,38 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +import pyarrow as pa +import pyarrow.compute as pc +import pylibcudf as plc + + +def test_extract(): + pattern = "([ab])(\\d)" + # pc.extract_regex requires named groups; the names themselves are + # not significant for this test. + pa_pattern = "(?P<letter>[ab])(?P<digit>\\d)" + arr = pa.array(["a1", "b2", "c3"]) + plc_result = plc.strings.extract.extract( + plc.interop.from_arrow(arr), + plc.strings.regex_program.RegexProgram.create( + pattern, plc.strings.regex_flags.RegexFlags.DEFAULT + ), + ) + result = plc.interop.to_arrow(plc_result) + expected = pc.extract_regex(arr, pa_pattern) + for i, result_col in enumerate(result.itercolumns()): + expected_col = pa.chunked_array(expected.field(i)) + assert result_col.fill_null("").equals(expected_col) + + +def test_extract_all_record(): + pattern = "([ab])(\\d)" + arr = pa.array(["a1", "b2", "c3"]) + plc_result = plc.strings.extract.extract_all_record( + plc.interop.from_arrow(arr), + plc.strings.regex_program.RegexProgram.create( + pattern, plc.strings.regex_flags.RegexFlags.DEFAULT + ), + ) + result = plc.interop.to_arrow(plc_result) + expected = pa.chunked_array( + [pa.array([["a", "1"], ["b", "2"], None], type=result.type)] + ) + assert result.equals(expected) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_findall.py b/python/pylibcudf/pylibcudf/tests/test_string_findall.py new file mode 100644 index 00000000000..994552fa276 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_findall.py @@ -0,0 +1,23 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +import re + +import pyarrow as pa +import pylibcudf as plc +from utils import assert_column_eq + + +def test_findall(): + arr = pa.array(["bunny", "rabbit", "hare", "dog"]) + pattern = "[ab]" + result = plc.strings.findall.findall( + plc.interop.from_arrow(arr), + plc.strings.regex_program.RegexProgram.create( + pattern, plc.strings.regex_flags.RegexFlags.DEFAULT + ), + ) + pa_result = plc.interop.to_arrow(result) + expected = pa.array( + [re.findall(pattern, elem) for elem in arr.to_pylist()], + type=pa_result.type, + ) + assert_column_eq(result, expected) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_repeat.py b/python/pylibcudf/pylibcudf/tests/test_string_repeat.py new file mode 100644 index 00000000000..18b5d8bf4d0 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_repeat.py @@ -0,0 +1,20 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pyarrow as pa +import pyarrow.compute as pc +import pylibcudf as plc +import pytest + + +@pytest.mark.parametrize("repeats", [pa.array([2, 2]), 2]) +def test_repeat_strings(repeats): + arr = pa.array(["1", None]) + plc_result = plc.strings.repeat.repeat_strings( + plc.interop.from_arrow(arr), + plc.interop.from_arrow(repeats) + if not isinstance(repeats, int) + else repeats, + ) + result = plc.interop.to_arrow(plc_result) + expected = pa.chunked_array(pc.binary_repeat(arr, repeats)) + assert result.equals(expected) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_strip.py b/python/pylibcudf/pylibcudf/tests/test_string_strip.py new file mode 100644 index 00000000000..005e5e4a405 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_strip.py @@ -0,0 +1,122 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pyarrow as pa +import pylibcudf as plc +import pytest +from utils import assert_column_eq + +data_strings = [ + "AbC", + "123abc", + "", + " ", + None, + "aAaaaAAaa", + " ab c ", + "abc123", + " ", + "\tabc\t", + "\nabc\n", + "\r\nabc\r\n", + "\t\n abc \n\t", + "!@#$%^&*()", + " abc!!! ", + " abc\t\n!!!
", + "__abc__", + "abc\n\n", + "123abc456", + "abcxyzabc", +] + +strip_chars = [ + "a", + "", + " ", + "\t", + "\n", + "\r\n", + "!", + "@#", + "123", + "xyz", + "abc", + "__", + " \t\n", + "abc123", +] + + +@pytest.fixture +def pa_col(): + return pa.array(data_strings, type=pa.string()) + + +@pytest.fixture +def plc_col(pa_col): + return plc.interop.from_arrow(pa_col) + + +@pytest.fixture(params=strip_chars) +def pa_char(request): + return pa.scalar(request.param, type=pa.string()) + + +@pytest.fixture +def plc_char(pa_char): + return plc.interop.from_arrow(pa_char) + + +def test_strip(pa_col, plc_col, pa_char, plc_char): + def strip_string(st, char): + if st is None: + return None + + elif char == "": + return st.strip() + return st.strip(char) + + expected = pa.array( + [strip_string(x, pa_char.as_py()) for x in pa_col.to_pylist()], + type=pa.string(), + ) + + got = plc.strings.strip.strip(plc_col, plc.strings.SideType.BOTH, plc_char) + assert_column_eq(expected, got) + + +def test_strip_right(pa_col, plc_col, pa_char, plc_char): + def strip_string(st, char): + if st is None: + return None + + elif char == "": + return st.rstrip() + return st.rstrip(char) + + expected = pa.array( + [strip_string(x, pa_char.as_py()) for x in pa_col.to_pylist()], + type=pa.string(), + ) + + got = plc.strings.strip.strip( + plc_col, plc.strings.SideType.RIGHT, plc_char + ) + assert_column_eq(expected, got) + + +def test_strip_left(pa_col, plc_col, pa_char, plc_char): + def strip_string(st, char): + if st is None: + return None + + elif char == "": + return st.lstrip() + return st.lstrip(char) + + expected = pa.array( + [strip_string(x, pa_char.as_py()) for x in pa_col.to_pylist()], + type=pa.string(), + ) + + got = plc.strings.strip.strip(plc_col, plc.strings.SideType.LEFT, plc_char) + assert_column_eq(expected, got) diff --git a/python/pylibcudf/pylibcudf/tests/test_transform.py b/python/pylibcudf/pylibcudf/tests/test_transform.py index 06fc35d8835..d5c618f07e4 100644 --- a/python/pylibcudf/pylibcudf/tests/test_transform.py +++ b/python/pylibcudf/pylibcudf/tests/test_transform.py @@ -29,3 +29,54 @@ def test_nans_to_nulls(has_nans): got = input.with_mask(mask, null_count) assert_column_eq(expect, got) + + +def test_bools_to_mask_roundtrip(): + pa_array = pa.array([True, None, False]) + plc_input = plc.interop.from_arrow(pa_array) + mask, result_null_count = plc.transform.bools_to_mask(plc_input) + + assert result_null_count == 2 + result = plc_input.with_mask(mask, result_null_count) + assert_column_eq(pa.array([True, None, None]), result) + + plc_output = plc.transform.mask_to_bools(mask.ptr, 0, len(pa_array)) + result_pa = plc.interop.to_arrow(plc_output) + expected_pa = pa.chunked_array([[True, False, False]]) + assert result_pa.equals(expected_pa) + + +def test_encode(): + pa_table = pa.table({"a": [1, 3, 4], "b": [1, 2, 4]}) + plc_input = plc.interop.from_arrow(pa_table) + result_table, result_column = plc.transform.encode(plc_input) + pa_table_result = plc.interop.to_arrow(result_table) + pa_column_result = plc.interop.to_arrow(result_column) + + pa_table_expected = pa.table( + [[1, 3, 4], [1, 2, 4]], + schema=pa.schema( + [ + pa.field("", pa.int64(), nullable=False), + pa.field("", pa.int64(), nullable=False), + ] + ), + ) + assert pa_table_result.equals(pa_table_expected) + + pa_column_expected = pa.chunked_array([[0, 1, 2]], type=pa.int32()) + assert pa_column_result.equals(pa_column_expected) + + +def test_one_hot_encode(): + pa_column = pa.array([1, 2, 3]) + pa_categories = pa.array([0, 0, 0]) + 
plc_input = plc.interop.from_arrow(pa_column) + plc_categories = plc.interop.from_arrow(pa_categories) + plc_table = plc.transform.one_hot_encode(plc_input, plc_categories) + result = plc.interop.to_arrow(plc_table) + expected = pa.table( + [[False] * 3] * 3, + schema=pa.schema([pa.field("", pa.bool_(), nullable=False)] * 3), + ) + assert result.equals(expected) diff --git a/python/pylibcudf/pylibcudf/transform.pxd b/python/pylibcudf/pylibcudf/transform.pxd index 4b21feffe25..b530f433c97 100644 --- a/python/pylibcudf/pylibcudf/transform.pxd +++ b/python/pylibcudf/pylibcudf/transform.pxd @@ -1,7 +1,21 @@ # Copyright (c) 2024, NVIDIA CORPORATION. +from libcpp cimport bool +from pylibcudf.libcudf.types cimport bitmask_type, data_type from .column cimport Column from .gpumemoryview cimport gpumemoryview +from .table cimport Table +from .types cimport DataType cpdef tuple[gpumemoryview, int] nans_to_nulls(Column input) + +cpdef tuple[gpumemoryview, int] bools_to_mask(Column input) + +cpdef Column mask_to_bools(Py_ssize_t bitmask, int begin_bit, int end_bit) + +cpdef Column transform(Column input, str unary_udf, DataType output_type, bool is_ptx) + +cpdef tuple[Table, Column] encode(Table input) + +cpdef Table one_hot_encode(Column input_column, Column categories) diff --git a/python/pylibcudf/pylibcudf/transform.pyx b/python/pylibcudf/pylibcudf/transform.pyx index 100ccb580ce..bcd6185521a 100644 --- a/python/pylibcudf/pylibcudf/transform.pyx +++ b/python/pylibcudf/pylibcudf/transform.pyx @@ -1,14 +1,20 @@ # Copyright (c) 2024, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr +from libcpp.string cimport string from libcpp.utility cimport move, pair from pylibcudf.libcudf cimport transform as cpp_transform -from pylibcudf.libcudf.types cimport size_type +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.table.table cimport table +from pylibcudf.libcudf.table.table_view cimport table_view +from pylibcudf.libcudf.types cimport bitmask_type, size_type from rmm._lib.device_buffer cimport DeviceBuffer, device_buffer from .column cimport Column from .gpumemoryview cimport gpumemoryview +from .types cimport DataType +from .utils cimport int_to_bitmask_ptr cpdef tuple[gpumemoryview, int] nans_to_nulls(Column input): @@ -32,3 +38,141 @@ cpdef tuple[gpumemoryview, int] nans_to_nulls(Column input): gpumemoryview(DeviceBuffer.c_from_unique_ptr(move(c_result.first))), c_result.second ) + + +cpdef tuple[gpumemoryview, int] bools_to_mask(Column input): + """Create a bitmask from a column of boolean elements + + Parameters + ---------- + input : Column + Column to produce new mask from. + + Returns + ------- + tuple[gpumemoryview, int] + Two-tuple of a gpumemoryview wrapping the bitmask and the null count. + """ + cdef pair[unique_ptr[device_buffer], size_type] c_result + + with nogil: + c_result = move(cpp_transform.bools_to_mask(input.view())) + + return ( + gpumemoryview(DeviceBuffer.c_from_unique_ptr(move(c_result.first))), + c_result.second + ) + + +cpdef Column mask_to_bools(Py_ssize_t bitmask, int begin_bit, int end_bit): + """Creates a boolean column from given bitmask. 
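+ + Each bit in the range becomes one output row: set bits map to True + and unset bits map to False.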
+ + Parameters + ---------- + bitmask : int + Pointer to the bitmask which needs to be converted + begin_bit : int + Position of the bit from which the conversion should start + end_bit : int + Position of the bit before which the conversion should stop + + Returns + ------- + Column + Boolean column of the bitmask from [begin_bit, end_bit) + """ + cdef unique_ptr[column] c_result + cdef bitmask_type * bitmask_ptr = int_to_bitmask_ptr(bitmask) + + with nogil: + c_result = move(cpp_transform.mask_to_bools(bitmask_ptr, begin_bit, end_bit)) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column transform(Column input, str unary_udf, DataType output_type, bool is_ptx): + """Create a new column by applying a unary function against every + element of an input column. + + Parameters + ---------- + input : Column + Column to transform. + unary_udf : str + The PTX/CUDA string of the unary function to apply. + output_type : DataType + The output type that is compatible with the output type in the unary_udf. + is_ptx : bool + If `True`, the UDF is treated as PTX code. + If `False`, the UDF is treated as CUDA code. + + Returns + ------- + Column + The transformed column having the UDF applied to each element. + """ + cdef unique_ptr[column] c_result + cdef string c_unary_udf = unary_udf.encode() + cdef bool c_is_ptx = is_ptx + + with nogil: + c_result = move( + cpp_transform.transform( + input.view(), c_unary_udf, output_type.c_obj, c_is_ptx + ) + ) + + return Column.from_libcudf(move(c_result)) + +cpdef tuple[Table, Column] encode(Table input): + """Encode the rows of the given table as integers. + + Parameters + ---------- + input : Table + Table containing values to be encoded + + Returns + ------- + tuple[Table, Column] + The distinct rows of the input table in sorted order, + and a column of integer indices representing the encoded rows. + """ + cdef pair[unique_ptr[table], unique_ptr[column]] c_result + + with nogil: + c_result = move(cpp_transform.encode(input.view())) + + return ( + Table.from_libcudf(move(c_result.first)), + Column.from_libcudf(move(c_result.second)) + ) + +cpdef Table one_hot_encode(Column input, Column categories): + """Encodes `input` by generating a new column + for each value in `categories` indicating the presence + of that value in `input`. + + Parameters + ---------- + input : Column + Column containing values to be encoded. + categories : Column + Column containing categories. + + Returns + ------- + Table + A table of the encoded values. + """ + cdef pair[unique_ptr[column], table_view] c_result + cdef Table owner_table + + with nogil: + c_result = move(cpp_transform.one_hot_encode(input.view(), categories.view())) + + owner_table = Table( + [Column.from_libcudf(move(c_result.first))] * c_result.second.num_columns() + ) + + return Table.from_table_view(c_result.second, owner_table) diff --git a/python/pylibcudf/pyproject.toml b/python/pylibcudf/pyproject.toml index 3aaca09d8bd..a8224f54e1c 100644 --- a/python/pylibcudf/pyproject.toml +++ b/python/pylibcudf/pyproject.toml @@ -19,11 +19,11 @@ license = { text = "Apache 2.0" } requires-python = ">=3.10" dependencies = [ "cuda-python>=11.7.1,<12.0a0", - "libcudf==24.10.*,>=0.0.0a0", + "libcudf==24.12.*,>=0.0.0a0", "nvtx>=0.2.1", "packaging", "pyarrow>=14.0.0,<18.0.0a0", - "rmm==24.10.*,>=0.0.0a0", + "rmm==24.12.*,>=0.0.0a0", "typing_extensions>=4.0.0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
classifiers = [ @@ -102,10 +102,10 @@ matrix-entry = "cuda_suffixed=true" requires = [ "cmake>=3.26.4,!=3.30.0", "cython>=3.0.3", - "libcudf==24.10.*,>=0.0.0a0", - "librmm==24.10.*,>=0.0.0a0", + "libcudf==24.12.*,>=0.0.0a0", + "librmm==24.12.*,>=0.0.0a0", "ninja", - "rmm==24.10.*,>=0.0.0a0", + "rmm==24.12.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. [tool.scikit-build]
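
As a rough end-to-end illustration, the new pylibcudf entry points added above (extract_datetime_component, strings.strip, and the exported SideType enum) might be exercised as follows; this is a minimal sketch assuming a pylibcudf build that includes these changes:

    import datetime

    import pyarrow as pa
    import pylibcudf as plc

    # Extract the hour component from a timestamp column.
    col = plc.interop.from_arrow(
        pa.array([datetime.datetime(2024, 2, 29, 3, 14, 15)])
    )
    hours = plc.datetime.extract_datetime_component(col, "hour")
    print(plc.interop.to_arrow(hours))  # expected: 3

    # Strip a literal character from both ends of each string.
    stripped = plc.strings.strip.strip(
        plc.interop.from_arrow(pa.array(["aAaaaAAaa"])),
        plc.strings.SideType.BOTH,
        plc.interop.from_arrow(pa.scalar("a")),
    )
    print(plc.interop.to_arrow(stripped))  # expected: "AaaaAA"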