diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile index 8190b5d0297..315a389339a 100644 --- a/.devcontainer/Dockerfile +++ b/.devcontainer/Dockerfile @@ -31,6 +31,6 @@ ENV PYTHONDONTWRITEBYTECODE="1" ENV SCCACHE_REGION="us-east-2" ENV SCCACHE_BUCKET="rapids-sccache-devs" -ENV VAULT_HOST="https://vault.ops.k8s.rapids.ai" +ENV AWS_ROLE_ARN="arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs" ENV HISTFILE="/home/coder/.cache/._bash_history" ENV LIBCUDF_KERNEL_CACHE_PATH="/home/coder/cudf/cpp/build/${PYTHON_PACKAGE_MANAGER}/cuda-${CUDA_VERSION}/latest/jitify_cache" diff --git a/.devcontainer/cuda11.8-conda/devcontainer.json b/.devcontainer/cuda11.8-conda/devcontainer.json index 7a1361e52c5..d86fc0e550a 100644 --- a/.devcontainer/cuda11.8-conda/devcontainer.json +++ b/.devcontainer/cuda11.8-conda/devcontainer.json @@ -5,17 +5,17 @@ "args": { "CUDA": "11.8", "PYTHON_PACKAGE_MANAGER": "conda", - "BASE": "rapidsai/devcontainers:24.10-cpp-cuda11.8-mambaforge-ubuntu22.04" + "BASE": "rapidsai/devcontainers:24.12-cpp-cuda11.8-mambaforge-ubuntu22.04" } }, "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.10-cuda11.8-conda" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.12-cuda11.8-conda" ], "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.10": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.12": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" diff --git a/.devcontainer/cuda11.8-pip/devcontainer.json b/.devcontainer/cuda11.8-pip/devcontainer.json index 64d7cd54130..66a3b22df11 100644 --- a/.devcontainer/cuda11.8-pip/devcontainer.json +++ b/.devcontainer/cuda11.8-pip/devcontainer.json @@ -5,17 +5,17 @@ "args": { "CUDA": "11.8", "PYTHON_PACKAGE_MANAGER": "pip", - "BASE": "rapidsai/devcontainers:24.10-cpp-cuda11.8-ubuntu22.04" + "BASE": "rapidsai/devcontainers:24.12-cpp-cuda11.8-ubuntu22.04" } }, "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.10-cuda11.8-pip" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.12-cuda11.8-pip" ], "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.10": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.12": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" diff --git a/.devcontainer/cuda12.5-conda/devcontainer.json b/.devcontainer/cuda12.5-conda/devcontainer.json index c1924243506..a0e193ff0bf 100644 --- a/.devcontainer/cuda12.5-conda/devcontainer.json +++ b/.devcontainer/cuda12.5-conda/devcontainer.json @@ -5,19 +5,41 @@ "args": { "CUDA": "12.5", "PYTHON_PACKAGE_MANAGER": "conda", - "BASE": "rapidsai/devcontainers:24.10-cpp-mambaforge-ubuntu22.04" + "BASE": "rapidsai/devcontainers:24.12-cpp-mambaforge-ubuntu22.04" } }, "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.10-cuda12.5-conda" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.12-cuda12.5-conda" ], "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.10": {} + "ghcr.io/rapidsai/devcontainers/features/cuda:24.12": { + "version": "12.5", + "installCompilers": false, + "installProfilers": true, + "installDevPackages": false, + "installcuDNN": false, + 
"installcuTensor": false, + "installNCCL": false, + "installCUDARuntime": false, + "installNVRTC": false, + "installOpenCL": false, + "installcuBLAS": false, + "installcuSPARSE": false, + "installcuFFT": false, + "installcuFile": false, + "installcuRAND": false, + "installcuSOLVER": false, + "installNPP": false, + "installnvJPEG": false, + "pruneStaticLibs": true + }, + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.12": {} }, "overrideFeatureInstallOrder": [ + "ghcr.io/rapidsai/devcontainers/features/cuda", "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" ], "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config,conda/pkgs,conda/${localWorkspaceFolderBasename}-cuda12.5-envs}"], diff --git a/.devcontainer/cuda12.5-pip/devcontainer.json b/.devcontainer/cuda12.5-pip/devcontainer.json index beab2940176..125c85cefa9 100644 --- a/.devcontainer/cuda12.5-pip/devcontainer.json +++ b/.devcontainer/cuda12.5-pip/devcontainer.json @@ -5,17 +5,17 @@ "args": { "CUDA": "12.5", "PYTHON_PACKAGE_MANAGER": "pip", - "BASE": "rapidsai/devcontainers:24.10-cpp-cuda12.5-ubuntu22.04" + "BASE": "rapidsai/devcontainers:24.12-cpp-cuda12.5-ubuntu22.04" } }, "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.10-cuda12.5-pip" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.12-cuda12.5-pip" ], "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.10": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.12": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" diff --git a/.github/workflows/auto-assign.yml b/.github/workflows/auto-assign.yml new file mode 100644 index 00000000000..1bf4ac08b69 --- /dev/null +++ b/.github/workflows/auto-assign.yml @@ -0,0 +1,17 @@ +name: "Auto Assign PR" + +on: + pull_request_target: + types: + - opened + - reopened + - synchronize + +jobs: + add_assignees: + runs-on: ubuntu-latest + steps: + - uses: actions-ecosystem/action-add-assignees@v1 + with: + github_token: "${{ secrets.GITHUB_TOKEN }}" + assignees: ${{ github.actor }} diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 43fc567cb67..c034752d373 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -28,7 +28,7 @@ concurrency: jobs: cpp-build: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -37,7 +37,7 @@ jobs: python-build: needs: [cpp-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -46,7 +46,7 @@ jobs: upload-conda: needs: [cpp-build, python-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -57,7 +57,7 @@ jobs: if: github.ref_type == 'branch' needs: python-build secrets: inherit - uses: 
rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.12 with: arch: "amd64" branch: ${{ inputs.branch }} @@ -69,7 +69,7 @@ jobs: sha: ${{ inputs.sha }} wheel-build-libcudf: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.12 with: # build for every combination of arch and CUDA version, but only for the latest Python matrix_filter: group_by([.ARCH, (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by(.PY_VER|split(".")|map(tonumber))) @@ -81,7 +81,7 @@ jobs: wheel-publish-libcudf: needs: wheel-build-libcudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -92,7 +92,7 @@ jobs: wheel-build-pylibcudf: needs: [wheel-publish-libcudf] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -102,7 +102,7 @@ jobs: wheel-publish-pylibcudf: needs: wheel-build-pylibcudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -113,7 +113,7 @@ jobs: wheel-build-cudf: needs: wheel-publish-pylibcudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -123,7 +123,7 @@ jobs: wheel-publish-cudf: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -134,7 +134,7 @@ jobs: wheel-build-dask-cudf: needs: wheel-publish-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.12 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". 
matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -146,7 +146,7 @@ jobs: wheel-publish-dask-cudf: needs: wheel-build-dask-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -157,7 +157,7 @@ jobs: wheel-build-cudf-polars: needs: wheel-publish-pylibcudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.12 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -169,7 +169,7 @@ jobs: wheel-publish-cudf-polars: needs: wheel-build-cudf-polars secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/labeler.yml b/.github/workflows/labeler.yml index 31e78f82a62..f5cb71bfc14 100644 --- a/.github/workflows/labeler.yml +++ b/.github/workflows/labeler.yml @@ -1,4 +1,5 @@ name: "Pull Request Labeler" + on: - pull_request_target diff --git a/.github/workflows/pandas-tests.yaml b/.github/workflows/pandas-tests.yaml index 10c803f7921..c676032779f 100644 --- a/.github/workflows/pandas-tests.yaml +++ b/.github/workflows/pandas-tests.yaml @@ -17,7 +17,7 @@ jobs: pandas-tests: # run the Pandas unit tests secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.12 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". 
matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 766df59594b..a8f5023ef76 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -10,6 +10,7 @@ concurrency: cancel-in-progress: true jobs: + # Please keep pr-builder as the top job here pr-builder: needs: - changed-files @@ -37,122 +38,106 @@ jobs: - unit-tests-cudf-pandas - pandas-tests - pandas-tests-diff + - telemetry-setup secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.12 if: always() with: needs: ${{ toJSON(needs) }} - changed-files: + telemetry-setup: + continue-on-error: true runs-on: ubuntu-latest - name: "Check changed files" - outputs: - test_cpp: ${{ steps.changed-files.outputs.cpp_any_changed == 'true' }} - test_java: ${{ steps.changed-files.outputs.java_any_changed == 'true' }} - test_notebooks: ${{ steps.changed-files.outputs.notebooks_any_changed == 'true' }} - test_python: ${{ steps.changed-files.outputs.python_any_changed == 'true' }} - test_cudf_pandas: ${{ steps.changed-files.outputs.cudf_pandas_any_changed == 'true' }} + env: + OTEL_SERVICE_NAME: 'pr-cudf' steps: - - name: Get PR info - id: get-pr-info - uses: nv-gha-runners/get-pr-info@main - - name: Checkout code repo - uses: actions/checkout@v4 - with: - fetch-depth: 0 - persist-credentials: false - - name: Calculate merge base - id: calculate-merge-base - env: - PR_SHA: ${{ fromJSON(steps.get-pr-info.outputs.pr-info).head.sha }} - BASE_SHA: ${{ fromJSON(steps.get-pr-info.outputs.pr-info).base.sha }} - run: | - (echo -n "merge-base="; git merge-base "$BASE_SHA" "$PR_SHA") > "$GITHUB_OUTPUT" - - name: Get changed files - id: changed-files - uses: tj-actions/changed-files@v45 - with: - base_sha: ${{ steps.calculate-merge-base.outputs.merge-base }} - sha: ${{ fromJSON(steps.get-pr-info.outputs.pr-info).head.sha }} - files_yaml: | - cpp: - - '**' - - '!CONTRIBUTING.md' - - '!README.md' - - '!docs/**' - - '!img/**' - - '!java/**' - - '!notebooks/**' - - '!python/**' - - '!ci/cudf_pandas_scripts/**' - java: - - '**' - - '!CONTRIBUTING.md' - - '!README.md' - - '!docs/**' - - '!img/**' - - '!notebooks/**' - - '!python/**' - - '!ci/cudf_pandas_scripts/**' - notebooks: - - '**' - - '!CONTRIBUTING.md' - - '!README.md' - - '!java/**' - - '!ci/cudf_pandas_scripts/**' - python: - - '**' - - '!CONTRIBUTING.md' - - '!README.md' - - '!docs/**' - - '!img/**' - - '!java/**' - - '!notebooks/**' - - '!ci/cudf_pandas_scripts/**' - cudf_pandas: - - '**' - - 'ci/cudf_pandas_scripts/**' - - '!CONTRIBUTING.md' - - '!README.md' - - '!docs/**' - - '!img/**' - - '!java/**' - - '!notebooks/**' + - name: Telemetry setup + uses: rapidsai/shared-actions/telemetry-dispatch-stash-base-env-vars@main + changed-files: + secrets: inherit + needs: telemetry-setup + uses: rapidsai/shared-workflows/.github/workflows/changed-files.yaml@branch-24.12 + with: + files_yaml: | + test_cpp: + - '**' + - '!CONTRIBUTING.md' + - '!README.md' + - '!ci/cudf_pandas_scripts/**' + - '!docs/**' + - '!img/**' + - '!java/**' + - '!notebooks/**' + - '!python/**' + test_cudf_pandas: + - '**' + - '!CONTRIBUTING.md' + - '!README.md' + - '!docs/**' + - '!img/**' + - '!java/**' + - '!notebooks/**' + test_java: + - '**' + - '!CONTRIBUTING.md' + - '!README.md' + - 
'!ci/cudf_pandas_scripts/**' + - '!docs/**' + - '!img/**' + - '!notebooks/**' + - '!python/**' + test_notebooks: + - '**' + - '!CONTRIBUTING.md' + - '!README.md' + - '!ci/cudf_pandas_scripts/**' + - '!java/**' + test_python: + - '**' + - '!CONTRIBUTING.md' + - '!README.md' + - '!ci/cudf_pandas_scripts/**' + - '!docs/**' + - '!img/**' + - '!java/**' + - '!notebooks/**' checks: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.10 + needs: telemetry-setup + uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.12 with: enable_check_generated_files: false + ignored_pr_jobs: "telemetry-summarize" conda-cpp-build: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.12 with: build_type: pull-request conda-cpp-checks: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.12 with: build_type: pull-request enable_check_symbols: true conda-cpp-tests: needs: [conda-cpp-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.10 - if: needs.changed-files.outputs.test_cpp == 'true' + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.12 + if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_cpp with: build_type: pull-request conda-python-build: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.12 with: build_type: pull-request conda-python-cudf-tests: needs: [conda-python-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.10 - if: needs.changed-files.outputs.test_python == 'true' + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.12 + if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python with: build_type: pull-request script: "ci/test_python_cudf.sh" @@ -160,16 +145,16 @@ jobs: # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism needs: [conda-python-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.10 - if: needs.changed-files.outputs.test_python == 'true' + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.12 + if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python with: build_type: pull-request script: "ci/test_python_other.sh" conda-java-tests: needs: [conda-cpp-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 - if: needs.changed-files.outputs.test_java == 'true' + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.12 + if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_java with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -179,7 +164,7 @@ jobs: static-configure: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 + uses: 
rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.12 with: build_type: pull-request # Use the wheel container so we can skip conda solves and since our @@ -189,8 +174,8 @@ jobs: conda-notebook-tests: needs: [conda-python-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 - if: needs.changed-files.outputs.test_notebooks == 'true' + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.12 + if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_notebooks with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -200,7 +185,7 @@ jobs: docs-build: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.12 with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -210,7 +195,7 @@ jobs: wheel-build-libcudf: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.12 with: # build for every combination of arch and CUDA version, but only for the latest Python matrix_filter: group_by([.ARCH, (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by(.PY_VER|split(".")|map(tonumber))) @@ -219,29 +204,29 @@ jobs: wheel-build-pylibcudf: needs: [checks, wheel-build-libcudf] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.12 with: build_type: pull-request script: "ci/build_wheel_pylibcudf.sh" wheel-build-cudf: needs: wheel-build-pylibcudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.12 with: build_type: pull-request script: "ci/build_wheel_cudf.sh" wheel-tests-cudf: needs: [wheel-build-cudf, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 - if: needs.changed-files.outputs.test_python == 'true' + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.12 + if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python with: build_type: pull-request script: ci/test_wheel_cudf.sh wheel-build-cudf-polars: needs: wheel-build-pylibcudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.12 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -250,30 +235,26 @@ jobs: wheel-tests-cudf-polars: needs: [wheel-build-cudf-polars, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 - if: needs.changed-files.outputs.test_python == 'true' + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.12 + if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". 
matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) build_type: pull-request - # This always runs, but only fails if this PR touches code in - # pylibcudf or cudf_polars script: "ci/test_wheel_cudf_polars.sh" cudf-polars-polars-tests: needs: wheel-build-cudf-polars secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.12 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) build_type: pull-request - # This always runs, but only fails if this PR touches code in - # pylibcudf or cudf_polars script: "ci/test_cudf_polars_polars_tests.sh" wheel-build-dask-cudf: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.12 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -282,8 +263,8 @@ jobs: wheel-tests-dask-cudf: needs: [wheel-build-dask-cudf, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 - if: needs.changed-files.outputs.test_python == 'true' + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.12 + if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -291,7 +272,8 @@ jobs: script: ci/test_wheel_dask_cudf.sh devcontainer: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.12 + needs: telemetry-setup with: arch: '["amd64"]' cuda: '["12.5"]' @@ -302,8 +284,8 @@ jobs: unit-tests-cudf-pandas: needs: [wheel-build-cudf, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 - if: needs.changed-files.outputs.test_python == 'true' || needs.changed-files.outputs.test_cudf_pandas == 'true' + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.12 + if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python || fromJSON(needs.changed-files.outputs.changed_file_groups).test_cudf_pandas with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". 
matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -313,8 +295,8 @@ jobs: # run the Pandas unit tests using PR branch needs: [wheel-build-cudf, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 - if: needs.changed-files.outputs.test_python == 'true' || needs.changed-files.outputs.test_cudf_pandas == 'true' + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.12 + if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python || fromJSON(needs.changed-files.outputs.changed_file_groups).test_cudf_pandas with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -325,8 +307,23 @@ jobs: pandas-tests-diff: # diff the results of running the Pandas unit tests and publish a job summary needs: pandas-tests - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.12 with: node_type: cpu4 build_type: pull-request run_script: "ci/cudf_pandas_scripts/pandas-tests/diff.sh" + + telemetry-summarize: + runs-on: ubuntu-latest + needs: pr-builder + if: always() + continue-on-error: true + steps: + - name: Load stashed telemetry env vars + uses: rapidsai/shared-actions/telemetry-dispatch-load-base-env-vars@main + with: + load_service_name: true + - name: Telemetry summarize + uses: rapidsai/shared-actions/telemetry-dispatch-write-summary@main + with: + cert_concat: "${{ secrets.OTEL_EXPORTER_OTLP_CA_CERTIFICATE }};${{ secrets.OTEL_EXPORTER_OTLP_CLIENT_CERTIFICATE }};${{ secrets.OTEL_EXPORTER_OTLP_CLIENT_KEY }}" diff --git a/.github/workflows/pr_issue_status_automation.yml b/.github/workflows/pr_issue_status_automation.yml index 45e5191eb54..6f0e88fb245 100644 --- a/.github/workflows/pr_issue_status_automation.yml +++ b/.github/workflows/pr_issue_status_automation.yml @@ -23,7 +23,7 @@ on: jobs: get-project-id: - uses: rapidsai/shared-workflows/.github/workflows/project-get-item-id.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/project-get-item-id.yaml@branch-24.12 if: github.event.pull_request.state == 'open' secrets: inherit permissions: @@ -34,7 +34,7 @@ jobs: update-status: # This job sets the PR and its linked issues to "In Progress" status - uses: rapidsai/shared-workflows/.github/workflows/project-get-set-single-select-field.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/project-get-set-single-select-field.yaml@branch-24.12 if: ${{ github.event.pull_request.state == 'open' && needs.get-project-id.outputs.ITEM_PROJECT_ID != '' }} needs: get-project-id with: @@ -50,7 +50,7 @@ jobs: update-sprint: # This job sets the PR and its linked issues to the current "Weekly Sprint" - uses: rapidsai/shared-workflows/.github/workflows/project-get-set-iteration-field.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/project-get-set-iteration-field.yaml@branch-24.12 if: ${{ github.event.pull_request.state == 'open' && needs.get-project-id.outputs.ITEM_PROJECT_ID != '' }} needs: get-project-id with: @@ -62,3 +62,33 @@ jobs: UPDATE_ITEM: true UPDATE_LINKED_ISSUES: true secrets: inherit + + process-branch-name: + if: ${{ 
github.event.pull_request.state == 'open' && needs.get-project-id.outputs.ITEM_PROJECT_ID != '' }} + needs: get-project-id + runs-on: ubuntu-latest + outputs: + branch-name: ${{ steps.process-branch-name.outputs.branch-name }} + steps: + - name: Extract branch name + id: process-branch-name + run: | + branch=${{ github.event.pull_request.base.ref }} + release=${branch#branch-} + echo "branch-name=$release" >> "$GITHUB_OUTPUT" + + update-release: + # This job sets the PR and its linked issues to the release they are targeting + uses: rapidsai/shared-workflows/.github/workflows/project-get-set-single-select-field.yaml@branch-24.12 + if: ${{ github.event.pull_request.state == 'open' && needs.get-project-id.outputs.ITEM_PROJECT_ID != '' }} + needs: [get-project-id, process-branch-name] + with: + PROJECT_ID: "PVT_kwDOAp2shc4AiNzl" + SINGLE_SELECT_FIELD_ID: "PVTSSF_lADOAp2shc4AiNzlzgg52UQ" + SINGLE_SELECT_FIELD_NAME: "Release" + SINGLE_SELECT_OPTION_VALUE: "${{ needs.process-branch-name.outputs.branch-name }}" + ITEM_PROJECT_ID: "${{ needs.get-project-id.outputs.ITEM_PROJECT_ID }}" + ITEM_NODE_ID: "${{ github.event.pull_request.node_id }}" + UPDATE_ITEM: true + UPDATE_LINKED_ISSUES: true + secrets: inherit diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index eb15c2931d2..d261c370fd0 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -16,7 +16,7 @@ on: jobs: conda-cpp-checks: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.12 with: build_type: nightly branch: ${{ inputs.branch }} @@ -25,7 +25,7 @@ jobs: enable_check_symbols: true conda-cpp-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.12 with: build_type: nightly branch: ${{ inputs.branch }} @@ -33,7 +33,7 @@ jobs: sha: ${{ inputs.sha }} conda-cpp-memcheck-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.12 with: build_type: nightly branch: ${{ inputs.branch }} @@ -45,16 +45,29 @@ jobs: run_script: "ci/test_cpp_memcheck.sh" static-configure: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.12 with: - build_type: pull-request + build_type: nightly + branch: ${{ inputs.branch }} + date: ${{ inputs.date }} + sha: ${{ inputs.sha }} # Use the wheel container so we can skip conda solves and since our # primary static consumers (Spark) are not in conda anyway. 
container_image: "rapidsai/ci-wheel:latest" run_script: "ci/configure_cpp_static.sh" + cpp-linters: + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.12 + with: + build_type: nightly + branch: ${{ inputs.branch }} + date: ${{ inputs.date }} + sha: ${{ inputs.sha }} + run_script: "ci/cpp_linters.sh" + file_to_upload: iwyu_results.txt conda-python-cudf-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.12 with: build_type: nightly branch: ${{ inputs.branch }} @@ -64,7 +77,7 @@ jobs: conda-python-other-tests: # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.12 with: build_type: nightly branch: ${{ inputs.branch }} @@ -73,7 +86,7 @@ jobs: script: "ci/test_python_other.sh" conda-java-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.12 with: build_type: nightly branch: ${{ inputs.branch }} @@ -85,7 +98,7 @@ jobs: run_script: "ci/test_java.sh" conda-notebook-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.12 with: build_type: nightly branch: ${{ inputs.branch }} @@ -97,7 +110,7 @@ jobs: run_script: "ci/test_notebooks.sh" wheel-tests-cudf: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.12 with: build_type: nightly branch: ${{ inputs.branch }} @@ -106,10 +119,8 @@ jobs: script: ci/test_wheel_cudf.sh wheel-tests-dask-cudf: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.12 with: - # This selects "ARCH=amd64 + the latest supported Python + CUDA". 
- matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) build_type: nightly branch: ${{ inputs.branch }} date: ${{ inputs.date }} @@ -117,22 +128,28 @@ jobs: script: ci/test_wheel_dask_cudf.sh unit-tests-cudf-pandas: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.12 with: build_type: nightly branch: ${{ inputs.branch }} date: ${{ inputs.date }} sha: ${{ inputs.sha }} script: ci/cudf_pandas_scripts/run_tests.sh - third-party-integration-tests-cudf-pandas: + wheel-tests-cudf-polars: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.12 with: build_type: nightly branch: ${{ inputs.branch }} date: ${{ inputs.date }} sha: ${{ inputs.sha }} - node_type: "gpu-v100-latest-1" - container_image: "rapidsai/ci-conda:latest" - run_script: | - ci/cudf_pandas_scripts/third-party-integration/test.sh python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml + script: "ci/test_wheel_cudf_polars.sh" + cudf-polars-polars-tests: + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.12 + with: + build_type: nightly + branch: ${{ inputs.branch }} + date: ${{ inputs.date }} + sha: ${{ inputs.sha }} + script: "ci/test_cudf_polars_polars_tests.sh" diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 56469ce011e..4206e0626c0 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -16,17 +16,6 @@ repos: ^cpp/cmake/thirdparty/patches/.*| ^python/cudf/cudf/tests/data/subword_tokenizer_data/.* ) - - repo: https://github.com/PyCQA/isort - rev: 5.13.2 - hooks: - - id: isort - # Use the config file specific to each subproject so that each - # project can specify its own first/third-party packages. 
- args: ["--config-root=python/", "--resolve-all-configs"] - files: python/.* - exclude: | - (?x)^(^python/cudf_polars/.*) - types_or: [python, cython, pyi] - repo: https://github.com/MarcoGorelli/cython-lint rev: v0.16.2 hooks: @@ -95,6 +84,16 @@ repos: entry: 'pytest\.xfail' language: pygrep types: [python] + - id: no-unseeded-default-rng + name: no-unseeded-default-rng + description: 'Enforce that no non-seeded default_rng is used and default_rng is used instead of np.random.seed' + entry: | + # Check for usage of default_rng without seeding + default_rng\(\)| + # Check for usage of np.random.seed (NPY002 only disallows this being called) + np.random.seed + language: pygrep + types: [python] - id: cmake-format name: cmake-format entry: ./cpp/scripts/run-cmake-format.sh cmake-format @@ -106,6 +105,10 @@ repos: - cmakelang==0.6.13 verbose: true require_serial: true + exclude: | + (?x)^( + cpp/cmake/Modules/FindCUDAToolkit[.]cmake$ + ) - id: cmake-lint name: cmake-lint entry: ./cpp/scripts/run-cmake-format.sh cmake-lint @@ -117,6 +120,10 @@ repos: - cmakelang==0.6.13 verbose: true require_serial: true + exclude: | + (?x)^( + cpp/cmake/Modules/FindCUDAToolkit[.]cmake$ + ) - id: doxygen-check name: doxygen-check entry: ./ci/checks/doxygen.sh @@ -140,6 +147,7 @@ repos: rev: v0.4.8 hooks: - id: ruff + args: ["--fix"] files: python/.*$ - id: ruff-format files: python/.*$ @@ -151,12 +159,13 @@ repos: (?x)^( cpp/include/cudf_test/cxxopts[.]hpp$| cpp/src/io/parquet/ipc/Message_generated[.]h$| - cpp/src/io/parquet/ipc/Schema_generated[.]h$ + cpp/src/io/parquet/ipc/Schema_generated[.]h$| + cpp/cmake/Modules/FindCUDAToolkit[.]cmake$ ) - id: verify-alpha-spec args: ["--fix", "--mode=release"] - repo: https://github.com/rapidsai/dependency-file-generator - rev: v1.13.11 + rev: v1.16.0 hooks: - id: rapids-dependency-file-generator args: ["--clean"] diff --git a/CHANGELOG.md b/CHANGELOG.md index b9d37c25dad..72d8f80208d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,332 @@ +# cudf 24.12.00 (11 Dec 2024) + +## 🚨 Breaking Changes + +- Fix reading Parquet string cols when `nrows` and `input_pass_limit` > 0 ([#17321](https://github.com/rapidsai/cudf/pull/17321)) [@mhaseeb123](https://github.com/mhaseeb123) +- prefer wheel-provided libcudf.so in load_library(), use RTLD_LOCAL ([#17316](https://github.com/rapidsai/cudf/pull/17316)) [@jameslamb](https://github.com/jameslamb) +- Deprecate single component extraction methods in libcudf ([#17221](https://github.com/rapidsai/cudf/pull/17221)) [@Matt711](https://github.com/Matt711) +- Move detail header floating_conversion.hpp to detail subdirectory ([#17209](https://github.com/rapidsai/cudf/pull/17209)) [@davidwendt](https://github.com/davidwendt) +- Refactor Dask cuDF legacy code ([#17205](https://github.com/rapidsai/cudf/pull/17205)) [@rjzamora](https://github.com/rjzamora) +- Make HostMemoryBuffer call into the DefaultHostMemoryAllocator ([#17204](https://github.com/rapidsai/cudf/pull/17204)) [@revans2](https://github.com/revans2) +- Remove java reservation ([#17189](https://github.com/rapidsai/cudf/pull/17189)) [@revans2](https://github.com/revans2) +- Separate evaluation logic from `IR` objects in cudf-polars ([#17175](https://github.com/rapidsai/cudf/pull/17175)) [@rjzamora](https://github.com/rjzamora) +- Upgrade to polars 1.11 in cudf-polars ([#17154](https://github.com/rapidsai/cudf/pull/17154)) [@wence-](https://github.com/wence-) +- Remove the additional host register calls initially intended for performance improvement on Grace Hopper 
([#17092](https://github.com/rapidsai/cudf/pull/17092)) [@kingcrimsontianyu](https://github.com/kingcrimsontianyu) +- Correctly set `is_device_accesible` when creating `host_span`s from other container/span types ([#17079](https://github.com/rapidsai/cudf/pull/17079)) [@vuule](https://github.com/vuule) +- Unify treatment of `Expr` and `IR` nodes in cudf-polars DSL ([#17016](https://github.com/rapidsai/cudf/pull/17016)) [@wence-](https://github.com/wence-) +- Deprecate support for directly accessing logger ([#16964](https://github.com/rapidsai/cudf/pull/16964)) [@vyasr](https://github.com/vyasr) +- Made cudftestutil header-only and removed GTest dependency ([#16839](https://github.com/rapidsai/cudf/pull/16839)) [@lamarrr](https://github.com/lamarrr) + +## 🐛 Bug Fixes + +- Turn off cudf.pandas 3rd party integrations tests for 24.12 ([#17500](https://github.com/rapidsai/cudf/pull/17500)) [@Matt711](https://github.com/Matt711) +- Ignore errors when testing glibc versions ([#17389](https://github.com/rapidsai/cudf/pull/17389)) [@vyasr](https://github.com/vyasr) +- Adapt to KvikIO API change in the compatibility mode ([#17377](https://github.com/rapidsai/cudf/pull/17377)) [@kingcrimsontianyu](https://github.com/kingcrimsontianyu) +- Support pivot with index or column arguments as lists ([#17373](https://github.com/rapidsai/cudf/pull/17373)) [@mroeschke](https://github.com/mroeschke) +- Deselect failing polars tests ([#17362](https://github.com/rapidsai/cudf/pull/17362)) [@pentschev](https://github.com/pentschev) +- Fix integer overflow in compiled binaryop ([#17354](https://github.com/rapidsai/cudf/pull/17354)) [@wence-](https://github.com/wence-) +- Update cmake to 3.28.6 in JNI Dockerfile ([#17342](https://github.com/rapidsai/cudf/pull/17342)) [@jlowe](https://github.com/jlowe) +- fix library-loading issues in editable installs ([#17338](https://github.com/rapidsai/cudf/pull/17338)) [@jameslamb](https://github.com/jameslamb) +- Bug fix: restrict lines=True to JSON format in Kafka read_gdf method ([#17333](https://github.com/rapidsai/cudf/pull/17333)) [@a-hirota](https://github.com/a-hirota) +- Fix various issues with `replace` API and add support in `datetime` and `timedelta` columns ([#17331](https://github.com/rapidsai/cudf/pull/17331)) [@galipremsagar](https://github.com/galipremsagar) +- Do not exclude nanoarrow and flatbuffers from installation if statically linked ([#17322](https://github.com/rapidsai/cudf/pull/17322)) [@hyperbolic2346](https://github.com/hyperbolic2346) +- Fix reading Parquet string cols when `nrows` and `input_pass_limit` > 0 ([#17321](https://github.com/rapidsai/cudf/pull/17321)) [@mhaseeb123](https://github.com/mhaseeb123) +- Remove another reference to `FindcuFile` ([#17315](https://github.com/rapidsai/cudf/pull/17315)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA) +- Fix reading of single-row unterminated CSV files ([#17305](https://github.com/rapidsai/cudf/pull/17305)) [@vuule](https://github.com/vuule) +- Fixed lifetime issue in ast transform tests ([#17292](https://github.com/rapidsai/cudf/pull/17292)) [@lamarrr](https://github.com/lamarrr) +- Switch to using `TaskSpec` ([#17285](https://github.com/rapidsai/cudf/pull/17285)) [@galipremsagar](https://github.com/galipremsagar) +- Fix data_type ctor call in JSON_TEST ([#17273](https://github.com/rapidsai/cudf/pull/17273)) [@davidwendt](https://github.com/davidwendt) +- Expose delimiter character in JSON reader options to JSON reader APIs ([#17266](https://github.com/rapidsai/cudf/pull/17266)) 
[@shrshi](https://github.com/shrshi) +- Fix extract-datetime deprecation warning in ndsh benchmark ([#17254](https://github.com/rapidsai/cudf/pull/17254)) [@davidwendt](https://github.com/davidwendt) +- Disallow cuda-python 12.6.1 and 11.8.4 ([#17253](https://github.com/rapidsai/cudf/pull/17253)) [@bdice](https://github.com/bdice) +- Wrap custom iterator result ([#17251](https://github.com/rapidsai/cudf/pull/17251)) [@galipremsagar](https://github.com/galipremsagar) +- Fix binop with LHS numpy datetimelike scalar ([#17226](https://github.com/rapidsai/cudf/pull/17226)) [@mroeschke](https://github.com/mroeschke) +- Fix `Dataframe.__setitem__` slow-downs ([#17222](https://github.com/rapidsai/cudf/pull/17222)) [@galipremsagar](https://github.com/galipremsagar) +- Fix groupby.get_group with length-1 tuple with list-like grouper ([#17216](https://github.com/rapidsai/cudf/pull/17216)) [@mroeschke](https://github.com/mroeschke) +- Fix discoverability of submodules inside `pd.util` ([#17215](https://github.com/rapidsai/cudf/pull/17215)) [@galipremsagar](https://github.com/galipremsagar) +- Fix `Schema.Builder` does not propagate precision value to `Builder` instance ([#17214](https://github.com/rapidsai/cudf/pull/17214)) [@ttnghia](https://github.com/ttnghia) +- Mark column chunks in a PQ reader `pass` as large strings when the cumulative `offsets` exceeds the large strings threshold. ([#17207](https://github.com/rapidsai/cudf/pull/17207)) [@mhaseeb123](https://github.com/mhaseeb123) +- [BUG] Replace `repo_token` with `github_token` in Auto Assign PR GHA ([#17203](https://github.com/rapidsai/cudf/pull/17203)) [@Matt711](https://github.com/Matt711) +- Remove unsanitized nulls from input strings columns in reduction gtests ([#17202](https://github.com/rapidsai/cudf/pull/17202)) [@davidwendt](https://github.com/davidwendt) +- Fix ``to_parquet`` append behavior with global metadata file ([#17198](https://github.com/rapidsai/cudf/pull/17198)) [@rjzamora](https://github.com/rjzamora) +- Check `num_children() == 0` in `Column.from_column_view` ([#17193](https://github.com/rapidsai/cudf/pull/17193)) [@cwharris](https://github.com/cwharris) +- Fix host-to-device copy missing sync in strings/duration convert ([#17149](https://github.com/rapidsai/cudf/pull/17149)) [@davidwendt](https://github.com/davidwendt) +- Add JNI Support for Multi-line Delimiters and Include Test ([#17139](https://github.com/rapidsai/cudf/pull/17139)) [@SurajAralihalli](https://github.com/SurajAralihalli) +- Ignore loud dask warnings about legacy dataframe implementation ([#17137](https://github.com/rapidsai/cudf/pull/17137)) [@galipremsagar](https://github.com/galipremsagar) +- Fix the GDS read/write segfault/bus error when the cuFile policy is set to GDS or ALWAYS ([#17122](https://github.com/rapidsai/cudf/pull/17122)) [@kingcrimsontianyu](https://github.com/kingcrimsontianyu) +- Fix `DataFrame._from_arrays` and introduce validations ([#17112](https://github.com/rapidsai/cudf/pull/17112)) [@galipremsagar](https://github.com/galipremsagar) +- [Bug] Fix Arrow-FS parquet reader for larger files ([#17099](https://github.com/rapidsai/cudf/pull/17099)) [@rjzamora](https://github.com/rjzamora) +- Fix bug in recovering invalid lines in JSONL inputs ([#17098](https://github.com/rapidsai/cudf/pull/17098)) [@shrshi](https://github.com/shrshi) +- Reenable huge pages for arrow host copying ([#17097](https://github.com/rapidsai/cudf/pull/17097)) [@vyasr](https://github.com/vyasr) +- Correctly set `is_device_accesible` when creating `host_span`s 
from other container/span types ([#17079](https://github.com/rapidsai/cudf/pull/17079)) [@vuule](https://github.com/vuule) +- Fix ORC reader when using `device_read_async` while the destination device buffers are not ready ([#17074](https://github.com/rapidsai/cudf/pull/17074)) [@ttnghia](https://github.com/ttnghia) +- Fix regex handling of fixed quantifier with 0 range ([#17067](https://github.com/rapidsai/cudf/pull/17067)) [@davidwendt](https://github.com/davidwendt) +- Limit the number of keys to calculate column sizes and page starts in PQ reader to 1B ([#17059](https://github.com/rapidsai/cudf/pull/17059)) [@mhaseeb123](https://github.com/mhaseeb123) +- Adding assertion to check for regular JSON inputs of size greater than `INT_MAX` bytes ([#17057](https://github.com/rapidsai/cudf/pull/17057)) [@shrshi](https://github.com/shrshi) +- bug fix: use `self.ck_consumer` in `poll` method of kafka.py to align with `__init__` ([#17044](https://github.com/rapidsai/cudf/pull/17044)) [@a-hirota](https://github.com/a-hirota) +- Disable kvikio remote I/O to avoid openssl dependencies in JNI build ([#17026](https://github.com/rapidsai/cudf/pull/17026)) [@pxLi](https://github.com/pxLi) +- Fix `host_span` constructor to correctly copy `is_device_accessible` ([#17020](https://github.com/rapidsai/cudf/pull/17020)) [@vuule](https://github.com/vuule) +- Add pinning for pyarrow in wheels ([#17018](https://github.com/rapidsai/cudf/pull/17018)) [@vyasr](https://github.com/vyasr) +- Use std::optional for host types ([#17015](https://github.com/rapidsai/cudf/pull/17015)) [@robertmaynard](https://github.com/robertmaynard) +- Fix write_json to handle empty string column ([#16995](https://github.com/rapidsai/cudf/pull/16995)) [@karthikeyann](https://github.com/karthikeyann) +- Restore export of nvcomp outside of wheel builds ([#16988](https://github.com/rapidsai/cudf/pull/16988)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA) +- Allow melt(var_name=) to be a falsy label ([#16981](https://github.com/rapidsai/cudf/pull/16981)) [@mroeschke](https://github.com/mroeschke) +- Fix astype from tz-aware type to tz-aware type ([#16980](https://github.com/rapidsai/cudf/pull/16980)) [@mroeschke](https://github.com/mroeschke) +- Use `libcudf` wheel from PR rather than nightly for `polars-polars` CI test job ([#16975](https://github.com/rapidsai/cudf/pull/16975)) [@brandon-b-miller](https://github.com/brandon-b-miller) +- Fix order-preservation in pandas-compat unsorted groupby ([#16942](https://github.com/rapidsai/cudf/pull/16942)) [@wence-](https://github.com/wence-) +- Fix cudf::strings::findall error with empty input ([#16928](https://github.com/rapidsai/cudf/pull/16928)) [@davidwendt](https://github.com/davidwendt) +- Fix JsonLargeReaderTest.MultiBatch use of LIBCUDF_JSON_BATCH_SIZE env var ([#16927](https://github.com/rapidsai/cudf/pull/16927)) [@davidwendt](https://github.com/davidwendt) +- Parse newline as whitespace character while tokenizing JSONL inputs with non-newline delimiter ([#16923](https://github.com/rapidsai/cudf/pull/16923)) [@shrshi](https://github.com/shrshi) +- Respect groupby.nunique(dropna=False) ([#16921](https://github.com/rapidsai/cudf/pull/16921)) [@mroeschke](https://github.com/mroeschke) +- Update all rmm imports to use pylibrmm/librmm ([#16913](https://github.com/rapidsai/cudf/pull/16913)) [@Matt711](https://github.com/Matt711) +- Fix order-preservation in cudf-polars groupby ([#16907](https://github.com/rapidsai/cudf/pull/16907)) [@wence-](https://github.com/wence-) +- Add a shortcut for 
when the input clusters are all empty for the tdigest merge ([#16897](https://github.com/rapidsai/cudf/pull/16897)) [@jihoonson](https://github.com/jihoonson) +- Properly handle the mapped and registered regions in `memory_mapped_source` ([#16865](https://github.com/rapidsai/cudf/pull/16865)) [@vuule](https://github.com/vuule) +- Fix performance regression for generate_character_ngrams ([#16849](https://github.com/rapidsai/cudf/pull/16849)) [@davidwendt](https://github.com/davidwendt) +- Fix regex parsing logic handling of nested quantifiers ([#16798](https://github.com/rapidsai/cudf/pull/16798)) [@davidwendt](https://github.com/davidwendt) +- Compute whole column variance using numerically stable approach ([#16448](https://github.com/rapidsai/cudf/pull/16448)) [@wence-](https://github.com/wence-) + +## 📖 Documentation + +- Add documentation for low memory readers ([#17314](https://github.com/rapidsai/cudf/pull/17314)) [@btepera](https://github.com/btepera) +- Fix the example in documentation for `get_dremel_data()` ([#17242](https://github.com/rapidsai/cudf/pull/17242)) [@mhaseeb123](https://github.com/mhaseeb123) +- Fix some documentation rendering for pylibcudf ([#17217](https://github.com/rapidsai/cudf/pull/17217)) [@mroeschke](https://github.com/mroeschke) +- Move detail header floating_conversion.hpp to detail subdirectory ([#17209](https://github.com/rapidsai/cudf/pull/17209)) [@davidwendt](https://github.com/davidwendt) +- Add TokenizeVocabulary to api docs ([#17208](https://github.com/rapidsai/cudf/pull/17208)) [@davidwendt](https://github.com/davidwendt) +- Add jaccard_index to generated cuDF docs ([#17199](https://github.com/rapidsai/cudf/pull/17199)) [@davidwendt](https://github.com/davidwendt) +- [no ci] Add empty-columns section to the libcudf developer guide ([#17183](https://github.com/rapidsai/cudf/pull/17183)) [@davidwendt](https://github.com/davidwendt) +- Add 2-cpp approvers text to contributing guide [no ci] ([#17182](https://github.com/rapidsai/cudf/pull/17182)) [@davidwendt](https://github.com/davidwendt) +- Changing developer guide int_64_t to int64_t ([#17130](https://github.com/rapidsai/cudf/pull/17130)) [@hyperbolic2346](https://github.com/hyperbolic2346) +- docs: change 'CSV' to 'csv' in python/custreamz/README.md to match kafka.py ([#17041](https://github.com/rapidsai/cudf/pull/17041)) [@a-hirota](https://github.com/a-hirota) +- [DOC] Document limitation using `cudf.pandas` proxy arrays ([#16955](https://github.com/rapidsai/cudf/pull/16955)) [@Matt711](https://github.com/Matt711) +- [DOC] Document environment variable for failing on fallback in `cudf.pandas` ([#16932](https://github.com/rapidsai/cudf/pull/16932)) [@Matt711](https://github.com/Matt711) + +## 🚀 New Features + +- Add version config ([#17312](https://github.com/rapidsai/cudf/pull/17312)) [@vyasr](https://github.com/vyasr) +- Java JNI for Multiple contains ([#17281](https://github.com/rapidsai/cudf/pull/17281)) [@res-life](https://github.com/res-life) +- Add `cudf::calendrical_month_sequence` to pylibcudf ([#17277](https://github.com/rapidsai/cudf/pull/17277)) [@Matt711](https://github.com/Matt711) +- Raise errors on specific types of fallback in `cudf.pandas` ([#17268](https://github.com/rapidsai/cudf/pull/17268)) [@Matt711](https://github.com/Matt711) +- Add `catboost` to the third-party integration tests ([#17267](https://github.com/rapidsai/cudf/pull/17267)) [@Matt711](https://github.com/Matt711) +- Add type stubs for pylibcudf ([#17258](https://github.com/rapidsai/cudf/pull/17258)) 
[@wence-](https://github.com/wence-) +- Use pylibcudf contiguous split APIs in cudf python ([#17246](https://github.com/rapidsai/cudf/pull/17246)) [@Matt711](https://github.com/Matt711) +- Upgrade nvcomp to 4.1.0.6 ([#17201](https://github.com/rapidsai/cudf/pull/17201)) [@bdice](https://github.com/bdice) +- Added Arrow Interop Benchmarks ([#17194](https://github.com/rapidsai/cudf/pull/17194)) [@lamarrr](https://github.com/lamarrr) +- Rewrite Java API `Table.readJSON` to return the output from libcudf `read_json` directly ([#17180](https://github.com/rapidsai/cudf/pull/17180)) [@ttnghia](https://github.com/ttnghia) +- Support storing `precision` of decimal types in `Schema` class ([#17176](https://github.com/rapidsai/cudf/pull/17176)) [@ttnghia](https://github.com/ttnghia) +- Migrate CSV writer to pylibcudf ([#17163](https://github.com/rapidsai/cudf/pull/17163)) [@Matt711](https://github.com/Matt711) +- Add compute_shared_memory_aggs used by shared memory groupby ([#17162](https://github.com/rapidsai/cudf/pull/17162)) [@PointKernel](https://github.com/PointKernel) +- Added ast tree to simplify expression lifetime management ([#17156](https://github.com/rapidsai/cudf/pull/17156)) [@lamarrr](https://github.com/lamarrr) +- Add compute_mapping_indices used by shared memory groupby ([#17147](https://github.com/rapidsai/cudf/pull/17147)) [@PointKernel](https://github.com/PointKernel) +- Add remaining datetime APIs to pylibcudf ([#17143](https://github.com/rapidsai/cudf/pull/17143)) [@Matt711](https://github.com/Matt711) +- Added strings AST vs BINARY_OP benchmarks ([#17128](https://github.com/rapidsai/cudf/pull/17128)) [@lamarrr](https://github.com/lamarrr) +- Use `libcudf_exception_handler` throughout `pylibcudf.libcudf` ([#17109](https://github.com/rapidsai/cudf/pull/17109)) [@brandon-b-miller](https://github.com/brandon-b-miller) +- Include timezone file path in error message ([#17102](https://github.com/rapidsai/cudf/pull/17102)) [@bdice](https://github.com/bdice) +- Migrate NVText Byte Pair Encoding APIs to pylibcudf ([#17101](https://github.com/rapidsai/cudf/pull/17101)) [@Matt711](https://github.com/Matt711) +- Migrate NVText Tokenizing APIs to pylibcudf ([#17100](https://github.com/rapidsai/cudf/pull/17100)) [@Matt711](https://github.com/Matt711) +- Migrate NVtext subword tokenizing APIs to pylibcudf ([#17096](https://github.com/rapidsai/cudf/pull/17096)) [@Matt711](https://github.com/Matt711) +- Migrate NVText Stemming APIs to pylibcudf ([#17085](https://github.com/rapidsai/cudf/pull/17085)) [@Matt711](https://github.com/Matt711) +- Migrate NVText Replacing APIs to pylibcudf ([#17084](https://github.com/rapidsai/cudf/pull/17084)) [@Matt711](https://github.com/Matt711) +- Add IWYU to CI ([#17078](https://github.com/rapidsai/cudf/pull/17078)) [@vyasr](https://github.com/vyasr) +- `cudf-polars` string/numeric casting ([#17076](https://github.com/rapidsai/cudf/pull/17076)) [@brandon-b-miller](https://github.com/brandon-b-miller) +- Migrate NVText Normalizing APIs to Pylibcudf ([#17072](https://github.com/rapidsai/cudf/pull/17072)) [@Matt711](https://github.com/Matt711) +- Migrate remaining nvtext NGrams APIs to pylibcudf ([#17070](https://github.com/rapidsai/cudf/pull/17070)) [@Matt711](https://github.com/Matt711) +- Add profilers to CUDA 12 conda devcontainers ([#17066](https://github.com/rapidsai/cudf/pull/17066)) [@vyasr](https://github.com/vyasr) +- Add conda recipe for cudf-polars ([#17037](https://github.com/rapidsai/cudf/pull/17037)) [@bdice](https://github.com/bdice) +- Implement 
batch construction for strings columns ([#17035](https://github.com/rapidsai/cudf/pull/17035)) [@ttnghia](https://github.com/ttnghia) +- Add device aggregators used by shared memory groupby ([#17031](https://github.com/rapidsai/cudf/pull/17031)) [@PointKernel](https://github.com/PointKernel) +- Add optional column_order in JSON reader ([#17029](https://github.com/rapidsai/cudf/pull/17029)) [@karthikeyann](https://github.com/karthikeyann) +- Migrate Min Hashing APIs to pylibcudf ([#17021](https://github.com/rapidsai/cudf/pull/17021)) [@Matt711](https://github.com/Matt711) +- Reorganize `cudf_polars` expression code ([#17014](https://github.com/rapidsai/cudf/pull/17014)) [@brandon-b-miller](https://github.com/brandon-b-miller) +- Migrate nvtext jaccard API to pylibcudf ([#17007](https://github.com/rapidsai/cudf/pull/17007)) [@Matt711](https://github.com/Matt711) +- Migrate nvtext generate_ngrams APIs to pylibcudf ([#17006](https://github.com/rapidsai/cudf/pull/17006)) [@Matt711](https://github.com/Matt711) +- Control whether a file data source memory-maps the file with an environment variable ([#17004](https://github.com/rapidsai/cudf/pull/17004)) [@vuule](https://github.com/vuule) +- Switched BINARY_OP Benchmarks from GoogleBench to NVBench ([#16963](https://github.com/rapidsai/cudf/pull/16963)) [@lamarrr](https://github.com/lamarrr) +- [FEA] Report all unsupported operations for a query in cudf.polars ([#16960](https://github.com/rapidsai/cudf/pull/16960)) [@Matt711](https://github.com/Matt711) +- [FEA] Migrate nvtext/edit_distance APIs to pylibcudf ([#16957](https://github.com/rapidsai/cudf/pull/16957)) [@Matt711](https://github.com/Matt711) +- Switched AST benchmarks from GoogleBench to NVBench ([#16952](https://github.com/rapidsai/cudf/pull/16952)) [@lamarrr](https://github.com/lamarrr) +- Extend `device_scalar` to optionally use pinned bounce buffer ([#16947](https://github.com/rapidsai/cudf/pull/16947)) [@vuule](https://github.com/vuule) +- Implement `cudf-polars` chunked parquet reading ([#16944](https://github.com/rapidsai/cudf/pull/16944)) [@brandon-b-miller](https://github.com/brandon-b-miller) +- Expose streams in public round APIs ([#16925](https://github.com/rapidsai/cudf/pull/16925)) [@Matt711](https://github.com/Matt711) +- add telemetry setup to test ([#16924](https://github.com/rapidsai/cudf/pull/16924)) [@msarahan](https://github.com/msarahan) +- Add cudf::strings::contains_multiple ([#16900](https://github.com/rapidsai/cudf/pull/16900)) [@davidwendt](https://github.com/davidwendt) +- Made cudftestutil header-only and removed GTest dependency ([#16839](https://github.com/rapidsai/cudf/pull/16839)) [@lamarrr](https://github.com/lamarrr) +- Add an example to demonstrate multithreaded `read_parquet` pipelines ([#16828](https://github.com/rapidsai/cudf/pull/16828)) [@mhaseeb123](https://github.com/mhaseeb123) +- Implement `extract_datetime_component` in `libcudf`/`pylibcudf` ([#16776](https://github.com/rapidsai/cudf/pull/16776)) [@brandon-b-miller](https://github.com/brandon-b-miller) +- Add cudf::strings::find_re API ([#16742](https://github.com/rapidsai/cudf/pull/16742)) [@davidwendt](https://github.com/davidwendt) +- Migrate hashing operations to `pylibcudf` ([#15418](https://github.com/rapidsai/cudf/pull/15418)) [@brandon-b-miller](https://github.com/brandon-b-miller) + +## 🛠️ Improvements + +- Simplify serialization protocols ([#17552](https://github.com/rapidsai/cudf/pull/17552)) [@vyasr](https://github.com/vyasr) +- Add `pynvml` as a dependency for `dask-cudf` 
([#17386](https://github.com/rapidsai/cudf/pull/17386)) [@pentschev](https://github.com/pentschev) +- Enable unified memory by default in `cudf_polars` ([#17375](https://github.com/rapidsai/cudf/pull/17375)) [@galipremsagar](https://github.com/galipremsagar) +- Support polars 1.14 ([#17355](https://github.com/rapidsai/cudf/pull/17355)) [@wence-](https://github.com/wence-) +- Remove cudf._lib.quantiles in favor of inlining pylibcudf ([#17347](https://github.com/rapidsai/cudf/pull/17347)) [@mroeschke](https://github.com/mroeschke) +- Remove cudf._lib.labeling in favor of inlining pylibcudf ([#17346](https://github.com/rapidsai/cudf/pull/17346)) [@mroeschke](https://github.com/mroeschke) +- Remove cudf._lib.hash in favor of inlining pylibcudf ([#17345](https://github.com/rapidsai/cudf/pull/17345)) [@mroeschke](https://github.com/mroeschke) +- Remove cudf._lib.concat in favor of inlining pylibcudf ([#17344](https://github.com/rapidsai/cudf/pull/17344)) [@mroeschke](https://github.com/mroeschke) +- Extract ``GPUEngine`` config options at translation time ([#17339](https://github.com/rapidsai/cudf/pull/17339)) [@rjzamora](https://github.com/rjzamora) +- Update java datetime APIs to match CUDF. ([#17329](https://github.com/rapidsai/cudf/pull/17329)) [@revans2](https://github.com/revans2) +- Move strings url_decode benchmarks to nvbench ([#17328](https://github.com/rapidsai/cudf/pull/17328)) [@davidwendt](https://github.com/davidwendt) +- Move strings translate benchmarks to nvbench ([#17325](https://github.com/rapidsai/cudf/pull/17325)) [@davidwendt](https://github.com/davidwendt) +- Writing compressed output using JSON writer ([#17323](https://github.com/rapidsai/cudf/pull/17323)) [@shrshi](https://github.com/shrshi) +- Test the full matrix for polars and dask wheels on nightlies ([#17320](https://github.com/rapidsai/cudf/pull/17320)) [@vyasr](https://github.com/vyasr) +- Remove cudf._lib.avro in favor of inlining pylibcudf ([#17319](https://github.com/rapidsai/cudf/pull/17319)) [@mroeschke](https://github.com/mroeschke) +- Move cudf._lib.unary to cudf.core._internals ([#17318](https://github.com/rapidsai/cudf/pull/17318)) [@mroeschke](https://github.com/mroeschke) +- prefer wheel-provided libcudf.so in load_library(), use RTLD_LOCAL ([#17316](https://github.com/rapidsai/cudf/pull/17316)) [@jameslamb](https://github.com/jameslamb) +- Clean up misc, unneeded pylibcudf.libcudf in cudf._lib ([#17309](https://github.com/rapidsai/cudf/pull/17309)) [@mroeschke](https://github.com/mroeschke) +- Exclude nanoarrow and flatbuffers from installation ([#17308](https://github.com/rapidsai/cudf/pull/17308)) [@vyasr](https://github.com/vyasr) +- Update CI jobs to include Polars in nightlies and improve IWYU ([#17306](https://github.com/rapidsai/cudf/pull/17306)) [@vyasr](https://github.com/vyasr) +- Move strings repeat benchmarks to nvbench ([#17304](https://github.com/rapidsai/cudf/pull/17304)) [@davidwendt](https://github.com/davidwendt) +- Fix synchronization bug in bool parquet mukernels ([#17302](https://github.com/rapidsai/cudf/pull/17302)) [@pmattione-nvidia](https://github.com/pmattione-nvidia) +- Move strings replace benchmarks to nvbench ([#17301](https://github.com/rapidsai/cudf/pull/17301)) [@davidwendt](https://github.com/davidwendt) +- Support polars 1.13 ([#17299](https://github.com/rapidsai/cudf/pull/17299)) [@wence-](https://github.com/wence-) +- Replace FindcuFile with upstream FindCUDAToolkit support ([#17298](https://github.com/rapidsai/cudf/pull/17298))
[@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA) +- Expose stream-ordering in public transpose API ([#17294](https://github.com/rapidsai/cudf/pull/17294)) [@shrshi](https://github.com/shrshi) +- Replace workaround of JNI build with CUDF_KVIKIO_REMOTE_IO=OFF ([#17293](https://github.com/rapidsai/cudf/pull/17293)) [@pxLi](https://github.com/pxLi) +- cmake option: `CUDF_KVIKIO_REMOTE_IO` ([#17291](https://github.com/rapidsai/cudf/pull/17291)) [@madsbk](https://github.com/madsbk) +- Use more pylibcudf Python enums in cudf._lib ([#17288](https://github.com/rapidsai/cudf/pull/17288)) [@mroeschke](https://github.com/mroeschke) +- Use pylibcudf enums in cudf Python quantile ([#17287](https://github.com/rapidsai/cudf/pull/17287)) [@mroeschke](https://github.com/mroeschke) +- enforce wheel size limits, README formatting in CI ([#17284](https://github.com/rapidsai/cudf/pull/17284)) [@jameslamb](https://github.com/jameslamb) +- Use numba-cuda<0.0.18 ([#17280](https://github.com/rapidsai/cudf/pull/17280)) [@gmarkall](https://github.com/gmarkall) +- Add compute_column_expression to pylibcudf for transform.compute_column ([#17279](https://github.com/rapidsai/cudf/pull/17279)) [@mroeschke](https://github.com/mroeschke) +- Optimize distinct inner join to use set `find` instead of `retrieve` ([#17278](https://github.com/rapidsai/cudf/pull/17278)) [@PointKernel](https://github.com/PointKernel) +- remove WheelHelpers.cmake ([#17276](https://github.com/rapidsai/cudf/pull/17276)) [@jameslamb](https://github.com/jameslamb) +- Plumb pylibcudf datetime APIs through cudf python ([#17275](https://github.com/rapidsai/cudf/pull/17275)) [@Matt711](https://github.com/Matt711) +- Follow up making Python tests more deterministic ([#17272](https://github.com/rapidsai/cudf/pull/17272)) [@mroeschke](https://github.com/mroeschke) +- Use pylibcudf.search APIs in cudf python ([#17271](https://github.com/rapidsai/cudf/pull/17271)) [@Matt711](https://github.com/Matt711) +- Use `pylibcudf.strings.convert.convert_integers.is_integer` in cudf python ([#17270](https://github.com/rapidsai/cudf/pull/17270)) [@Matt711](https://github.com/Matt711) +- Move strings filter benchmarks to nvbench ([#17269](https://github.com/rapidsai/cudf/pull/17269)) [@davidwendt](https://github.com/davidwendt) +- Make constructor of DeviceMemoryBufferView public ([#17265](https://github.com/rapidsai/cudf/pull/17265)) [@liurenjie1024](https://github.com/liurenjie1024) +- Put a ceiling on cuda-python ([#17264](https://github.com/rapidsai/cudf/pull/17264)) [@jameslamb](https://github.com/jameslamb) +- Always prefer `device_read`s and `device_write`s when kvikIO is enabled ([#17260](https://github.com/rapidsai/cudf/pull/17260)) [@vuule](https://github.com/vuule) +- Expose streams in public quantile APIs ([#17257](https://github.com/rapidsai/cudf/pull/17257)) [@shrshi](https://github.com/shrshi) +- Add support for `pyarrow-18` ([#17256](https://github.com/rapidsai/cudf/pull/17256)) [@galipremsagar](https://github.com/galipremsagar) +- Move strings/numeric convert benchmarks to nvbench ([#17255](https://github.com/rapidsai/cudf/pull/17255)) [@davidwendt](https://github.com/davidwendt) +- Add new ``dask_cudf.read_parquet`` API ([#17250](https://github.com/rapidsai/cudf/pull/17250)) [@rjzamora](https://github.com/rjzamora) +- Add read_parquet_metadata to pylibcudf ([#17245](https://github.com/rapidsai/cudf/pull/17245)) [@mroeschke](https://github.com/mroeschke) +- Search for kvikio with lowercase ([#17243](https://github.com/rapidsai/cudf/pull/17243)) 
[@vyasr](https://github.com/vyasr) +- KvikIO shared library ([#17239](https://github.com/rapidsai/cudf/pull/17239)) [@madsbk](https://github.com/madsbk) +- Use more pylibcudf.io.types enums in cudf._libs ([#17237](https://github.com/rapidsai/cudf/pull/17237)) [@mroeschke](https://github.com/mroeschke) +- Expose mixed and conditional joins in pylibcudf ([#17235](https://github.com/rapidsai/cudf/pull/17235)) [@wence-](https://github.com/wence-) +- Add io.text APIs to pylibcudf ([#17232](https://github.com/rapidsai/cudf/pull/17232)) [@mroeschke](https://github.com/mroeschke) +- Add `num_iterations` axis to the multi-threaded Parquet benchmarks ([#17231](https://github.com/rapidsai/cudf/pull/17231)) [@vuule](https://github.com/vuule) +- Move strings to date/time types benchmarks to nvbench ([#17229](https://github.com/rapidsai/cudf/pull/17229)) [@davidwendt](https://github.com/davidwendt) +- Support for polars 1.12 in cudf-polars ([#17227](https://github.com/rapidsai/cudf/pull/17227)) [@wence-](https://github.com/wence-) +- Allow generating large strings in benchmarks ([#17224](https://github.com/rapidsai/cudf/pull/17224)) [@davidwendt](https://github.com/davidwendt) +- Refactor gather/scatter benchmarks for strings ([#17223](https://github.com/rapidsai/cudf/pull/17223)) [@davidwendt](https://github.com/davidwendt) +- Deprecate single component extraction methods in libcudf ([#17221](https://github.com/rapidsai/cudf/pull/17221)) [@Matt711](https://github.com/Matt711) +- Remove `nvtext::load_vocabulary` from pylibcudf ([#17220](https://github.com/rapidsai/cudf/pull/17220)) [@Matt711](https://github.com/Matt711) +- Benchmarking JSON reader for compressed inputs ([#17219](https://github.com/rapidsai/cudf/pull/17219)) [@shrshi](https://github.com/shrshi) +- Expose stream-ordering in partitioning API ([#17213](https://github.com/rapidsai/cudf/pull/17213)) [@shrshi](https://github.com/shrshi) +- Move strings::concatenate benchmark to nvbench ([#17211](https://github.com/rapidsai/cudf/pull/17211)) [@davidwendt](https://github.com/davidwendt) +- Expose stream-ordering in subword tokenizer API ([#17206](https://github.com/rapidsai/cudf/pull/17206)) [@shrshi](https://github.com/shrshi) +- Refactor Dask cuDF legacy code ([#17205](https://github.com/rapidsai/cudf/pull/17205)) [@rjzamora](https://github.com/rjzamora) +- Make HostMemoryBuffer call into the DefaultHostMemoryAllocator ([#17204](https://github.com/rapidsai/cudf/pull/17204)) [@revans2](https://github.com/revans2) +- Unified binary_ops and ast benchmarks parameter names ([#17200](https://github.com/rapidsai/cudf/pull/17200)) [@lamarrr](https://github.com/lamarrr) +- Add in new java API for raw host memory allocation ([#17197](https://github.com/rapidsai/cudf/pull/17197)) [@revans2](https://github.com/revans2) +- Remove java reservation ([#17189](https://github.com/rapidsai/cudf/pull/17189)) [@revans2](https://github.com/revans2) +- Fixed unused attribute compilation error for GCC 13 ([#17188](https://github.com/rapidsai/cudf/pull/17188)) [@lamarrr](https://github.com/lamarrr) +- Change default KvikIO parameters in cuDF: set the thread pool size to 4, and compatibility mode to ON ([#17185](https://github.com/rapidsai/cudf/pull/17185)) [@kingcrimsontianyu](https://github.com/kingcrimsontianyu) +- Use make_device_uvector instead of cudaMemcpyAsync in inplace_bitmask_binop ([#17181](https://github.com/rapidsai/cudf/pull/17181)) [@davidwendt](https://github.com/davidwendt) +- Make ai.rapids.cudf.HostMemoryBuffer#copyFromStream public. 
([#17179](https://github.com/rapidsai/cudf/pull/17179)) [@liurenjie1024](https://github.com/liurenjie1024) +- Separate evaluation logic from `IR` objects in cudf-polars ([#17175](https://github.com/rapidsai/cudf/pull/17175)) [@rjzamora](https://github.com/rjzamora) +- Move nvtext ngrams benchmarks to nvbench ([#17173](https://github.com/rapidsai/cudf/pull/17173)) [@davidwendt](https://github.com/davidwendt) +- Remove includes suggested by include-what-you-use ([#17170](https://github.com/rapidsai/cudf/pull/17170)) [@vyasr](https://github.com/vyasr) +- Reading multi-source compressed JSONL files ([#17161](https://github.com/rapidsai/cudf/pull/17161)) [@shrshi](https://github.com/shrshi) +- Process parquet bools with microkernels ([#17157](https://github.com/rapidsai/cudf/pull/17157)) [@pmattione-nvidia](https://github.com/pmattione-nvidia) +- Upgrade to polars 1.11 in cudf-polars ([#17154](https://github.com/rapidsai/cudf/pull/17154)) [@wence-](https://github.com/wence-) +- Deprecate current libcudf nvtext minhash functions ([#17152](https://github.com/rapidsai/cudf/pull/17152)) [@davidwendt](https://github.com/davidwendt) +- Remove unused variable in internal merge_tdigests utility ([#17151](https://github.com/rapidsai/cudf/pull/17151)) [@davidwendt](https://github.com/davidwendt) +- Use the full ref name of `rmm.DeviceBuffer` in the sphinx config file ([#17150](https://github.com/rapidsai/cudf/pull/17150)) [@Matt711](https://github.com/Matt711) +- Move `segmented_gather` function from the copying module to the lists module ([#17148](https://github.com/rapidsai/cudf/pull/17148)) [@Matt711](https://github.com/Matt711) +- Use async execution policy for true_if ([#17146](https://github.com/rapidsai/cudf/pull/17146)) [@PointKernel](https://github.com/PointKernel) +- Add conversion from cudf-polars expressions to libcudf ast for parquet filters ([#17141](https://github.com/rapidsai/cudf/pull/17141)) [@wence-](https://github.com/wence-) +- devcontainer: replace `VAULT_HOST` with `AWS_ROLE_ARN` ([#17134](https://github.com/rapidsai/cudf/pull/17134)) [@jjacobelli](https://github.com/jjacobelli) +- Replace direct `cudaMemcpyAsync` calls with utility functions (limited to `cudf::io`) ([#17132](https://github.com/rapidsai/cudf/pull/17132)) [@vuule](https://github.com/vuule) +- use rapids-generate-pip-constraints to pin to oldest dependencies in CI ([#17131](https://github.com/rapidsai/cudf/pull/17131)) [@jameslamb](https://github.com/jameslamb) +- Set the default number of threads in KvikIO thread pool to 8 ([#17126](https://github.com/rapidsai/cudf/pull/17126)) [@kingcrimsontianyu](https://github.com/kingcrimsontianyu) +- Fix clang-tidy violations for span.hpp and hostdevice_vector.hpp ([#17124](https://github.com/rapidsai/cudf/pull/17124)) [@davidwendt](https://github.com/davidwendt) +- Disable the Parquet reader's wide lists tables GTest by default ([#17120](https://github.com/rapidsai/cudf/pull/17120)) [@mhaseeb123](https://github.com/mhaseeb123) +- Add compile time check to ensure the `counting_iterator` type in `counting_transform_iterator` fits in `size_type` ([#17118](https://github.com/rapidsai/cudf/pull/17118)) [@mhaseeb123](https://github.com/mhaseeb123) +- Minor I/O code quality improvements ([#17105](https://github.com/rapidsai/cudf/pull/17105)) [@kingcrimsontianyu](https://github.com/kingcrimsontianyu) +- Remove the additional host register calls initially intended for performance improvement on Grace Hopper ([#17092](https://github.com/rapidsai/cudf/pull/17092)) 
[@kingcrimsontianyu](https://github.com/kingcrimsontianyu) +- Split hash-based groupby into multiple smaller files to reduce build time ([#17089](https://github.com/rapidsai/cudf/pull/17089)) [@PointKernel](https://github.com/PointKernel) +- build wheels without build isolation ([#17088](https://github.com/rapidsai/cudf/pull/17088)) [@jameslamb](https://github.com/jameslamb) +- Polars: DataFrame Serialization ([#17062](https://github.com/rapidsai/cudf/pull/17062)) [@madsbk](https://github.com/madsbk) +- Remove unused hash helper functions ([#17056](https://github.com/rapidsai/cudf/pull/17056)) [@PointKernel](https://github.com/PointKernel) +- Add to_dlpack/from_dlpack APIs to pylibcudf ([#17055](https://github.com/rapidsai/cudf/pull/17055)) [@mroeschke](https://github.com/mroeschke) +- Move `flatten_single_pass_aggs` to its own TU ([#17053](https://github.com/rapidsai/cudf/pull/17053)) [@PointKernel](https://github.com/PointKernel) +- Replace deprecated cuco APIs with updated versions ([#17052](https://github.com/rapidsai/cudf/pull/17052)) [@PointKernel](https://github.com/PointKernel) +- Refactor ORC dictionary encoding to migrate to the new `cuco::static_map` ([#17049](https://github.com/rapidsai/cudf/pull/17049)) [@mhaseeb123](https://github.com/mhaseeb123) +- Move pylibcudf/libcudf/wrappers/decimals to pylibcudf/libcudf/fixed_point ([#17048](https://github.com/rapidsai/cudf/pull/17048)) [@mroeschke](https://github.com/mroeschke) +- make conda installs in CI stricter (part 2) ([#17042](https://github.com/rapidsai/cudf/pull/17042)) [@jameslamb](https://github.com/jameslamb) +- Use managed memory for NDSH benchmarks ([#17039](https://github.com/rapidsai/cudf/pull/17039)) [@karthikeyann](https://github.com/karthikeyann) +- Clean up hash-groupby `var_hash_functor` ([#17034](https://github.com/rapidsai/cudf/pull/17034)) [@PointKernel](https://github.com/PointKernel) +- Add json APIs to pylibcudf ([#17025](https://github.com/rapidsai/cudf/pull/17025)) [@mroeschke](https://github.com/mroeschke) +- Add string.replace_re APIs to pylibcudf ([#17023](https://github.com/rapidsai/cudf/pull/17023)) [@mroeschke](https://github.com/mroeschke) +- Replace old host tree algorithm with new algorithm in JSON reader ([#17019](https://github.com/rapidsai/cudf/pull/17019)) [@karthikeyann](https://github.com/karthikeyann) +- Unify treatment of `Expr` and `IR` nodes in cudf-polars DSL ([#17016](https://github.com/rapidsai/cudf/pull/17016)) [@wence-](https://github.com/wence-) +- make conda installs in CI stricter ([#17013](https://github.com/rapidsai/cudf/pull/17013)) [@jameslamb](https://github.com/jameslamb) +- Pylibcudf: pack and unpack ([#17012](https://github.com/rapidsai/cudf/pull/17012)) [@madsbk](https://github.com/madsbk) +- Remove unneeded pylibcudf.libcudf.wrappers.duration usage in cudf ([#17010](https://github.com/rapidsai/cudf/pull/17010)) [@mroeschke](https://github.com/mroeschke) +- Add custom "fused" groupby aggregation to Dask cuDF ([#17009](https://github.com/rapidsai/cudf/pull/17009)) [@rjzamora](https://github.com/rjzamora) +- Make tests more deterministic ([#17008](https://github.com/rapidsai/cudf/pull/17008)) [@galipremsagar](https://github.com/galipremsagar) +- Remove unused import ([#17005](https://github.com/rapidsai/cudf/pull/17005)) [@Matt711](https://github.com/Matt711) +- Add string.convert.convert_urls APIs to pylibcudf ([#17003](https://github.com/rapidsai/cudf/pull/17003)) [@mroeschke](https://github.com/mroeschke) +- Add release tracking to project automation scripts 
([#17001](https://github.com/rapidsai/cudf/pull/17001)) [@jarmak-nv](https://github.com/jarmak-nv) +- Implement inequality joins by translation to conditional joins ([#17000](https://github.com/rapidsai/cudf/pull/17000)) [@wence-](https://github.com/wence-) +- Add string.convert.convert_lists APIs to pylibcudf ([#16997](https://github.com/rapidsai/cudf/pull/16997)) [@mroeschke](https://github.com/mroeschke) +- Performance optimization of JSON validation ([#16996](https://github.com/rapidsai/cudf/pull/16996)) [@karthikeyann](https://github.com/karthikeyann) +- Add string.convert.convert_ipv4 APIs to pylibcudf ([#16994](https://github.com/rapidsai/cudf/pull/16994)) [@mroeschke](https://github.com/mroeschke) +- Add string.convert.convert_integers APIs to pylibcudf ([#16991](https://github.com/rapidsai/cudf/pull/16991)) [@mroeschke](https://github.com/mroeschke) +- Add string.convert_floats APIs to pylibcudf ([#16990](https://github.com/rapidsai/cudf/pull/16990)) [@mroeschke](https://github.com/mroeschke) +- Add string.convert.convert_fixed_type APIs to pylibcudf ([#16984](https://github.com/rapidsai/cudf/pull/16984)) [@mroeschke](https://github.com/mroeschke) +- Remove unnecessary `std::move`'s in pylibcudf ([#16983](https://github.com/rapidsai/cudf/pull/16983)) [@Matt711](https://github.com/Matt711) +- Add docstrings and test for strings.convert_durations APIs for pylibcudf ([#16982](https://github.com/rapidsai/cudf/pull/16982)) [@mroeschke](https://github.com/mroeschke) +- JSON tokenizer memory optimizations ([#16978](https://github.com/rapidsai/cudf/pull/16978)) [@shrshi](https://github.com/shrshi) +- Turn on `xfail_strict = true` for all python packages ([#16977](https://github.com/rapidsai/cudf/pull/16977)) [@wence-](https://github.com/wence-) +- Add string.convert.convert_datetime/convert_booleans APIs to pylibcudf ([#16971](https://github.com/rapidsai/cudf/pull/16971)) [@mroeschke](https://github.com/mroeschke) +- Auto assign PR to author ([#16969](https://github.com/rapidsai/cudf/pull/16969)) [@Matt711](https://github.com/Matt711) +- Deprecate support for directly accessing logger ([#16964](https://github.com/rapidsai/cudf/pull/16964)) [@vyasr](https://github.com/vyasr) +- Expunge NamedColumn ([#16962](https://github.com/rapidsai/cudf/pull/16962)) [@wence-](https://github.com/wence-) +- Add clang-tidy to CI ([#16958](https://github.com/rapidsai/cudf/pull/16958)) [@vyasr](https://github.com/vyasr) +- Address all remaining clang-tidy errors ([#16956](https://github.com/rapidsai/cudf/pull/16956)) [@vyasr](https://github.com/vyasr) +- Apply clang-tidy autofixes ([#16949](https://github.com/rapidsai/cudf/pull/16949)) [@vyasr](https://github.com/vyasr) +- Use nvcomp wheel instead of bundling nvcomp ([#16946](https://github.com/rapidsai/cudf/pull/16946)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA) +- Refactor the `cuda_memcpy` functions to make them more usable ([#16945](https://github.com/rapidsai/cudf/pull/16945)) [@vuule](https://github.com/vuule) +- Add string.split APIs to pylibcudf ([#16940](https://github.com/rapidsai/cudf/pull/16940)) [@mroeschke](https://github.com/mroeschke) +- clang-tidy fixes part 3 ([#16939](https://github.com/rapidsai/cudf/pull/16939)) [@vyasr](https://github.com/vyasr) +- clang-tidy fixes part 2 ([#16938](https://github.com/rapidsai/cudf/pull/16938)) [@vyasr](https://github.com/vyasr) +- clang-tidy fixes part 1 ([#16937](https://github.com/rapidsai/cudf/pull/16937)) [@vyasr](https://github.com/vyasr) +- Add string.wrap APIs to pylibcudf 
([#16935](https://github.com/rapidsai/cudf/pull/16935)) [@mroeschke](https://github.com/mroeschke) +- Add string.translate APIs to pylibcudf ([#16934](https://github.com/rapidsai/cudf/pull/16934)) [@mroeschke](https://github.com/mroeschke) +- Add string.find_multiple APIs to pylibcudf ([#16920](https://github.com/rapidsai/cudf/pull/16920)) [@mroeschke](https://github.com/mroeschke) +- Batch memcpy the last offsets for output buffers of str and list cols in PQ reader ([#16905](https://github.com/rapidsai/cudf/pull/16905)) [@mhaseeb123](https://github.com/mhaseeb123) +- reduce wheel build verbosity, narrow deprecation warning filter ([#16896](https://github.com/rapidsai/cudf/pull/16896)) [@jameslamb](https://github.com/jameslamb) +- Improve aggregation device functors ([#16884](https://github.com/rapidsai/cudf/pull/16884)) [@PointKernel](https://github.com/PointKernel) +- Upgrade pandas pinnings to support `2.2.3` ([#16882](https://github.com/rapidsai/cudf/pull/16882)) [@galipremsagar](https://github.com/galipremsagar) +- Fix 24.10 to 24.12 forward merge ([#16876](https://github.com/rapidsai/cudf/pull/16876)) [@bdice](https://github.com/bdice) +- Manually resolve conflicts in between branch-24.12 and branch-24.10 ([#16871](https://github.com/rapidsai/cudf/pull/16871)) [@galipremsagar](https://github.com/galipremsagar) +- Add in support for setting delim when parsing JSON through java ([#16867](https://github.com/rapidsai/cudf/pull/16867)) [@revans2](https://github.com/revans2) +- Reapply `mixed_semi_join` refactoring and bug fixes ([#16859](https://github.com/rapidsai/cudf/pull/16859)) [@mhaseeb123](https://github.com/mhaseeb123) +- Add string padding and side_type APIs to pylibcudf ([#16833](https://github.com/rapidsai/cudf/pull/16833)) [@mroeschke](https://github.com/mroeschke) +- Organize parquet reader mukernel non-nullable code, introduce manual block scans ([#16830](https://github.com/rapidsai/cudf/pull/16830)) [@pmattione-nvidia](https://github.com/pmattione-nvidia) +- Remove superfluous use of std::vector for std::future ([#16829](https://github.com/rapidsai/cudf/pull/16829)) [@kingcrimsontianyu](https://github.com/kingcrimsontianyu) +- Rework `read_csv` IO to avoid reading whole input with a single `host_read` ([#16826](https://github.com/rapidsai/cudf/pull/16826)) [@vuule](https://github.com/vuule) +- Add strings.combine APIs to pylibcudf ([#16790](https://github.com/rapidsai/cudf/pull/16790)) [@mroeschke](https://github.com/mroeschke) +- Add remaining string.char_types APIs to pylibcudf ([#16788](https://github.com/rapidsai/cudf/pull/16788)) [@mroeschke](https://github.com/mroeschke) +- Add new nvtext minhash_permuted API ([#16756](https://github.com/rapidsai/cudf/pull/16756)) [@davidwendt](https://github.com/davidwendt) +- Avoid public constructors when called with columns to avoid unnecessary validation ([#16747](https://github.com/rapidsai/cudf/pull/16747)) [@mroeschke](https://github.com/mroeschke) +- Use `changed-files` shared workflow ([#16713](https://github.com/rapidsai/cudf/pull/16713)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA) +- lint: replace `isort` with Ruff's rule I ([#16685](https://github.com/rapidsai/cudf/pull/16685)) [@Borda](https://github.com/Borda) +- Improve the performance of low cardinality groupby ([#16619](https://github.com/rapidsai/cudf/pull/16619)) [@PointKernel](https://github.com/PointKernel) +- Parquet reader list microkernel ([#16538](https://github.com/rapidsai/cudf/pull/16538)) [@pmattione-nvidia](https://github.com/pmattione-nvidia) 
+- AWS S3 IO through KvikIO ([#16499](https://github.com/rapidsai/cudf/pull/16499)) [@madsbk](https://github.com/madsbk) +- Refactor `histogram` reduction using `cuco::static_set::insert_and_find` ([#16485](https://github.com/rapidsai/cudf/pull/16485)) [@srinivasyadav18](https://github.com/srinivasyadav18) +- Use numba-cuda>=0.0.13 ([#16474](https://github.com/rapidsai/cudf/pull/16474)) [@gmarkall](https://github.com/gmarkall) + # cudf 24.10.00 (9 Oct 2024) ## 🚨 Breaking Changes diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index f9cdde7c2b7..3db1ed35294 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -38,6 +38,7 @@ conduct. More information can be found at: 8. Verify that CI passes all [status checks](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/collaborating-on-repositories-with-code-quality-features/about-status-checks). Fix if needed. 9. Wait for other developers to review your code and update code as needed. + Changes to any C++ files require at least 2 approvals from the cudf-cpp-codeowners before merging. 10. Once reviewed and approved, a RAPIDS developer will merge your pull request. If you are unsure about anything, don't hesitate to comment on issues and ask for clarification! @@ -293,8 +294,8 @@ In order to run doxygen as a linter on C++/CUDA code, run ./ci/checks/doxygen.sh ``` -Python code runs several linters including [Black](https://black.readthedocs.io/en/stable/), -[isort](https://pycqa.github.io/isort/), and [flake8](https://flake8.pycqa.org/en/latest/). +Python code runs several linters including [Ruff](https://docs.astral.sh/ruff/) +with its various rules for Black-like formatting and isort-like import sorting. cuDF also uses [codespell](https://github.com/codespell-project/codespell) to find spelling mistakes, and this check is run as a pre-commit hook. To apply the suggested spelling fixes, diff --git a/README.md b/README.md index 8f8c2adac2f..169d2e4eded 100644 --- a/README.md +++ b/README.md @@ -83,7 +83,7 @@ cuDF can be installed with conda (via [miniforge](https://github.com/conda-forge ```bash conda install -c rapidsai -c conda-forge -c nvidia \ - cudf=24.10 python=3.12 cuda-version=12.5 + cudf=24.12 python=3.12 cuda-version=12.5 ``` We also provide [nightly Conda packages](https://anaconda.org/rapidsai-nightly) built from the HEAD diff --git a/VERSION b/VERSION index d5450f5b0c3..af28c42b528 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -24.10.01 +24.12.00 diff --git a/ci/build_cpp.sh b/ci/build_cpp.sh index e5fcef17a83..3d06eacf9ff 100755 --- a/ci/build_cpp.sh +++ b/ci/build_cpp.sh @@ -15,8 +15,12 @@ rapids-print-env rapids-logger "Begin cpp build" +sccache --zero-stats + # With boa installed conda build forwards to boa RAPIDS_PACKAGE_VERSION=$(rapids-generate-version) rapids-conda-retry mambabuild \ conda/recipes/libcudf +sccache --show-adv-stats + rapids-upload-conda-to-s3 cpp diff --git a/ci/build_docs.sh b/ci/build_docs.sh index c67d127e635..4290d013fe4 100755 --- a/ci/build_docs.sh +++ b/ci/build_docs.sh @@ -5,7 +5,6 @@ set -euo pipefail export RAPIDS_VERSION="$(rapids-version)" export RAPIDS_VERSION_MAJOR_MINOR="$(rapids-version-major-minor)" -export RAPIDS_VERSION_NUMBER="$RAPIDS_VERSION_MAJOR_MINOR" rapids-logger "Create test conda environment" . 
/opt/conda/etc/profile.d/conda.sh @@ -29,13 +28,16 @@ PYTHON_CHANNEL=$(rapids-download-conda-from-s3 python) rapids-mamba-retry install \ --channel "${CPP_CHANNEL}" \ --channel "${PYTHON_CHANNEL}" \ - libcudf pylibcudf cudf dask-cudf + "libcudf=${RAPIDS_VERSION}" \ + "pylibcudf=${RAPIDS_VERSION}" \ + "cudf=${RAPIDS_VERSION}" \ + "dask-cudf=${RAPIDS_VERSION}" export RAPIDS_DOCS_DIR="$(mktemp -d)" rapids-logger "Build CPP docs" pushd cpp/doxygen -aws s3 cp s3://rapidsai-docs/librmm/html/${RAPIDS_VERSION_NUMBER}/rmm.tag . || echo "Failed to download rmm Doxygen tag" +aws s3 cp s3://rapidsai-docs/librmm/html/${RAPIDS_VERSION_MAJOR_MINOR}/rmm.tag . || echo "Failed to download rmm Doxygen tag" doxygen Doxyfile mkdir -p "${RAPIDS_DOCS_DIR}/libcudf/html" mv html/* "${RAPIDS_DOCS_DIR}/libcudf/html" @@ -55,4 +57,4 @@ mkdir -p "${RAPIDS_DOCS_DIR}/dask-cudf/html" mv build/dirhtml/* "${RAPIDS_DOCS_DIR}/dask-cudf/html" popd -rapids-upload-docs +RAPIDS_VERSION_NUMBER="${RAPIDS_VERSION_MAJOR_MINOR}" rapids-upload-docs diff --git a/ci/build_python.sh b/ci/build_python.sh index 2e3f70ba767..ed90041cc77 100755 --- a/ci/build_python.sh +++ b/ci/build_python.sh @@ -19,6 +19,8 @@ rapids-logger "Begin py build" CPP_CHANNEL=$(rapids-download-conda-from-s3 cpp) +sccache --zero-stats + # TODO: Remove `--no-test` flag once importing on a CPU # node works correctly # With boa installed conda build forwards to the boa builder @@ -28,12 +30,18 @@ RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild \ --channel "${CPP_CHANNEL}" \ conda/recipes/pylibcudf +sccache --show-adv-stats +sccache --zero-stats + RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild \ --no-test \ --channel "${CPP_CHANNEL}" \ --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \ conda/recipes/cudf +sccache --show-adv-stats +sccache --zero-stats + RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild \ --no-test \ --channel "${CPP_CHANNEL}" \ @@ -46,11 +54,18 @@ RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild \ --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \ conda/recipes/cudf_kafka +sccache --show-adv-stats + RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild \ --no-test \ --channel "${CPP_CHANNEL}" \ --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \ conda/recipes/custreamz +RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild \ + --no-test \ + --channel "${CPP_CHANNEL}" \ + --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \ + conda/recipes/cudf-polars rapids-upload-conda-to-s3 python diff --git a/ci/build_wheel.sh b/ci/build_wheel.sh index 7c1fa705faa..78b8a8a08cf 100755 --- a/ci/build_wheel.sh +++ b/ci/build_wheel.sh @@ -3,7 +3,8 @@ set -euo pipefail -package_dir=$1 +package_name=$1 +package_dir=$2 source rapids-configure-sccache source rapids-date-string @@ -12,4 +13,14 @@ rapids-generate-version > ./VERSION cd "${package_dir}" -python -m pip wheel . -w dist -vvv --no-deps --disable-pip-version-check +sccache --zero-stats + +rapids-logger "Building '${package_name}' wheel" +python -m pip wheel \ + -w dist \ + -v \ + --no-deps \ + --disable-pip-version-check \ + . 
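+# report the sccache hit rate for this wheel build (counters were zeroed above with 'sccache --zero-stats')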
+ +sccache --show-adv-stats diff --git a/ci/build_wheel_cudf.sh b/ci/build_wheel_cudf.sh index fb93b06dbe2..32dd5a7fa62 100755 --- a/ci/build_wheel_cudf.sh +++ b/ci/build_wheel_cudf.sh @@ -18,12 +18,15 @@ echo "libcudf-${RAPIDS_PY_CUDA_SUFFIX} @ file://$(echo /tmp/libcudf_dist/libcudf echo "pylibcudf-${RAPIDS_PY_CUDA_SUFFIX} @ file://$(echo /tmp/pylibcudf_dist/pylibcudf_*.whl)" >> /tmp/constraints.txt export PIP_CONSTRAINT="/tmp/constraints.txt" -./ci/build_wheel.sh ${package_dir} +./ci/build_wheel.sh cudf ${package_dir} python -m auditwheel repair \ --exclude libcudf.so \ --exclude libnvcomp.so \ + --exclude libkvikio.so \ -w ${package_dir}/final_dist \ ${package_dir}/dist/* +./ci/validate_wheel.sh ${package_dir} final_dist + RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 python ${package_dir}/final_dist diff --git a/ci/build_wheel_cudf_polars.sh b/ci/build_wheel_cudf_polars.sh index 9c945e11c00..38048125247 100755 --- a/ci/build_wheel_cudf_polars.sh +++ b/ci/build_wheel_cudf_polars.sh @@ -5,7 +5,8 @@ set -euo pipefail package_dir="python/cudf_polars" -./ci/build_wheel.sh ${package_dir} +./ci/build_wheel.sh cudf-polars ${package_dir} +./ci/validate_wheel.sh ${package_dir} dist RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" -RAPIDS_PY_WHEEL_NAME="cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-upload-wheels-to-s3 ${package_dir}/dist +RAPIDS_PY_WHEEL_NAME="cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-upload-wheels-to-s3 python ${package_dir}/dist diff --git a/ci/build_wheel_dask_cudf.sh b/ci/build_wheel_dask_cudf.sh index eb2a91289f7..b0ae2f23abc 100755 --- a/ci/build_wheel_dask_cudf.sh +++ b/ci/build_wheel_dask_cudf.sh @@ -5,7 +5,8 @@ set -euo pipefail package_dir="python/dask_cudf" -./ci/build_wheel.sh ${package_dir} +./ci/build_wheel.sh dask-cudf ${package_dir} +./ci/validate_wheel.sh ${package_dir} dist RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" -RAPIDS_PY_WHEEL_NAME="dask_cudf_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-upload-wheels-to-s3 ${package_dir}/dist +RAPIDS_PY_WHEEL_NAME="dask_cudf_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-upload-wheels-to-s3 python ${package_dir}/dist diff --git a/ci/build_wheel_libcudf.sh b/ci/build_wheel_libcudf.sh index 8975381ceba..af49942c8cd 100755 --- a/ci/build_wheel_libcudf.sh +++ b/ci/build_wheel_libcudf.sh @@ -3,13 +3,40 @@ set -euo pipefail +package_name="libcudf" package_dir="python/libcudf" -./ci/build_wheel.sh ${package_dir} +rapids-logger "Generating build requirements" + +rapids-dependency-file-generator \ + --output requirements \ + --file-key "py_build_${package_name}" \ + --file-key "py_rapids_build_${package_name}" \ + --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION};cuda_suffixed=true" \ +| tee /tmp/requirements-build.txt + +rapids-logger "Installing build requirements" +python -m pip install \ + -v \ + --prefer-binary \ + -r /tmp/requirements-build.txt + +# build with '--no-build-isolation', for better sccache hit rate +# 0 really means "add --no-build-isolation" (ref: https://github.com/pypa/pip/issues/5735) +export PIP_NO_BUILD_ISOLATION=0 + +export SKBUILD_CMAKE_ARGS="-DUSE_NVCOMP_RUNTIME_WHEEL=ON" +./ci/build_wheel.sh "${package_name}" "${package_dir}" RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" mkdir -p ${package_dir}/final_dist -python -m auditwheel repair -w ${package_dir}/final_dist 
${package_dir}/dist/* +python -m auditwheel repair \ + --exclude libnvcomp.so.4 \ + --exclude libkvikio.so \ + -w ${package_dir}/final_dist \ + ${package_dir}/dist/* + +./ci/validate_wheel.sh ${package_dir} final_dist -RAPIDS_PY_WHEEL_NAME="libcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 cpp ${package_dir}/final_dist +RAPIDS_PY_WHEEL_NAME="${package_name}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 cpp "${package_dir}/final_dist" diff --git a/ci/build_wheel_pylibcudf.sh b/ci/build_wheel_pylibcudf.sh index 5e9f7f8a0c4..5a8f3397714 100755 --- a/ci/build_wheel_pylibcudf.sh +++ b/ci/build_wheel_pylibcudf.sh @@ -16,12 +16,15 @@ RAPIDS_PY_WHEEL_NAME="libcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-f echo "libcudf-${RAPIDS_PY_CUDA_SUFFIX} @ file://$(echo /tmp/libcudf_dist/libcudf_*.whl)" > /tmp/constraints.txt export PIP_CONSTRAINT="/tmp/constraints.txt" -./ci/build_wheel.sh ${package_dir} +./ci/build_wheel.sh pylibcudf ${package_dir} python -m auditwheel repair \ --exclude libcudf.so \ --exclude libnvcomp.so \ + --exclude libkvikio.so \ -w ${package_dir}/final_dist \ ${package_dir}/dist/* -RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 ${package_dir}/final_dist +./ci/validate_wheel.sh ${package_dir} final_dist + +RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 python ${package_dir}/final_dist diff --git a/ci/cpp_linters.sh b/ci/cpp_linters.sh new file mode 100755 index 00000000000..286c7bfbc66 --- /dev/null +++ b/ci/cpp_linters.sh @@ -0,0 +1,34 @@ +#!/bin/bash +# Copyright (c) 2024, NVIDIA CORPORATION. + +set -euo pipefail + +rapids-logger "Create checks conda environment" +. /opt/conda/etc/profile.d/conda.sh + +ENV_YAML_DIR="$(mktemp -d)" + +rapids-dependency-file-generator \ + --output conda \ + --file-key clang_tidy \ + --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee "${ENV_YAML_DIR}/env.yaml" + +rapids-mamba-retry env create --yes -f "${ENV_YAML_DIR}/env.yaml" -n clang_tidy + +# Temporarily allow unbound variables for conda activation. +set +u +conda activate clang_tidy +set -u + +RAPIDS_VERSION_MAJOR_MINOR="$(rapids-version-major-minor)" + +source rapids-configure-sccache + +# Run the build via CMake, which will run clang-tidy when CUDF_STATIC_LINTERS is enabled. +cmake -S cpp -B cpp/build -DCMAKE_BUILD_TYPE=Release -DCUDF_STATIC_LINTERS=ON -GNinja +cmake --build cpp/build 2>&1 | python cpp/scripts/parse_iwyu_output.py + +# Remove invalid components of the path for local usage. The path below is +# valid in the CI due to where the project is cloned, but presumably the fixes +# will be applied locally from inside a clone of cudf. 
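+# e.g. a reported path like '/__w/cudf/cudf/cpp/src/foo.cpp' becomes 'cpp/src/foo.cpp' ('foo.cpp' is only an illustrative name)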
+sed -i 's/\/__w\/cudf\/cudf\///' iwyu_results.txt diff --git a/ci/cudf_pandas_scripts/run_tests.sh b/ci/cudf_pandas_scripts/run_tests.sh index f6bdc6f9484..61361fffb07 100755 --- a/ci/cudf_pandas_scripts/run_tests.sh +++ b/ci/cudf_pandas_scripts/run_tests.sh @@ -54,15 +54,8 @@ else RAPIDS_PY_WHEEL_NAME="libcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 cpp ./dist RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 python ./dist - echo "" > ./constraints.txt - if [[ $RAPIDS_DEPENDENCIES == "oldest" ]]; then - # `test_python_cudf_pandas` constraints are for `[test]` not `[cudf-pandas-tests]` - rapids-dependency-file-generator \ - --output requirements \ - --file-key test_python_cudf_pandas \ - --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION};dependencies=${RAPIDS_DEPENDENCIES}" \ - | tee ./constraints.txt - fi + # generate constraints (possibly pinning to oldest supported versions of dependencies) + rapids-generate-pip-constraints test_python_cudf_pandas ./constraints.txt python -m pip install \ -v \ diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index f73e88bc0c8..95f36653c2c 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -82,6 +82,7 @@ for FILE in .github/workflows/*.yaml .github/workflows/*.yml; do sed_runner "s/dask-cuda.git@branch-[^\"\s]\+/dask-cuda.git@branch-${NEXT_SHORT_TAG}/g" "${FILE}" done sed_runner "s/branch-[0-9]\+\.[0-9]\+/branch-${NEXT_SHORT_TAG}/g" ci/test_wheel_cudf_polars.sh +sed_runner "s/branch-[0-9]\+\.[0-9]\+/branch-${NEXT_SHORT_TAG}/g" ci/test_cudf_polars_polars_tests.sh # Java files NEXT_FULL_JAVA_TAG="${NEXT_SHORT_TAG}.${PATCH_PEP440}-SNAPSHOT" @@ -92,6 +93,7 @@ sed_runner "s/cudf-.*-SNAPSHOT/cudf-${NEXT_FULL_JAVA_TAG}/g" java/ci/README.md # .devcontainer files find .devcontainer/ -type f -name devcontainer.json -print0 | while IFS= read -r -d '' filename; do sed_runner "s@rapidsai/devcontainers:[0-9.]*@rapidsai/devcontainers:${NEXT_SHORT_TAG}@g" "${filename}" + sed_runner "s@rapidsai/devcontainers/features/cuda:[0-9.]*@rapidsai/devcontainers/features/cuda:${NEXT_SHORT_TAG_PEP440}@" "${filename}" sed_runner "s@rapidsai/devcontainers/features/rapids-build-utils:[0-9.]*@rapidsai/devcontainers/features/rapids-build-utils:${NEXT_SHORT_TAG_PEP440}@" "${filename}" sed_runner "s@rapids-\${localWorkspaceFolderBasename}-[0-9.]*@rapids-\${localWorkspaceFolderBasename}-${NEXT_SHORT_TAG}@g" "${filename}" done diff --git a/ci/run_cudf_examples.sh b/ci/run_cudf_examples.sh index 0819eacf636..2439af5b644 100755 --- a/ci/run_cudf_examples.sh +++ b/ci/run_cudf_examples.sh @@ -23,7 +23,10 @@ compute-sanitizer --tool memcheck custom_optimized names.csv compute-sanitizer --tool memcheck custom_prealloc names.csv compute-sanitizer --tool memcheck custom_with_malloc names.csv -compute-sanitizer --tool memcheck parquet_io +compute-sanitizer --tool memcheck parquet_io example.parquet compute-sanitizer --tool memcheck parquet_io example.parquet output.parquet DELTA_BINARY_PACKED ZSTD TRUE +compute-sanitizer --tool memcheck parquet_io_multithreaded example.parquet +compute-sanitizer --tool memcheck parquet_io_multithreaded example.parquet 4 DEVICE_BUFFER 2 2 + exit ${EXITCODE} diff --git a/ci/run_cudf_polars_polars_tests.sh b/ci/run_cudf_polars_polars_tests.sh index 95f78f17f2f..c851f65d4f6 100755 --- a/ci/run_cudf_polars_polars_tests.sh +++ b/ci/run_cudf_polars_polars_tests.sh @@ -12,8 +12,32 @@ DESELECTED_TESTS=(
"tests/unit/streaming/test_streaming_sort.py::test_streaming_sort[True]" # relies on polars built in debug mode "tests/unit/test_cpu_check.py::test_check_cpu_flags_skipped_no_flags" # Mock library error "tests/docs/test_user_guide.py" # No dot binary in CI image + "tests/unit/test_polars_import.py::test_fork_safety" # test started to hang in polars-1.14 + "tests/unit/operations/test_join.py::test_join_4_columns_with_validity" # fails in some systems, see https://github.com/pola-rs/polars/issues/19870 + "tests/unit/io/test_csv.py::test_read_web_file" # fails in rockylinux8 due to SSL CA issues ) +if [[ $(arch) == "aarch64" ]]; then + # The binary used for TPC-H generation is compiled for x86_64, not aarch64. + DESELECTED_TESTS+=("tests/benchmark/test_pdsh.py::test_pdsh") + # The connectorx package is not available on arm + DESELECTED_TESTS+=("tests/unit/io/database/test_read.py::test_read_database") + # The necessary timezone information cannot be found in our CI image. + DESELECTED_TESTS+=("tests/unit/io/test_parquet.py::test_parametric_small_page_mask_filtering") + DESELECTED_TESTS+=("tests/unit/testing/test_assert_series_equal.py::test_assert_series_equal_parametric") + DESELECTED_TESTS+=("tests/unit/operations/test_join.py::test_join_4_columns_with_validity") +else + # Ensure that we don't run dbgen when it uses newer symbols than supported by the glibc version in the CI image. + # Allow errors since any of these commands could produce empty results that would cause the script to fail. + set +e + glibc_minor_version=$(ldd --version | head -1 | grep -o "[0-9]\.[0-9]\+" | tail -1 | cut -d '.' -f2) + latest_glibc_symbol_found=$(nm py-polars/tests/benchmark/data/pdsh/dbgen/dbgen | grep GLIBC | grep -o "[0-9]\.[0-9]\+" | sort --version-sort | tail -1 | cut -d "." -f 2) + set -e + if [[ ${glibc_minor_version} -lt ${latest_glibc_symbol_found} ]]; then + DESELECTED_TESTS+=("tests/benchmark/test_pdsh.py::test_pdsh") + fi +fi + DESELECTED_TESTS=$(printf -- " --deselect %s" "${DESELECTED_TESTS[@]}") python -m pytest \ --import-mode=importlib \ diff --git a/ci/test_cpp_common.sh b/ci/test_cpp_common.sh index f5a8de543f6..8cd78eb11c2 100755 --- a/ci/test_cpp_common.sh +++ b/ci/test_cpp_common.sh @@ -5,6 +5,8 @@ set -euo pipefail . /opt/conda/etc/profile.d/conda.sh +RAPIDS_VERSION="$(rapids-version)" + rapids-logger "Generate C++ testing dependencies" ENV_YAML_DIR="$(mktemp -d)" @@ -31,7 +33,10 @@ rapids-print-env rapids-mamba-retry install \ --channel "${CPP_CHANNEL}" \ - libcudf libcudf_kafka libcudf-tests libcudf-example + "libcudf=${RAPIDS_VERSION}" \ + "libcudf_kafka=${RAPIDS_VERSION}" \ + "libcudf-tests=${RAPIDS_VERSION}" \ + "libcudf-example=${RAPIDS_VERSION}" rapids-logger "Check GPU usage" nvidia-smi diff --git a/ci/test_cudf_polars_polars_tests.sh b/ci/test_cudf_polars_polars_tests.sh index bfc8fd37565..fefe26984cb 100755 --- a/ci/test_cudf_polars_polars_tests.sh +++ b/ci/test_cudf_polars_polars_tests.sh @@ -3,35 +3,22 @@ set -eou pipefail -# We will only fail these tests if the PR touches code in pylibcudf -# or cudf_polars itself. -# Note, the three dots mean we are doing diff between the merge-base -# of upstream and HEAD. So this is asking, "does _this branch_ touch -# files in cudf_polars/pylibcudf", rather than "are there changes -# between upstream and this branch which touch cudf_polars/pylibcudf" -# TODO: is the target branch exposed anywhere in an environment variable? 
-if [ -n "$(git diff --name-only origin/branch-24.10...HEAD -- python/cudf_polars/ python/cudf/cudf/_lib/pylibcudf/)" ]; -then - HAS_CHANGES=1 - rapids-logger "PR has changes in cudf-polars/pylibcudf, test fails treated as failure" -else - HAS_CHANGES=0 - rapids-logger "PR does not have changes in cudf-polars/pylibcudf, test fails NOT treated as failure" -fi - rapids-logger "Download wheels" RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" RAPIDS_PY_WHEEL_NAME="cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-download-wheels-from-s3 ./dist -# Download the pylibcudf built in the previous step -RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-pylibcudf-dep +# Download libcudf and pylibcudf built in the previous step +RAPIDS_PY_WHEEL_NAME="libcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 cpp ./local-libcudf-dep +RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 python ./local-pylibcudf-dep -rapids-logger "Install pylibcudf" -python -m pip install ./local-pylibcudf-dep/pylibcudf*.whl +rapids-logger "Install libcudf, pylibcudf and cudf_polars" +python -m pip install \ + -v \ + "$(echo ./dist/cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}*.whl)[test]" \ + "$(echo ./local-libcudf-dep/libcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" \ + "$(echo ./local-pylibcudf-dep/pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" -rapids-logger "Install cudf_polars" -python -m pip install $(echo ./dist/cudf_polars*.whl) TAG=$(python -c 'import polars; print(f"py-{polars.__version__}")') rapids-logger "Clone polars to ${TAG}" @@ -60,9 +47,4 @@ if [ ${EXITCODE} != 0 ]; then else rapids-logger "Running polars test suite PASSED" fi - -if [ ${HAS_CHANGES} == 1 ]; then - exit ${EXITCODE} -else - exit 0 -fi +exit ${EXITCODE} diff --git a/ci/test_java.sh b/ci/test_java.sh index 629ad11014a..7f1aa633afc 100755 --- a/ci/test_java.sh +++ b/ci/test_java.sh @@ -5,6 +5,8 @@ set -euo pipefail . /opt/conda/etc/profile.d/conda.sh +RAPIDS_VERSION="$(rapids-version)" + rapids-logger "Generate Java testing dependencies" ENV_YAML_DIR="$(mktemp -d)" @@ -30,7 +32,7 @@ CPP_CHANNEL=$(rapids-download-conda-from-s3 cpp) rapids-mamba-retry install \ --channel "${CPP_CHANNEL}" \ - libcudf + "libcudf=${RAPIDS_VERSION}" rapids-logger "Check GPU usage" nvidia-smi diff --git a/ci/test_notebooks.sh b/ci/test_notebooks.sh index da9478ce25d..4197dc5617f 100755 --- a/ci/test_notebooks.sh +++ b/ci/test_notebooks.sh @@ -5,6 +5,8 @@ set -euo pipefail . /opt/conda/etc/profile.d/conda.sh +RAPIDS_VERSION="$(rapids-version)" + rapids-logger "Generate notebook testing dependencies" ENV_YAML_DIR="$(mktemp -d)" @@ -30,7 +32,8 @@ PYTHON_CHANNEL=$(rapids-download-conda-from-s3 python) rapids-mamba-retry install \ --channel "${CPP_CHANNEL}" \ --channel "${PYTHON_CHANNEL}" \ - cudf libcudf + "cudf=${RAPIDS_VERSION}" \ + "libcudf=${RAPIDS_VERSION}" NBTEST="$(realpath "$(dirname "$0")/utils/nbtest.sh")" pushd notebooks diff --git a/ci/test_python_common.sh b/ci/test_python_common.sh index dc70661a17a..4327bfff3af 100755 --- a/ci/test_python_common.sh +++ b/ci/test_python_common.sh @@ -7,6 +7,8 @@ set -euo pipefail . 
/opt/conda/etc/profile.d/conda.sh +RAPIDS_VERSION="$(rapids-version)" + rapids-logger "Generate Python testing dependencies" ENV_YAML_DIR="$(mktemp -d)" @@ -38,4 +40,5 @@ rapids-print-env rapids-mamba-retry install \ --channel "${CPP_CHANNEL}" \ --channel "${PYTHON_CHANNEL}" \ - cudf libcudf + "cudf=${RAPIDS_VERSION}" \ + "libcudf=${RAPIDS_VERSION}" diff --git a/ci/test_python_cudf.sh b/ci/test_python_cudf.sh index 2386414b32e..9528549a562 100755 --- a/ci/test_python_cudf.sh +++ b/ci/test_python_cudf.sh @@ -9,7 +9,7 @@ source ./ci/test_python_common.sh test_python_cudf rapids-logger "Check GPU usage" nvidia-smi - +rapids-print-env EXITCODE=0 trap "EXITCODE=1" ERR set +e diff --git a/ci/test_python_other.sh b/ci/test_python_other.sh index 67c97ad29a5..db86721755d 100755 --- a/ci/test_python_other.sh +++ b/ci/test_python_other.sh @@ -7,10 +7,15 @@ cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../ # Common setup steps shared by Python test jobs source ./ci/test_python_common.sh test_python_other +RAPIDS_VERSION="$(rapids-version)" + rapids-mamba-retry install \ --channel "${CPP_CHANNEL}" \ --channel "${PYTHON_CHANNEL}" \ - dask-cudf cudf_kafka custreamz + "dask-cudf=${RAPIDS_VERSION}" \ + "cudf_kafka=${RAPIDS_VERSION}" \ + "custreamz=${RAPIDS_VERSION}" \ + "cudf-polars=${RAPIDS_VERSION}" rapids-logger "Check GPU usage" nvidia-smi @@ -33,7 +38,7 @@ rapids-logger "pytest dask_cudf (legacy)" DASK_DATAFRAME__QUERY_PLANNING=False ./ci/run_dask_cudf_pytests.sh \ --junitxml="${RAPIDS_TESTS_DIR}/junit-dask-cudf-legacy.xml" \ --numprocesses=8 \ - --dist=loadscope \ + --dist=worksteal \ . rapids-logger "pytest cudf_kafka" @@ -50,5 +55,19 @@ rapids-logger "pytest custreamz" --cov-report=xml:"${RAPIDS_COVERAGE_DIR}/custreamz-coverage.xml" \ --cov-report=term +# Note that cudf-polars uses rmm.mr.CudaAsyncMemoryResource() which allocates +# half the available memory. This doesn't play well with multiple workers, so +# we keep --numprocesses=1 for now. This should be resolved by +# https://github.com/rapidsai/cudf/issues/16723. 
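+# (Two or more workers would each try to reserve half of the free GPU memory, which cannot all succeed.)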
+rapids-logger "pytest cudf-polars" +./ci/run_cudf_polars_pytests.sh \ + --junitxml="${RAPIDS_TESTS_DIR}/junit-cudf-polars.xml" \ + --numprocesses=1 \ + --dist=worksteal \ + --cov-config=./pyproject.toml \ + --cov=cudf_polars \ + --cov-report=xml:"${RAPIDS_COVERAGE_DIR}/cudf-polars-coverage.xml" \ + --cov-report=term + rapids-logger "Test script exiting with value: $EXITCODE" exit ${EXITCODE} diff --git a/ci/test_wheel_cudf.sh b/ci/test_wheel_cudf.sh index a701bfe15e0..ce12744c9e3 100755 --- a/ci/test_wheel_cudf.sh +++ b/ci/test_wheel_cudf.sh @@ -12,15 +12,8 @@ RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels rapids-logger "Install cudf, pylibcudf, and test requirements" -# Constrain to minimum dependency versions if job is set up as "oldest" -echo "" > ./constraints.txt -if [[ $RAPIDS_DEPENDENCIES == "oldest" ]]; then - rapids-dependency-file-generator \ - --output requirements \ - --file-key py_test_cudf \ - --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION};dependencies=${RAPIDS_DEPENDENCIES}" \ - | tee ./constraints.txt -fi +# generate constraints (possibly pinning to oldest support versions of dependencies) +rapids-generate-pip-constraints py_test_cudf ./constraints.txt # echo to expand wildcard before adding `[extra]` requires for pip python -m pip install \ diff --git a/ci/test_wheel_cudf_polars.sh b/ci/test_wheel_cudf_polars.sh index 3116bd820e9..3f818867d49 100755 --- a/ci/test_wheel_cudf_polars.sh +++ b/ci/test_wheel_cudf_polars.sh @@ -3,22 +3,6 @@ set -eou pipefail -# We will only fail these tests if the PR touches code in pylibcudf -# or cudf_polars itself. -# Note, the three dots mean we are doing diff between the merge-base -# of upstream and HEAD. So this is asking, "does _this branch_ touch -# files in cudf_polars/pylibcudf", rather than "are there changes -# between upstream and this branch which touch cudf_polars/pylibcudf" -# TODO: is the target branch exposed anywhere in an environment variable? 
-if [ -n "$(git diff --name-only origin/branch-24.10...HEAD -- python/cudf_polars/ python/pylibcudf/)" ]; -then - HAS_CHANGES=1 - rapids-logger "PR has changes in cudf-polars/pylibcudf, test fails treated as failure" -else - HAS_CHANGES=0 - rapids-logger "PR does not have changes in cudf-polars/pylibcudf, test fails NOT treated as failure" -fi - rapids-logger "Download wheels" RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" @@ -29,21 +13,15 @@ RAPIDS_PY_WHEEL_NAME="libcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-f RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 python ./dist rapids-logger "Installing cudf_polars and its dependencies" -# Constraint to minimum dependency versions if job is set up as "oldest" -echo "" > ./constraints.txt -if [[ $RAPIDS_DEPENDENCIES == "oldest" ]]; then - rapids-dependency-file-generator \ - --output requirements \ - --file-key py_test_cudf_polars \ - --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION};dependencies=${RAPIDS_DEPENDENCIES}" \ - | tee ./constraints.txt -fi -# echo to expand wildcard before adding `[test]` requires for pip +# generate constraints (possibly pinning to oldest support versions of dependencies) +rapids-generate-pip-constraints py_test_cudf_polars ./constraints.txt + +# echo to expand wildcard before adding `[test,experimental]` requires for pip python -m pip install \ -v \ --constraint ./constraints.txt \ - "$(echo ./dist/cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}*.whl)[test]" \ + "$(echo ./dist/cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}*.whl)[test,experimental]" \ "$(echo ./dist/libcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" \ "$(echo ./dist/pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" @@ -71,9 +49,4 @@ if [ ${EXITCODE} != 0 ]; then else rapids-logger "Testing PASSED" fi - -if [ ${HAS_CHANGES} == 1 ]; then - exit ${EXITCODE} -else - exit 0 -fi +exit ${EXITCODE} diff --git a/ci/test_wheel_dask_cudf.sh b/ci/test_wheel_dask_cudf.sh index 361a42ccda9..e15949f4bdb 100755 --- a/ci/test_wheel_dask_cudf.sh +++ b/ci/test_wheel_dask_cudf.sh @@ -12,15 +12,9 @@ RAPIDS_PY_WHEEL_NAME="libcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-f RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 python ./dist rapids-logger "Install dask_cudf, cudf, pylibcudf, and test requirements" -# Constraint to minimum dependency versions if job is set up as "oldest" -echo "" > ./constraints.txt -if [[ $RAPIDS_DEPENDENCIES == "oldest" ]]; then - rapids-dependency-file-generator \ - --output requirements \ - --file-key py_test_dask_cudf \ - --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION};dependencies=${RAPIDS_DEPENDENCIES}" \ - | tee ./constraints.txt -fi + +# generate constraints (possibly pinning to oldest support versions of dependencies) +rapids-generate-pip-constraints py_test_dask_cudf ./constraints.txt # echo to expand wildcard before adding `[extra]` requires for pip python -m pip install \ diff --git a/ci/validate_wheel.sh b/ci/validate_wheel.sh new file mode 100755 index 00000000000..5910a5c59fe --- /dev/null +++ b/ci/validate_wheel.sh @@ -0,0 +1,21 @@ +#!/bin/bash +# Copyright (c) 2024, NVIDIA CORPORATION. 
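+# +# Checks a built wheel with 'pydistcheck' (wheel size/contents) and 'twine check --strict' (package metadata). +# Usage: ./ci/validate_wheel.sh <package_dir> <wheel_dir_relative_path>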
+ +set -euo pipefail + +package_dir=$1 +wheel_dir_relative_path=$2 + +cd "${package_dir}" + +rapids-logger "validate packages with 'pydistcheck'" + +pydistcheck \ + --inspect \ + "$(echo ${wheel_dir_relative_path}/*.whl)" + +rapids-logger "validate packages with 'twine'" + +twine check \ + --strict \ + "$(echo ${wheel_dir_relative_path}/*.whl)" diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 4a4d61e5a21..1ec002d3ec6 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -19,14 +19,14 @@ dependencies: - cramjam - cubinlinker - cuda-nvtx=11.8 -- cuda-python>=11.7.1,<12.0a0 +- cuda-python>=11.7.1,<12.0a0,<=11.8.3 - cuda-sanitizer-api=11.8.86 - cuda-version=11.8 - cudatoolkit - cupy>=12.0.0 - cxx-compiler - cython>=3.0.3 -- dask-cuda==24.10.* +- dask-cuda==24.12.*,>=0.0.0a0 - dlpack>=0.8,<1.0 - doxygen=1.9.1 - fastavro>=0.22.9 @@ -42,10 +42,11 @@ dependencies: - libcufile=1.4.0.31 - libcurand-dev=10.3.0.86 - libcurand=10.3.0.86 -- libkvikio==24.10.* +- libkvikio==24.12.*,>=0.0.0a0 - librdkafka>=2.5.0,<2.6.0a0 -- librmm==24.10.* +- librmm==24.12.*,>=0.0.0a0 - make +- mmh3 - moto>=4.0.8 - msgpack-python - myst-nb @@ -54,34 +55,36 @@ dependencies: - nbsphinx - ninja - notebook -- numba>=0.57 +- numba-cuda>=0.0.13,<0.0.18 - numpy>=1.23,<3.0a0 - numpydoc - nvcc_linux-64=11.8 -- nvcomp==4.0.1 +- nvcomp==4.1.0.6 - nvtx>=0.2.1 - openpyxl - packaging - pandas -- pandas>=2.0,<2.2.3dev0 +- pandas>=2.0,<2.2.4dev0 - pandoc -- polars>=1.8,<1.9 +- polars>=1.11,<1.15 - pre-commit - ptxcompiler -- pyarrow>=14.0.0,<18.0.0a0 +- pyarrow>=14.0.0,<19.0.0a0 - pydata-sphinx-theme!=0.14.2 +- pynvml>=11.4.1,<12.0.0a0 - pytest-benchmark - pytest-cases>=3.8.2 - pytest-cov - pytest-xdist - pytest<8 - python-confluent-kafka>=2.5.0,<2.6.0a0 +- python-xxhash - python>=3.10,<3.13 - pytorch>=2.1.0 - rapids-build-backend>=0.3.0,<0.4.0.dev0 -- rapids-dask-dependency==24.10.* +- rapids-dask-dependency==24.12.*,>=0.0.0a0 - rich -- rmm==24.10.* +- rmm==24.12.*,>=0.0.0a0 - s3fs>=2022.3.0 - scikit-build-core>=0.10.0 - scipy diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index 68d948a4c93..b6d1cf75721 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -21,13 +21,13 @@ dependencies: - cuda-nvcc - cuda-nvrtc-dev - cuda-nvtx-dev -- cuda-python>=12.0,<13.0a0 +- cuda-python>=12.0,<13.0a0,<=12.6.0 - cuda-sanitizer-api - cuda-version=12.5 - cupy>=12.0.0 - cxx-compiler - cython>=3.0.3 -- dask-cuda==24.10.* +- dask-cuda==24.12.*,>=0.0.0a0 - dlpack>=0.8,<1.0 - doxygen=1.9.1 - fastavro>=0.22.9 @@ -41,10 +41,11 @@ dependencies: - jupyter_client - libcufile-dev - libcurand-dev -- libkvikio==24.10.* +- libkvikio==24.12.*,>=0.0.0a0 - librdkafka>=2.5.0,<2.6.0a0 -- librmm==24.10.* +- librmm==24.12.*,>=0.0.0a0 - make +- mmh3 - moto>=4.0.8 - msgpack-python - myst-nb @@ -53,33 +54,35 @@ dependencies: - nbsphinx - ninja - notebook -- numba>=0.57 +- numba-cuda>=0.0.13,<0.0.18 - numpy>=1.23,<3.0a0 - numpydoc -- nvcomp==4.0.1 +- nvcomp==4.1.0.6 - nvtx>=0.2.1 - openpyxl - packaging - pandas -- pandas>=2.0,<2.2.3dev0 +- pandas>=2.0,<2.2.4dev0 - pandoc -- polars>=1.8,<1.9 +- polars>=1.11,<1.15 - pre-commit -- pyarrow>=14.0.0,<18.0.0a0 +- pyarrow>=14.0.0,<19.0.0a0 - pydata-sphinx-theme!=0.14.2 -- pynvjitlink +- pynvjitlink>=0.0.0a0 +- pynvml>=11.4.1,<12.0.0a0 - pytest-benchmark - 
pytest-cases>=3.8.2 - pytest-cov - pytest-xdist - pytest<8 - python-confluent-kafka>=2.5.0,<2.6.0a0 +- python-xxhash - python>=3.10,<3.13 - pytorch>=2.1.0 - rapids-build-backend>=0.3.0,<0.4.0.dev0 -- rapids-dask-dependency==24.10.* +- rapids-dask-dependency==24.12.*,>=0.0.0a0 - rich -- rmm==24.10.* +- rmm==24.12.*,>=0.0.0a0 - s3fs>=2022.3.0 - scikit-build-core>=0.10.0 - scipy diff --git a/conda/recipes/cudf-polars/build.sh b/conda/recipes/cudf-polars/build.sh new file mode 100644 index 00000000000..06e2f1bcb99 --- /dev/null +++ b/conda/recipes/cudf-polars/build.sh @@ -0,0 +1,4 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +# This assumes the script is executed from the root of the repo directory +./build.sh cudf_polars diff --git a/conda/recipes/cudf-polars/meta.yaml b/conda/recipes/cudf-polars/meta.yaml new file mode 100644 index 00000000000..b6c03dc1bc2 --- /dev/null +++ b/conda/recipes/cudf-polars/meta.yaml @@ -0,0 +1,61 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +{% set version = environ['RAPIDS_PACKAGE_VERSION'].lstrip('v') %} +{% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %} +{% set py_version = environ['CONDA_PY'] %} +{% set cuda_version = '.'.join(environ['RAPIDS_CUDA_VERSION'].split('.')[:2]) %} +{% set cuda_major = cuda_version.split('.')[0] %} +{% set date_string = environ['RAPIDS_DATE_STRING'] %} + +package: + name: cudf-polars + version: {{ version }} + +source: + path: ../../.. + +build: + number: {{ GIT_DESCRIBE_NUMBER }} + string: cuda{{ cuda_major }}_py{{ py_version }}_{{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} + script_env: + - AWS_ACCESS_KEY_ID + - AWS_SECRET_ACCESS_KEY + - AWS_SESSION_TOKEN + - CMAKE_C_COMPILER_LAUNCHER + - CMAKE_CUDA_COMPILER_LAUNCHER + - CMAKE_CXX_COMPILER_LAUNCHER + - CMAKE_GENERATOR + - PARALLEL_LEVEL + - SCCACHE_BUCKET + - SCCACHE_IDLE_TIMEOUT + - SCCACHE_REGION + - SCCACHE_S3_KEY_PREFIX=cudf-polars-aarch64 # [aarch64] + - SCCACHE_S3_KEY_PREFIX=cudf-polars-linux64 # [linux64] + - SCCACHE_S3_USE_SSL + - SCCACHE_S3_NO_CREDENTIALS + +requirements: + host: + - python + - rapids-build-backend >=0.3.0,<0.4.0.dev0 + - setuptools + - cuda-version ={{ cuda_version }} + run: + - python + - pylibcudf ={{ version }} + - polars >=1.11,<1.15 + - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} + +test: + requires: + - cuda-version ={{ cuda_version }} + imports: + - cudf_polars + + +about: + home: https://rapids.ai/ + license: Apache-2.0 + license_family: APACHE + license_file: LICENSE + summary: cudf-polars library diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index e22b4a4eddc..04904e95630 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -78,9 +78,9 @@ requirements: run: - python - typing_extensions >=4.0.0 - - pandas >=2.0,<2.2.3dev0 + - pandas >=2.0,<2.2.4dev0 - cupy >=12.0.0 - - numba >=0.57 + - numba-cuda >=0.0.13,<0.0.18 - numpy >=1.23,<3.0a0 - pyarrow>=14.0.0,<18.0.0a0 - libcudf ={{ version }} @@ -91,7 +91,7 @@ requirements: - cudatoolkit - ptxcompiler >=0.7.0 - cubinlinker # CUDA enhanced compatibility. 
- - cuda-python >=11.7.1,<12.0a0 + - cuda-python >=11.7.1,<12.0a0,<=11.8.3 {% else %} - cuda-cudart - libcufile # [linux64] @@ -100,7 +100,7 @@ requirements: # TODO: Add nvjitlink here # xref: https://github.com/rapidsai/cudf/issues/12822 - cuda-nvrtc - - cuda-python >=12.0,<13.0a0 + - cuda-python >=12.0,<13.0a0,<=12.6.0 - pynvjitlink {% endif %} - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} diff --git a/conda/recipes/dask-cudf/meta.yaml b/conda/recipes/dask-cudf/meta.yaml index 1e6c0a35a09..74ecded8ead 100644 --- a/conda/recipes/dask-cudf/meta.yaml +++ b/conda/recipes/dask-cudf/meta.yaml @@ -43,6 +43,7 @@ requirements: run: - python - cudf ={{ version }} + - pynvml >=11.4.1,<12.0.0a0 - rapids-dask-dependency ={{ minor_version }} - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} diff --git a/conda/recipes/libcudf/conda_build_config.yaml b/conda/recipes/libcudf/conda_build_config.yaml index dc75eb4b252..c78ca326005 100644 --- a/conda/recipes/libcudf/conda_build_config.yaml +++ b/conda/recipes/libcudf/conda_build_config.yaml @@ -35,7 +35,7 @@ spdlog_version: - ">=1.14.1,<1.15" nvcomp_version: - - "=4.0.1" + - "=4.1.0.6" zlib_version: - ">=1.2.13" diff --git a/conda/recipes/pylibcudf/meta.yaml b/conda/recipes/pylibcudf/meta.yaml index 7c1efa0176c..ec3fcd59c62 100644 --- a/conda/recipes/pylibcudf/meta.yaml +++ b/conda/recipes/pylibcudf/meta.yaml @@ -77,15 +77,15 @@ requirements: run: - python - typing_extensions >=4.0.0 - - pandas >=2.0,<2.2.3dev0 + - pandas >=2.0,<2.2.4dev0 - numpy >=1.23,<3.0a0 - pyarrow>=14.0.0,<18.0.0a0 - {{ pin_compatible('rmm', max_pin='x.x') }} - fsspec >=0.6.0 {% if cuda_major == "11" %} - - cuda-python >=11.7.1,<12.0a0 + - cuda-python >=11.7.1,<12.0a0,<=11.8.3 {% else %} - - cuda-python >=12.0,<13.0a0 + - cuda-python >=12.0,<13.0a0,<=12.6.0 {% endif %} - nvtx >=0.2.1 - packaging diff --git a/cpp/.clang-tidy b/cpp/.clang-tidy index d766d98b45e..0e5699876fc 100644 --- a/cpp/.clang-tidy +++ b/cpp/.clang-tidy @@ -1,17 +1,47 @@ --- +# Notes on disabled checks +# ------------------------ +# modernize-use-equals-default: +# auto-fix is broken (doesn't insert =default correctly) +# modernize-concat-nested-namespaces: +# auto-fix is broken (can delete code) +# modernize-use-trailing-return-type: +# Purely stylistic, no benefit to rewriting everything +# modernize-return-braced-init-list: +# Stylistically we prefer to see the return type at the return site. +# See https://github.com/rapidsai/cudf/pull/16956#pullrequestreview-2341891672 +# for more information. +# modernize-use-bool-literals: +# Our tests use int flags for validity masks extensively and we prefer that +# clang-analyzer-cplusplus.NewDeleteLeaks: +# This check has numerous bugs, see +# https://github.com/llvm/llvm-project/issues?q=is%3Aissue+is%3Aopen+newdeleteleaks +# We encounter at least +# https://github.com/llvm/llvm-project/issues/60896 +# https://github.com/llvm/llvm-project/issues/69602 +# clang-analyzer-optin.core.EnumCastOutOfRange: +# We use enums as flags in multiple cases and this check makes ORing flags invalid +# clang-analyzer-optin.cplusplus.UninitializedObject: +# There is an error in nanoarrow that none of the clang-tidy filters (i.e. +# header-filter and exclude-header-filter) are able to properly avoid.
This +# merits further investigation +# +# We need to verify that broken checks are still broken Checks: 'modernize-*, -modernize-use-equals-default, -modernize-concat-nested-namespaces, - -modernize-use-trailing-return-type' - - # -modernize-use-equals-default # auto-fix is broken (doesn't insert =default correctly) - # -modernize-concat-nested-namespaces # auto-fix is broken (can delete code) - # -modernize-use-trailing-return-type # just a preference + -modernize-use-trailing-return-type, + -modernize-return-braced-init-list, + -modernize-use-bool-literals, + clang-analyzer-*, + -clang-analyzer-cplusplus.NewDeleteLeaks, + -clang-analyzer-optin.core.EnumCastOutOfRange, + -clang-analyzer-optin.cplusplus.UninitializedObject' WarningsAsErrors: '' -HeaderFilterRegex: '' -AnalyzeTemporaryDtors: false +HeaderFilterRegex: '.*cudf/cpp/(src|include).*' +ExcludeHeaderFilterRegex: '.*(Message_generated.h|Schema_generated.h|brotli_dict.hpp|unbz2.hpp|cxxopts.hpp).*' FormatStyle: none CheckOptions: - key: modernize-loop-convert.MaxCopySize diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 136f43ee706..506f6c185f5 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -36,6 +36,8 @@ if(CMAKE_CUDA_COMPILER_ID STREQUAL "NVIDIA" AND CMAKE_CUDA_COMPILER_VERSION VERS ) endif() +rapids_cmake_write_version_file(include/cudf/version_config.hpp) + # Needed because GoogleBenchmark changes the state of FindThreads.cmake, causing subsequent runs to # have different values for the `Threads::Threads` target. Setting this flag ensures # `Threads::Threads` is the same value in first run and subsequent runs. @@ -52,6 +54,7 @@ option(JITIFY_USE_CACHE "Use a file cache for JIT compiled kernels" ON) option(CUDF_BUILD_TESTUTIL "Whether to build the test utilities contained in libcudf" ON) mark_as_advanced(CUDF_BUILD_TESTUTIL) option(CUDF_USE_PROPRIETARY_NVCOMP "Download and use NVCOMP with proprietary extensions" ON) +option(CUDF_EXPORT_NVCOMP "Export NVCOMP as a dependency" ON) option(CUDF_LARGE_STRINGS_DISABLED "Build with large string support disabled" OFF) mark_as_advanced(CUDF_LARGE_STRINGS_DISABLED) option( @@ -76,6 +79,7 @@ option(CUDA_ENABLE_LINEINFO option(CUDA_WARNINGS_AS_ERRORS "Enable -Werror=all-warnings for all CUDA compilation" ON) # cudart can be statically linked or dynamically linked. The python ecosystem wants dynamic linking option(CUDA_STATIC_RUNTIME "Statically link the CUDA runtime" OFF) +option(CUDA_STATIC_CUFILE "Statically link cuFile" OFF) set(DEFAULT_CUDF_BUILD_STREAMS_TEST_UTIL ON) if(CUDA_STATIC_RUNTIME OR NOT BUILD_SHARED_LIBS) @@ -87,6 +91,13 @@ option( ${DEFAULT_CUDF_BUILD_STREAMS_TEST_UTIL} ) mark_as_advanced(CUDF_BUILD_STREAMS_TEST_UTIL) +option(CUDF_STATIC_LINTERS "Enable static linters during compilation" OFF) + +option( + CUDF_KVIKIO_REMOTE_IO + "Enable remote IO (e.g. AWS S3) support through KvikIO. If disabled, cudf-python will still be able to do remote IO through fsspec." + ON +) message(VERBOSE "CUDF: Build with NVTX support: ${USE_NVTX}") message(VERBOSE "CUDF: Configure CMake to build tests: ${BUILD_TESTS}") @@ -107,6 +118,9 @@ message( "CUDF: Enable the -lineinfo option for nvcc (useful for cuda-memcheck / profiler): ${CUDA_ENABLE_LINEINFO}" ) message(VERBOSE "CUDF: Statically link the CUDA runtime: ${CUDA_STATIC_RUNTIME}") +message(VERBOSE + "CUDF: Build with remote IO (e.g. 
AWS S3) support through KvikIO: ${CUDF_KVIKIO_REMOTE_IO}" +) # Set a default build type if none was specified rapids_cmake_build_type("Release") @@ -143,12 +157,91 @@ if(NOT CUDF_GENERATED_INCLUDE_DIR) set(CUDF_GENERATED_INCLUDE_DIR ${CUDF_BINARY_DIR}) endif() +# ################################################################################################## +# * linter configuration --------------------------------------------------------------------------- +if(CUDF_STATIC_LINTERS) + # For simplicity, for now we assume that all linters can be installed into an environment where + # any linter is being run. We could relax this requirement if desired. + find_program( + CLANG_TIDY_EXE + NAMES "clang-tidy" + DOC "Path to clang-tidy executable" REQUIRED + ) + + execute_process( + COMMAND ${CLANG_TIDY_EXE} --version + OUTPUT_VARIABLE CLANG_TIDY_OUTPUT + OUTPUT_STRIP_TRAILING_WHITESPACE + ) + string(REGEX MATCH "LLVM version ([0-9]+\\.[0-9]+)\\.[0-9]+" LLVM_VERSION_MATCH + "${CLANG_TIDY_OUTPUT}" + ) + # Discard the patch version and allow it to float. Empirically, results are mostly + # stable between patch versions, and some package managers skip patch versions + # entirely, so we don't want to constrain to a patch version that the user + # can't install. + set(LLVM_VERSION "${CMAKE_MATCH_1}") + set(expected_clang_tidy_version 19.1) + if(NOT expected_clang_tidy_version VERSION_EQUAL LLVM_VERSION) + message( + FATAL_ERROR + "clang-tidy version ${expected_clang_tidy_version} is required, but found ${LLVM_VERSION}" + ) + endif() + + find_program(IWYU_EXE NAMES include-what-you-use iwyu REQUIRED) +endif() + +# Turn on the clang-tidy property for a target excluding the files specified in SKIPPED_FILES. +function(enable_static_checkers target) + set(_tidy_options IWYU CLANG_TIDY) + set(_tidy_one_value) + set(_tidy_multi_value SKIPPED_FILES) + cmake_parse_arguments( + _LINT "${_tidy_options}" "${_tidy_one_value}" "${_tidy_multi_value}" ${ARGN} + ) + + if(CUDF_STATIC_LINTERS) + if(_LINT_CLANG_TIDY) + # clang will complain about unused link libraries on the compile line unless we specify + # -Qunused-arguments. + set_target_properties( + ${target} PROPERTIES CXX_CLANG_TIDY "${CLANG_TIDY_EXE};--extra-arg=-Qunused-arguments" + ) + endif() + if(_LINT_IWYU) + # A few extra warnings pop up when building with IWYU. We are not sure why, but they are + # not relevant since they don't show up in any other build, so it's better to suppress them + # until we can figure out the cause. Setting this as part of CXX_INCLUDE_WHAT_YOU_USE does + # not appear to be sufficient; we must also ensure that it is set in the underlying target's + # CXX compile flags. To do this completely cleanly we should modify the flags on the target + # rather than the global CUDF_CXX_FLAGS, but this solution is good enough for now since we + # never run the linters on real builds.
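+      # The guard below only appends each suppression flag if it is not already present, so
+      # repeated enable_static_checkers() calls do not accumulate duplicate flags in
+      # CUDF_CXX_FLAGS (which is written back to the parent scope).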
+ foreach(_flag -Wno-missing-braces -Wno-unneeded-internal-declaration) + list(FIND CUDF_CXX_FLAGS "${_flag}" _flag_index) + if(_flag_index EQUAL -1) + list(APPEND CUDF_CXX_FLAGS ${_flag}) + endif() + endforeach() + set(CUDF_CXX_FLAGS + "${CUDF_CXX_FLAGS}" + PARENT_SCOPE + ) + set_target_properties(${target} PROPERTIES CXX_INCLUDE_WHAT_YOU_USE "${IWYU_EXE}") + endif() + foreach(file IN LISTS _LINT_SKIPPED_FILES) + set_source_files_properties(${file} PROPERTIES SKIP_LINTING ON) + endforeach() + endif() +endfunction() + # ################################################################################################## # * conda environment ----------------------------------------------------------------------------- rapids_cmake_support_conda_env(conda_env MODIFY_PREFIX_PATH) # ################################################################################################## # * compiler options ------------------------------------------------------------------------------ +set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake/Modules ${CMAKE_MODULE_PATH}) rapids_find_package( CUDAToolkit REQUIRED BUILD_EXPORT_SET cudf-exports @@ -195,8 +288,6 @@ if(CUDF_BUILD_TESTUTIL) endif() # preprocess jitify-able kernels include(cmake/Modules/JitifyPreprocessKernels.cmake) -# find cuFile -include(cmake/thirdparty/get_cufile.cmake) # find KvikIO include(cmake/thirdparty/get_kvikio.cmake) # find fmt @@ -213,9 +304,6 @@ if(NOT BUILD_SHARED_LIBS) include("${rapids-cmake-dir}/export/find_package_file.cmake") list(APPEND METADATA_KINDS BUILD INSTALL) list(APPEND dependencies KvikIO ZLIB nvcomp nanoarrow) - if(TARGET cufile::cuFile_interface) - list(APPEND dependencies cuFile) - endif() foreach(METADATA_KIND IN LISTS METADATA_KINDS) foreach(dep IN LISTS dependencies) @@ -314,7 +402,19 @@ add_library( src/filling/repeat.cu src/filling/sequence.cu src/groupby/groupby.cu + src/groupby/hash/compute_aggregations.cu + src/groupby/hash/compute_aggregations_null.cu + src/groupby/hash/compute_global_memory_aggs.cu + src/groupby/hash/compute_global_memory_aggs_null.cu + src/groupby/hash/compute_groupby.cu + src/groupby/hash/compute_mapping_indices.cu + src/groupby/hash/compute_mapping_indices_null.cu + src/groupby/hash/compute_shared_memory_aggs.cu + src/groupby/hash/create_sparse_results_table.cu + src/groupby/hash/flatten_single_pass_aggs.cpp src/groupby/hash/groupby.cu + src/groupby/hash/hash_compound_agg_finalizer.cu + src/groupby/hash/sparse_to_dense_results.cu src/groupby/sort/aggregate.cpp src/groupby/sort/group_argmax.cu src/groupby/sort/group_argmin.cu @@ -364,6 +464,7 @@ add_library( src/io/avro/avro_gpu.cu src/io/avro/reader_impl.cu src/io/comp/brotli_dict.cpp + src/io/comp/comp.cpp src/io/comp/cpu_unbz2.cpp src/io/comp/debrotli.cu src/io/comp/gpuinflate.cu @@ -604,6 +705,7 @@ add_library( src/strings/replace/replace_slice.cu src/strings/reverse.cu src/strings/scan/scan_inclusive.cu + src/strings/search/contains_multiple.cu src/strings/search/findall.cu src/strings/search/find.cu src/strings/search/find_multiple.cu @@ -708,6 +810,11 @@ set_target_properties( INTERFACE_POSITION_INDEPENDENT_CODE ON ) +# Note: This must come before the target_compile_options below so that the function can modify the +# flags if necessary. 
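+# src/io/comp/cpu_unbz2.cpp and src/io/comp/brotli_dict.cpp are skipped because they contain
+# third-party-derived code; their headers are likewise excluded via ExcludeHeaderFilterRegex
+# in cpp/.clang-tidy.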
+enable_static_checkers( + cudf SKIPPED_FILES src/io/comp/cpu_unbz2.cpp src/io/comp/brotli_dict.cpp CLANG_TIDY IWYU +) target_compile_options( cudf PRIVATE "$<$:${CUDF_CXX_FLAGS}>" "$<$:${CUDF_CUDA_FLAGS}>" @@ -793,6 +900,18 @@ target_compile_definitions(cudf PRIVATE "RMM_LOGGING_LEVEL=LIBCUDF_LOGGING_LEVEL # Define spdlog level target_compile_definitions(cudf PUBLIC "SPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_${LIBCUDF_LOGGING_LEVEL}") +# Enable remote IO through KvikIO +target_compile_definitions(cudf PRIVATE $<$:CUDF_KVIKIO_REMOTE_IO>) + +# Enable cuFile support +set(_cufile_suffix) +if(CUDA_STATIC_CUFILE) + set(_cufile_suffix _static) +endif() +if(TARGET CUDA::cuFile${_cufile_suffix}) + target_compile_definitions(cudf PRIVATE CUDF_CUFILE_FOUND) +endif() + # Compile stringified JIT sources first add_dependencies(cudf jitify_preprocess_run) @@ -801,7 +920,7 @@ target_link_libraries( cudf PUBLIC CCCL::CCCL rmm::rmm $ spdlog::spdlog_header_only PRIVATE $ cuco::cuco ZLIB::ZLIB nvcomp::nvcomp - kvikio::kvikio $ nanoarrow + kvikio::kvikio $ nanoarrow ) # Add Conda library, and include paths if specified @@ -861,15 +980,7 @@ if(CUDF_BUILD_TESTUTIL) add_library(cudf::cudftest_default_stream ALIAS cudftest_default_stream) - add_library( - cudftestutil SHARED - tests/io/metadata_utilities.cpp - tests/utilities/column_utilities.cu - tests/utilities/debug_utilities.cu - tests/utilities/random_seed.cpp - tests/utilities/table_utilities.cu - tests/utilities/tdigest_utilities.cu - ) + add_library(cudftestutil INTERFACE) set_target_properties( cudftestutil @@ -878,32 +989,56 @@ if(CUDF_BUILD_TESTUTIL) # set target compile options CXX_STANDARD 17 CXX_STANDARD_REQUIRED ON - CXX_VISIBILITY_PRESET hidden CUDA_STANDARD 17 CUDA_STANDARD_REQUIRED ON - CUDA_VISIBILITY_PRESET hidden - POSITION_INDEPENDENT_CODE ON - INTERFACE_POSITION_INDEPENDENT_CODE ON ) target_compile_options( - cudftestutil PUBLIC "$:${CUDF_CXX_FLAGS}>>" - "$:${CUDF_CUDA_FLAGS}>>" + cudftestutil INTERFACE "$:${CUDF_CXX_FLAGS}>>" + "$:${CUDF_CUDA_FLAGS}>>" ) target_link_libraries( - cudftestutil - PUBLIC Threads::Threads cudf cudftest_default_stream - PRIVATE GTest::gmock GTest::gtest $ + cudftestutil INTERFACE Threads::Threads cudf cudftest_default_stream + $ ) target_include_directories( - cudftestutil PUBLIC "$" - "$" + cudftestutil INTERFACE "$" + "$" ) rapids_cuda_set_runtime(cudftestutil USE_STATIC ${CUDA_STATIC_RUNTIME}) add_library(cudf::cudftestutil ALIAS cudftestutil) + add_library(cudftestutil_impl INTERFACE) + add_library(cudf::cudftestutil_impl ALIAS cudftestutil_impl) + target_sources( + cudftestutil_impl + INTERFACE $ + $ + $ + $ + $ + $ + $ + $ + $ + $ + $ + $ + ) + target_link_libraries(cudftestutil_impl INTERFACE cudf::cudftestutil) + + install(FILES tests/io/metadata_utilities.cpp DESTINATION src/cudftestutil/io) + install( + FILES tests/utilities/column_utilities.cu + tests/utilities/debug_utilities.cu + tests/utilities/random_seed.cpp + tests/utilities/table_utilities.cu + tests/utilities/tdigest_utilities.cu + DESTINATION src/cudftestutil/utilities + ) + endif() # * build cudf_identify_stream_usage -------------------------------------------------------------- @@ -1000,11 +1135,14 @@ install( DESTINATION ${lib_dir} EXPORT cudf-exports ) +install(FILES ${CUDF_BINARY_DIR}/include/cudf/version_config.hpp + DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/cudf +) set(_components_export_string) if(TARGET cudftestutil) install( - TARGETS cudftest_default_stream cudftestutil + TARGETS cudftest_default_stream cudftestutil cudftestutil_impl 
DESTINATION ${lib_dir} EXPORT cudf-testing-exports ) @@ -1044,14 +1182,15 @@ targets: This module offers an optional testing component which defines the following IMPORTED GLOBAL targets: - cudf::cudftestutil - The main cudf testing library + cudf::cudftestutil - The main cudf testing library + cudf::cudftestutil_impl - C++ and CUDA sources to compile for definitions in cudf::cudftestutil ]=] ) rapids_export( INSTALL cudf EXPORT_SET cudf-exports ${_components_export_string} - GLOBAL_TARGETS cudf cudftestutil + GLOBAL_TARGETS cudf cudftestutil cudftestutil_impl NAMESPACE cudf:: DOCUMENTATION doc_string ) @@ -1072,7 +1211,7 @@ endif() rapids_export( BUILD cudf EXPORT_SET cudf-exports ${_components_export_string} - GLOBAL_TARGETS cudf cudftestutil + GLOBAL_TARGETS cudf cudftestutil cudftestutil_impl NAMESPACE cudf:: DOCUMENTATION doc_string FINAL_CODE_BLOCK build_code_string diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 4113e38dcf4..d3de9b39977 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -25,7 +25,7 @@ target_compile_options( target_link_libraries( cudf_datagen PUBLIC GTest::gmock GTest::gtest benchmark::benchmark nvbench::nvbench Threads::Threads cudf - cudftestutil nvtx3::nvtx3-cpp + cudf::cudftestutil nvtx3::nvtx3-cpp PRIVATE $ ) @@ -49,7 +49,7 @@ target_compile_options( target_link_libraries( ndsh_data_generator - PUBLIC cudf cudftestutil nvtx3::nvtx3-cpp + PUBLIC cudf cudf::cudftestutil nvtx3::nvtx3-cpp PRIVATE $ ) @@ -65,14 +65,14 @@ target_include_directories( # Use an OBJECT library so we only compile these helper source files only once add_library( cudf_benchmark_common OBJECT - "${CUDF_SOURCE_DIR}/tests/utilities/random_seed.cpp" - synchronization/synchronization.cpp - io/cuio_common.cpp - common/table_utilities.cpp - common/benchmark_utilities.cpp - common/nvbench_utilities.cpp + synchronization/synchronization.cpp io/cuio_common.cpp common/table_utilities.cpp + common/benchmark_utilities.cpp common/nvbench_utilities.cpp ) -target_link_libraries(cudf_benchmark_common PRIVATE cudf_datagen $) +target_link_libraries( + cudf_benchmark_common PRIVATE cudf_datagen $ GTest::gmock + GTest::gtest +) + add_custom_command( OUTPUT CUDF_BENCHMARKS COMMAND echo Running benchmarks @@ -99,7 +99,7 @@ function(ConfigureBench CMAKE_BENCH_NAME) ) target_link_libraries( ${CMAKE_BENCH_NAME} PRIVATE cudf_benchmark_common cudf_datagen benchmark::benchmark_main - $ + cudf::cudftestutil_impl $ ) add_custom_command( OUTPUT CUDF_BENCHMARKS @@ -127,8 +127,9 @@ function(ConfigureNVBench CMAKE_BENCH_NAME) INSTALL_RPATH "\$ORIGIN/../../../lib" ) target_link_libraries( - ${CMAKE_BENCH_NAME} PRIVATE cudf_benchmark_common ndsh_data_generator cudf_datagen - nvbench::nvbench $ + ${CMAKE_BENCH_NAME} + PRIVATE cudf_benchmark_common ndsh_data_generator cudf_datagen nvbench::nvbench + $ cudf::cudftestutil_impl ) install( TARGETS ${CMAKE_BENCH_NAME} @@ -245,6 +246,7 @@ ConfigureNVBench( REDUCTION_NVBENCH reduction/anyall.cpp reduction/dictionary.cpp + reduction/histogram.cpp reduction/minmax.cpp reduction/rank.cpp reduction/reduce.cpp @@ -270,8 +272,13 @@ ConfigureBench( ) ConfigureNVBench( - GROUPBY_NVBENCH groupby/group_max.cpp groupby/group_max_multithreaded.cpp - groupby/group_nunique.cpp groupby/group_rank.cpp groupby/group_struct_keys.cpp + GROUPBY_NVBENCH + groupby/group_histogram.cpp + groupby/group_max.cpp + groupby/group_max_multithreaded.cpp + groupby/group_nunique.cpp + groupby/group_rank.cpp + groupby/group_struct_keys.cpp ) # 
################################################################################################## @@ -279,6 +286,12 @@ ConfigureNVBench( ConfigureBench(HASHING_BENCH hashing/partition.cpp) ConfigureNVBench(HASHING_NVBENCH hashing/hash.cpp) +# ################################################################################################## +# * interop benchmark ------------------------------------------------------------------------------ +ConfigureNVBench(INTEROP_NVBENCH interop/interop.cpp) +target_link_libraries(INTEROP_NVBENCH PRIVATE nanoarrow) +target_include_directories(INTEROP_NVBENCH PRIVATE ${CMAKE_SOURCE_DIR}/tests/interop) + # ################################################################################################## # * merge benchmark ------------------------------------------------------------------------------- ConfigureBench(MERGE_BENCH merge/merge.cpp) @@ -330,58 +343,56 @@ ConfigureNVBench(CSV_WRITER_NVBENCH io/csv/csv_writer.cpp) # ################################################################################################## # * ast benchmark --------------------------------------------------------------------------------- -ConfigureBench(AST_BENCH ast/transform.cpp) +ConfigureNVBench(AST_NVBENCH ast/transform.cpp) # ################################################################################################## # * binaryop benchmark ---------------------------------------------------------------------------- -ConfigureBench(BINARYOP_BENCH binaryop/binaryop.cpp binaryop/compiled_binaryop.cpp) +ConfigureNVBench(BINARYOP_NVBENCH binaryop/binaryop.cpp binaryop/compiled_binaryop.cpp) # ################################################################################################## # * nvtext benchmark ------------------------------------------------------------------- -ConfigureBench(TEXT_BENCH text/ngrams.cpp text/subword.cpp) +ConfigureBench(TEXT_BENCH text/subword.cpp) ConfigureNVBench( TEXT_NVBENCH text/edit_distance.cpp text/hash_ngrams.cpp text/jaccard.cpp text/minhash.cpp - text/normalize.cpp text/replace.cpp text/tokenize.cpp text/vocab.cpp text/word_minhash.cpp + text/ngrams.cpp text/normalize.cpp text/replace.cpp text/tokenize.cpp text/vocab.cpp ) # ################################################################################################## # * strings benchmark ------------------------------------------------------------------- -ConfigureBench( - STRINGS_BENCH - string/combine.cpp - string/convert_datetime.cpp - string/convert_durations.cpp - string/convert_fixed_point.cpp - string/convert_numerics.cpp - string/copy.cu - string/factory.cu - string/filter.cpp - string/repeat_strings.cpp - string/replace.cpp - string/translate.cpp - string/url_decode.cu -) +ConfigureBench(STRINGS_BENCH string/factory.cu) ConfigureNVBench( STRINGS_NVBENCH string/case.cpp string/char_types.cpp + string/combine.cpp string/contains.cpp + string/convert_datetime.cpp + string/convert_durations.cpp + string/convert_fixed_point.cpp + string/convert_numerics.cpp + string/copy.cpp string/copy_if_else.cpp string/copy_range.cpp string/count.cpp string/extract.cpp + string/filter.cpp string/find.cpp - string/gather.cpp + string/find_multiple.cpp string/join_strings.cpp string/lengths.cpp string/like.cpp + string/make_strings_column.cu + string/repeat_strings.cpp + string/replace.cpp string/replace_re.cpp string/reverse.cpp string/slice.cpp string/split.cpp string/split_re.cpp + string/translate.cpp + string/url_decode.cu ) # 
################################################################################################## @@ -392,11 +403,6 @@ ConfigureNVBench(JSON_READER_NVBENCH io/json/nested_json.cpp io/json/json_reader ConfigureNVBench(JSON_READER_OPTION_NVBENCH io/json/json_reader_option.cpp) ConfigureNVBench(JSON_WRITER_NVBENCH io/json/json_writer.cpp) -# ################################################################################################## -# * multi buffer memset benchmark -# ---------------------------------------------------------------------- -ConfigureNVBench(BATCHED_MEMSET_BENCH io/utilities/batched_memset_bench.cpp) - # ################################################################################################## # * io benchmark --------------------------------------------------------------------- ConfigureNVBench(MULTIBYTE_SPLIT_NVBENCH io/text/multibyte_split.cpp) diff --git a/cpp/benchmarks/ast/transform.cpp b/cpp/benchmarks/ast/transform.cpp index 65a44532cf1..2533ea9611c 100644 --- a/cpp/benchmarks/ast/transform.cpp +++ b/cpp/benchmarks/ast/transform.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,15 +15,30 @@ */ #include -#include -#include +#include + +#include +#include +#include +#include #include #include +#include +#include + +#include #include +#include +#include + #include +#include +#include +#include +#include #include #include #include @@ -35,19 +50,16 @@ enum class TreeType { }; template -class AST : public cudf::benchmark {}; - -template -static void BM_ast_transform(benchmark::State& state) +static void BM_ast_transform(nvbench::state& state) { - auto const table_size{static_cast(state.range(0))}; - auto const tree_levels{static_cast(state.range(1))}; + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const tree_levels = static_cast(state.get_int64("tree_levels")); // Create table data auto const n_cols = reuse_columns ? 1 : tree_levels + 1; auto const source_table = create_sequence_table(cycle_dtypes({cudf::type_to_id()}, n_cols), - row_count{table_size}, + row_count{num_rows}, Nullable ? 
std::optional{0.5} : std::nullopt); auto table = source_table->view(); @@ -86,38 +98,86 @@ static void BM_ast_transform(benchmark::State& state) auto const& expression_tree_root = expressions.back(); - // Execute benchmark - for (auto _ : state) { - cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0 - cudf::compute_column(table, expression_tree_root); - } - // Use the number of bytes read from global memory - state.SetBytesProcessed(static_cast(state.iterations()) * state.range(0) * - (tree_levels + 1) * sizeof(key_type)); + state.add_global_memory_reads(static_cast(num_rows) * (tree_levels + 1)); + state.add_global_memory_writes(num_rows); + + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch&) { cudf::compute_column(table, expression_tree_root); }); } -static void CustomRanges(benchmark::internal::Benchmark* b) +template +static void BM_string_compare_ast_transform(nvbench::state& state) { - auto row_counts = std::vector{100'000, 1'000'000, 10'000'000, 100'000'000}; - auto operation_counts = std::vector{1, 5, 10}; - for (auto const& row_count : row_counts) { - for (auto const& operation_count : operation_counts) { - b->Args({row_count, operation_count}); - } - } + auto const string_width = static_cast(state.get_int64("string_width")); + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const tree_levels = static_cast(state.get_int64("tree_levels")); + auto const hit_rate = static_cast(state.get_int64("hit_rate")); + + CUDF_EXPECTS(tree_levels > 0, "benchmarks require 1 or more comparisons"); + + // Create table data + auto const num_cols = tree_levels * 2; + std::vector> columns; + std::for_each( + thrust::make_counting_iterator(0), thrust::make_counting_iterator(num_cols), [&](size_t) { + columns.emplace_back(create_string_column(num_rows, string_width, hit_rate)); + }); + + cudf::table table{std::move(columns)}; + cudf::table_view const table_view = table.view(); + + int64_t const chars_size = std::accumulate( + table_view.begin(), + table_view.end(), + static_cast(0), + [](int64_t size, auto& column) -> int64_t { + return size + cudf::strings_column_view{column}.chars_size(cudf::get_default_stream()); + }); + + // Create column references + auto column_refs = std::vector(); + std::transform(thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_cols), + std::back_inserter(column_refs), + [](auto const& column_id) { return cudf::ast::column_reference(column_id); }); + + // Create expression trees + std::list expressions; + + // Construct AST tree (a == b && c == d && e == f && ...) 
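+  // For example, with tree_levels == 2 the loop below produces:
+  //   expr0 = (col0 == col1)       [cmp_op]
+  //   expr1 = (col2 == col3)       [cmp_op]
+  //   root  = (expr0 && expr1)     [reduce_op]
+  // The whole tree is then evaluated by a single cudf::compute_column() call
+  // in the timed region.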
+ + expressions.emplace_back(cudf::ast::operation(cmp_op, column_refs[0], column_refs[1])); + + std::for_each(thrust::make_counting_iterator(1), + thrust::make_counting_iterator(tree_levels), + [&](size_t idx) { + auto const& lhs = expressions.back(); + auto const& rhs = expressions.emplace_back( + cudf::ast::operation(cmp_op, column_refs[idx * 2], column_refs[idx * 2 + 1])); + expressions.emplace_back(cudf::ast::operation(reduce_op, lhs, rhs)); + }); + + auto const& expression_tree_root = expressions.back(); + + // Use the number of bytes read from global memory + state.add_element_count(chars_size, "chars_size"); + state.add_global_memory_reads(chars_size); + state.add_global_memory_writes(num_rows); + + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch&) { cudf::compute_column(table, expression_tree_root); }); } #define AST_TRANSFORM_BENCHMARK_DEFINE(name, key_type, tree_type, reuse_columns, nullable) \ - BENCHMARK_TEMPLATE_DEFINE_F(AST, name, key_type, tree_type, reuse_columns, nullable) \ - (::benchmark::State & st) \ + static void name(::nvbench::state& st) \ { \ - BM_ast_transform(st); \ + ::BM_ast_transform(st); \ } \ - BENCHMARK_REGISTER_F(AST, name) \ - ->Apply(CustomRanges) \ - ->Unit(benchmark::kMillisecond) \ - ->UseManualTime(); + NVBENCH_BENCH(name) \ + .set_name(#name) \ + .add_int64_axis("tree_levels", {1, 5, 10}) \ + .add_int64_axis("num_rows", {100'000, 1'000'000, 10'000'000, 100'000'000}) AST_TRANSFORM_BENCHMARK_DEFINE( ast_int32_imbalanced_unique, int32_t, TreeType::IMBALANCED_LEFT, false, false); @@ -132,3 +192,19 @@ AST_TRANSFORM_BENCHMARK_DEFINE( ast_int32_imbalanced_reuse_nulls, int32_t, TreeType::IMBALANCED_LEFT, true, true); AST_TRANSFORM_BENCHMARK_DEFINE( ast_double_imbalanced_unique_nulls, double, TreeType::IMBALANCED_LEFT, false, true); + +#define AST_STRING_COMPARE_TRANSFORM_BENCHMARK_DEFINE(name, cmp_op, reduce_op) \ + static void name(::nvbench::state& st) \ + { \ + ::BM_string_compare_ast_transform(st); \ + } \ + NVBENCH_BENCH(name) \ + .set_name(#name) \ + .add_int64_axis("string_width", {32, 64, 128, 256}) \ + .add_int64_axis("num_rows", {32768, 262144, 2097152}) \ + .add_int64_axis("tree_levels", {1, 2, 3, 4}) \ + .add_int64_axis("hit_rate", {50, 100}) + +AST_STRING_COMPARE_TRANSFORM_BENCHMARK_DEFINE(ast_string_equal_logical_and, + cudf::ast::ast_operator::EQUAL, + cudf::ast::ast_operator::LOGICAL_AND); diff --git a/cpp/benchmarks/binaryop/binaryop.cpp b/cpp/benchmarks/binaryop/binaryop.cpp index fa98d9e601a..75c91d270a7 100644 --- a/cpp/benchmarks/binaryop/binaryop.cpp +++ b/cpp/benchmarks/binaryop/binaryop.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -15,15 +15,20 @@ */ #include -#include -#include #include +#include +#include #include #include +#include + +#include + #include -#include +#include +#include // This set of benchmarks is designed to be a comparison for the AST benchmarks @@ -33,23 +38,22 @@ enum class TreeType { }; template -class BINARYOP : public cudf::benchmark {}; - -template -static void BM_binaryop_transform(benchmark::State& state) +static void BM_binaryop_transform(nvbench::state& state) { - auto const table_size{static_cast(state.range(0))}; - auto const tree_levels{static_cast(state.range(1))}; + auto const num_rows{static_cast(state.get_int64("num_rows"))}; + auto const tree_levels{static_cast(state.get_int64("tree_levels"))}; // Create table data auto const n_cols = reuse_columns ? 1 : tree_levels + 1; auto const source_table = create_sequence_table( - cycle_dtypes({cudf::type_to_id()}, n_cols), row_count{table_size}); + cycle_dtypes({cudf::type_to_id()}, n_cols), row_count{num_rows}); cudf::table_view table{*source_table}; - // Execute benchmark - for (auto _ : state) { - cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0 + // Use the number of bytes read from global memory + state.add_global_memory_reads(static_cast(num_rows) * (tree_levels + 1)); + state.add_global_memory_writes(num_rows); + + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch&) { // Execute tree that chains additions like (((a + b) + c) + d) auto const op = cudf::binary_operator::ADD; auto const result_data_type = cudf::data_type(cudf::type_to_id()); @@ -64,16 +68,72 @@ static void BM_binaryop_transform(benchmark::State& state) result = cudf::binary_operation(result->view(), col, op, result_data_type); }); } - } + }); +} + +template +static void BM_string_compare_binaryop_transform(nvbench::state& state) +{ + auto const string_width = static_cast(state.get_int64("string_width")); + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const tree_levels = static_cast(state.get_int64("tree_levels")); + auto const hit_rate = static_cast(state.get_int64("hit_rate")); + + CUDF_EXPECTS(tree_levels > 0, "benchmarks require 1 or more comparisons"); + + // Create table data + auto const num_cols = tree_levels * 2; + std::vector> columns; + std::for_each( + thrust::make_counting_iterator(0), thrust::make_counting_iterator(num_cols), [&](size_t) { + columns.emplace_back(create_string_column(num_rows, string_width, hit_rate)); + }); + + cudf::table table{std::move(columns)}; + cudf::table_view const table_view = table.view(); + + int64_t const chars_size = std::accumulate( + table_view.begin(), table_view.end(), static_cast(0), [](int64_t size, auto& column) { + return size + cudf::strings_column_view{column}.chars_size(cudf::get_default_stream()); + }); + + // Create column references // Use the number of bytes read from global memory - state.SetBytesProcessed(static_cast(state.iterations()) * state.range(0) * - (tree_levels + 1) * sizeof(key_type)); + state.add_element_count(chars_size, "chars_size"); + state.add_global_memory_reads(chars_size); + state.add_global_memory_writes(num_rows); + + // Construct binary operations (a == b && c == d && e == f && ...) 
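+  // Unlike the AST benchmark, which evaluates the whole expression tree with
+  // one cudf::compute_column() call, the loop below issues a separate
+  // cudf::binary_operation() for each comparison and each logical reduction,
+  // materializing an intermediate BOOL8 column at every step. For
+  // tree_levels == 2 it computes:
+  //   r0 = (col0 == col1)
+  //   c1 = (col2 == col3)
+  //   r1 = (c1 && r0)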
+ auto constexpr bool_type = cudf::data_type{cudf::type_id::BOOL8}; + + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + rmm::cuda_stream_view stream{launch.get_stream().get_stream()}; + std::unique_ptr reduction = + cudf::binary_operation(table.get_column(0), table.get_column(1), cmp_op, bool_type, stream); + std::for_each( + thrust::make_counting_iterator(1), + thrust::make_counting_iterator(tree_levels), + [&](size_t idx) { + std::unique_ptr comparison = cudf::binary_operation( + table.get_column(idx * 2), table.get_column(idx * 2 + 1), cmp_op, bool_type, stream); + std::unique_ptr reduced = + cudf::binary_operation(*comparison, *reduction, reduce_op, bool_type, stream); + stream.synchronize(); + reduction = std::move(reduced); + }); + }); } #define BINARYOP_TRANSFORM_BENCHMARK_DEFINE(name, key_type, tree_type, reuse_columns) \ - BENCHMARK_TEMPLATE_DEFINE_F(BINARYOP, name, key_type, tree_type, reuse_columns) \ - (::benchmark::State & st) { BM_binaryop_transform(st); } + \ + static void name(::nvbench::state& st) \ + { \ + ::BM_binaryop_transform(st); \ + } \ + NVBENCH_BENCH(name) \ + .add_int64_axis("tree_levels", {1, 2, 5, 10}) \ + .add_int64_axis("num_rows", {100'000, 1'000'000, 10'000'000, 100'000'000}) BINARYOP_TRANSFORM_BENCHMARK_DEFINE(binaryop_int32_imbalanced_unique, int32_t, @@ -88,28 +148,19 @@ BINARYOP_TRANSFORM_BENCHMARK_DEFINE(binaryop_double_imbalanced_unique, TreeType::IMBALANCED_LEFT, false); -static void CustomRanges(benchmark::internal::Benchmark* b) -{ - auto row_counts = std::vector{100'000, 1'000'000, 10'000'000, 100'000'000}; - auto operation_counts = std::vector{1, 2, 5, 10}; - for (auto const& row_count : row_counts) { - for (auto const& operation_count : operation_counts) { - b->Args({row_count, operation_count}); - } - } -} - -BENCHMARK_REGISTER_F(BINARYOP, binaryop_int32_imbalanced_unique) - ->Apply(CustomRanges) - ->Unit(benchmark::kMillisecond) - ->UseManualTime(); - -BENCHMARK_REGISTER_F(BINARYOP, binaryop_int32_imbalanced_reuse) - ->Apply(CustomRanges) - ->Unit(benchmark::kMillisecond) - ->UseManualTime(); - -BENCHMARK_REGISTER_F(BINARYOP, binaryop_double_imbalanced_unique) - ->Apply(CustomRanges) - ->Unit(benchmark::kMillisecond) - ->UseManualTime(); +#define STRING_COMPARE_BINARYOP_TRANSFORM_BENCHMARK_DEFINE(name, cmp_op, reduce_op) \ + \ + static void name(::nvbench::state& st) \ + { \ + ::BM_string_compare_binaryop_transform(st); \ + } \ + NVBENCH_BENCH(name) \ + .set_name(#name) \ + .add_int64_axis("string_width", {32, 64, 128, 256}) \ + .add_int64_axis("num_rows", {32768, 262144, 2097152}) \ + .add_int64_axis("tree_levels", {1, 2, 3, 4}) \ + .add_int64_axis("hit_rate", {50, 100}) + +STRING_COMPARE_BINARYOP_TRANSFORM_BENCHMARK_DEFINE(string_compare_binaryop_transform, + cudf::binary_operator::EQUAL, + cudf::binary_operator::LOGICAL_AND); diff --git a/cpp/benchmarks/binaryop/compiled_binaryop.cpp b/cpp/benchmarks/binaryop/compiled_binaryop.cpp index 7086a61c7c5..426f44a4fa1 100644 --- a/cpp/benchmarks/binaryop/compiled_binaryop.cpp +++ b/cpp/benchmarks/binaryop/compiled_binaryop.cpp @@ -15,20 +15,18 @@ */ #include -#include -#include #include -class COMPILED_BINARYOP : public cudf::benchmark {}; +#include template -void BM_compiled_binaryop(benchmark::State& state, cudf::binary_operator binop) +void BM_compiled_binaryop(nvbench::state& state, cudf::binary_operator binop) { - auto const column_size{static_cast(state.range(0))}; + auto const num_rows = static_cast(state.get_int64("num_rows")); auto const source_table = 
create_random_table( - {cudf::type_to_id(), cudf::type_to_id()}, row_count{column_size}); + {cudf::type_to_id(), cudf::type_to_id()}, row_count{num_rows}); auto lhs = cudf::column_view(source_table->get_column(0)); auto rhs = cudf::column_view(source_table->get_column(1)); @@ -38,31 +36,26 @@ void BM_compiled_binaryop(benchmark::State& state, cudf::binary_operator binop) // Call once for hot cache. cudf::binary_operation(lhs, rhs, binop, output_dtype); - for (auto _ : state) { - cuda_event_timer timer(state, true); - cudf::binary_operation(lhs, rhs, binop, output_dtype); - } - // use number of bytes read and written to global memory - state.SetBytesProcessed(static_cast(state.iterations()) * column_size * - (sizeof(TypeLhs) + sizeof(TypeRhs) + sizeof(TypeOut))); + state.add_global_memory_reads(num_rows); + state.add_global_memory_reads(num_rows); + state.add_global_memory_writes(num_rows); + + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch&) { cudf::binary_operation(lhs, rhs, binop, output_dtype); }); } +#define BM_STRINGIFY(a) #a + // TODO tparam boolean for null. -#define BM_BINARYOP_BENCHMARK_DEFINE(name, lhs, rhs, bop, tout) \ - BENCHMARK_DEFINE_F(COMPILED_BINARYOP, name) \ - (::benchmark::State & st) \ - { \ - BM_compiled_binaryop(st, cudf::binary_operator::bop); \ - } \ - BENCHMARK_REGISTER_F(COMPILED_BINARYOP, name) \ - ->Unit(benchmark::kMicrosecond) \ - ->UseManualTime() \ - ->Arg(10000) /* 10k */ \ - ->Arg(100000) /* 100k */ \ - ->Arg(1000000) /* 1M */ \ - ->Arg(10000000) /* 10M */ \ - ->Arg(100000000); /* 100M */ +#define BM_BINARYOP_BENCHMARK_DEFINE(name, lhs, rhs, bop, tout) \ + static void name(::nvbench::state& st) \ + { \ + ::BM_compiled_binaryop(st, ::cudf::binary_operator::bop); \ + } \ + NVBENCH_BENCH(name) \ + .set_name("compiled_binary_op_" BM_STRINGIFY(name)) \ + .add_int64_axis("num_rows", {10'000, 100'000, 1'000'000, 10'000'000, 100'000'000}) #define build_name(a, b, c, d) a##_##b##_##c##_##d diff --git a/cpp/benchmarks/common/generate_input.cu b/cpp/benchmarks/common/generate_input.cu index dc258e32dc5..8bce718c7d8 100644 --- a/cpp/benchmarks/common/generate_input.cu +++ b/cpp/benchmarks/common/generate_input.cu @@ -17,13 +17,19 @@ #include "generate_input.hpp" #include "random_distribution_factory.cuh" +#include + #include #include +#include #include +#include #include #include #include #include +#include +#include #include #include #include @@ -536,7 +542,7 @@ struct string_generator { // range 32-127 is ASCII; 127-136 will be multi-byte UTF-8 { } - __device__ void operator()(thrust::tuple str_begin_end) + __device__ void operator()(thrust::tuple str_begin_end) { auto begin = thrust::get<0>(str_begin_end); auto end = thrust::get<1>(str_begin_end); @@ -565,6 +571,9 @@ std::unique_ptr create_random_utf8_string_column(data_profile cons distribution_params{1. 
- profile.get_null_probability().value_or(0)}); auto lengths = len_dist(engine, num_rows + 1); auto null_mask = valid_dist(engine, num_rows + 1); + auto stream = cudf::get_default_stream(); + auto mr = cudf::get_current_device_resource_ref(); + thrust::transform_if( thrust::device, lengths.begin(), @@ -576,28 +585,26 @@ std::unique_ptr create_random_utf8_string_column(data_profile cons auto valid_lengths = thrust::make_transform_iterator( thrust::make_zip_iterator(thrust::make_tuple(lengths.begin(), null_mask.begin())), valid_or_zero{}); - rmm::device_uvector offsets(num_rows + 1, cudf::get_default_stream()); - thrust::exclusive_scan( - thrust::device, valid_lengths, valid_lengths + lengths.size(), offsets.begin()); - // offsets are ready. - auto chars_length = *thrust::device_pointer_cast(offsets.end() - 1); + + // offsets are created as INT32 or INT64 as appropriate + auto [offsets, chars_length] = cudf::strings::detail::make_offsets_child_column( + valid_lengths, valid_lengths + num_rows, stream, mr); + // use the offsetalator to normalize the offset values for use by the string_generator + auto offsets_itr = cudf::detail::offsetalator_factory::make_input_iterator(offsets->view()); rmm::device_uvector chars(chars_length, cudf::get_default_stream()); thrust::for_each_n(thrust::device, - thrust::make_zip_iterator(offsets.begin(), offsets.begin() + 1), + thrust::make_zip_iterator(offsets_itr, offsets_itr + 1), num_rows, string_generator{chars.data(), engine}); + auto [result_bitmask, null_count] = - cudf::detail::valid_if(null_mask.begin(), - null_mask.end() - 1, - thrust::identity{}, - cudf::get_default_stream(), - cudf::get_current_device_resource_ref()); + profile.get_null_probability().has_value() + ? cudf::detail::valid_if( + null_mask.begin(), null_mask.end() - 1, thrust::identity{}, stream, mr) + : std::pair{rmm::device_buffer{}, 0}; + return cudf::make_strings_column( - num_rows, - std::make_unique(std::move(offsets), rmm::device_buffer{}, 0), - chars.release(), - null_count, - profile.get_null_probability().has_value() ? std::move(result_bitmask) : rmm::device_buffer{}); + num_rows, std::move(offsets), chars.release(), null_count, std::move(result_bitmask)); } /** @@ -918,6 +925,58 @@ std::unique_ptr create_sequence_table(std::vector co return std::make_unique(std::move(columns)); } +std::unique_ptr create_string_column(cudf::size_type num_rows, + cudf::size_type row_width, + int32_t hit_rate) +{ + // build input table using the following data + auto raw_data = cudf::test::strings_column_wrapper( + { + "123 abc 4567890 DEFGHI 0987 5W43", // matches both patterns; + "012345 6789 01234 56789 0123 456", // the rest do not match + "abc 4567890 DEFGHI 0987 Wxyz 123", + "abcdefghijklmnopqrstuvwxyz 01234", + "", + "AbcéDEFGHIJKLMNOPQRSTUVWXYZ 01", + "9876543210,abcdefghijklmnopqrstU", + "9876543210,abcdefghijklmnopqrstU", + "123 édf 4567890 DéFG 0987 X5", + "1", + }) + .release(); + + if (row_width / 32 > 1) { + std::vector columns; + for (int i = 0; i < row_width / 32; ++i) { + columns.push_back(raw_data->view()); + } + raw_data = cudf::strings::concatenate(cudf::table_view(columns)); + } + auto data_view = raw_data->view(); + + // compute the number of rows out of num_rows that should match + auto const num_matches = (static_cast(num_rows) * hit_rate) / 100; + + // Create a randomized gather-map to build a column out of the strings in data.
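+  // The hit rate is achieved in two steps: the gather map is drawn uniformly
+  // from [1, size-1], i.e. only from the non-matching strings, and a scatter
+  // then overwrites every (num_rows / num_matches)-th entry with index 0, the
+  // one string that matches, so approximately hit_rate percent of the output
+  // rows match.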
+ data_profile gather_profile = + data_profile_builder().cardinality(0).null_probability(0.0).distribution( + cudf::type_id::INT32, distribution_id::UNIFORM, 1, data_view.size() - 1); + auto gather_table = + create_random_table({cudf::type_id::INT32}, row_count{num_rows}, gather_profile); + gather_table->get_column(0).set_null_mask(rmm::device_buffer{}, 0); + + // Create scatter map by placing 0-index values throughout the gather-map + auto scatter_data = cudf::sequence(num_matches, + cudf::numeric_scalar(0), + cudf::numeric_scalar(num_rows / num_matches)); + auto zero_scalar = cudf::numeric_scalar(0); + auto table = cudf::scatter({zero_scalar}, scatter_data->view(), gather_table->view()); + auto gather_map = table->view().column(0); + table = cudf::gather(cudf::table_view({data_view}), gather_map); + + return std::move(table->release().front()); +} + std::pair create_random_null_mask( cudf::size_type size, std::optional null_probability, unsigned seed) { diff --git a/cpp/benchmarks/common/generate_input.hpp b/cpp/benchmarks/common/generate_input.hpp index 68d3dc492f5..57834fd11d2 100644 --- a/cpp/benchmarks/common/generate_input.hpp +++ b/cpp/benchmarks/common/generate_input.hpp @@ -670,6 +670,18 @@ std::unique_ptr create_random_column(cudf::type_id dtype_id, data_profile const& data_params = data_profile{}, unsigned seed = 1); +/** + * @brief Deterministically generates a large string column filled with data according to the given + * parameters. + * + * @param num_rows Number of rows in the output column + * @param row_width Width of each string in the column + * @param hit_rate The hit rate percentage, ranging from 0 to 100 + */ +std::unique_ptr create_string_column(cudf::size_type num_rows, + cudf::size_type row_width, + int32_t hit_rate); + /** * @brief Generate sequence columns starting with value 0 in first row and increasing by 1 in * subsequent rows. diff --git a/cpp/benchmarks/groupby/group_histogram.cpp b/cpp/benchmarks/groupby/group_histogram.cpp new file mode 100644 index 00000000000..cd7f9f298af --- /dev/null +++ b/cpp/benchmarks/groupby/group_histogram.cpp @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +#include +#include + +#include + +#include + +template +void groupby_histogram_helper(nvbench::state& state, + cudf::size_type num_rows, + cudf::size_type cardinality, + double null_probability) +{ + auto const keys = [&] { + data_profile const profile = + data_profile_builder() + .cardinality(cardinality) + .no_validity() + .distribution(cudf::type_to_id(), distribution_id::UNIFORM, 0, num_rows); + return create_random_column(cudf::type_to_id(), row_count{num_rows}, profile); + }(); + + auto const values = [&] { + auto builder = data_profile_builder().cardinality(0).distribution( + cudf::type_to_id(), distribution_id::UNIFORM, 0, num_rows); + if (null_probability > 0) { + builder.null_probability(null_probability); + } else { + builder.no_validity(); + } + return create_random_column( + cudf::type_to_id(), row_count{num_rows}, data_profile{builder}); + }(); + + // Vector of 1 request + std::vector requests(1); + requests.back().values = values->view(); + requests.back().aggregations.push_back( + cudf::make_histogram_aggregation()); + + auto const mem_stats_logger = cudf::memory_stats_logger(); + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + auto gb_obj = cudf::groupby::groupby(cudf::table_view({keys->view()})); + auto const result = gb_obj.aggregate(requests); + }); + + auto const elapsed_time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value"); + state.add_element_count(static_cast(num_rows) / elapsed_time, "rows/s"); + state.add_buffer_size( + mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage"); +} + +template +void bench_groupby_histogram(nvbench::state& state, nvbench::type_list) +{ + auto const cardinality = static_cast(state.get_int64("cardinality")); + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const null_probability = state.get_float64("null_probability"); + + if (cardinality > num_rows) { + state.skip("cardinality > num_rows"); + return; + } + + groupby_histogram_helper(state, num_rows, cardinality, null_probability); +} + +NVBENCH_BENCH_TYPES(bench_groupby_histogram, + NVBENCH_TYPE_AXES(nvbench::type_list)) + .set_name("groupby_histogram") + .add_float64_axis("null_probability", {0, 0.1, 0.9}) + .add_int64_axis("cardinality", {100, 1'000, 10'000, 100'000, 1'000'000, 10'000'000}) + .add_int64_axis("num_rows", {100, 1'000, 10'000, 100'000, 1'000'000, 10'000'000}); diff --git a/cpp/benchmarks/interop/interop.cpp b/cpp/benchmarks/interop/interop.cpp new file mode 100644 index 00000000000..dad7e6f429e --- /dev/null +++ b/cpp/benchmarks/interop/interop.cpp @@ -0,0 +1,244 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
diff --git a/cpp/benchmarks/interop/interop.cpp b/cpp/benchmarks/interop/interop.cpp
new file mode 100644
index 00000000000..dad7e6f429e
--- /dev/null
+++ b/cpp/benchmarks/interop/interop.cpp
@@ -0,0 +1,244 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <benchmarks/common/generate_input.hpp>
+#include <benchmarks/common/table_utilities.hpp>
+
+#include <cudf/interop.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+
+#include <cudf/column/column.hpp>
+#include <cudf/table/table.hpp>
+#include <cudf/table/table_view.hpp>
+#include <cudf/types.hpp>
+
+#include <thrust/iterator/counting_iterator.h>
+#include <nvbench/nvbench.cuh>
+#include <vector>
+
+template <cudf::type_id data_type>
+void BM_to_arrow_device(nvbench::state& state, nvbench::type_list<nvbench::enum_type<data_type>>)
+{
+  auto const num_rows     = static_cast<cudf::size_type>(state.get_int64("num_rows"));
+  auto const num_columns  = static_cast<cudf::size_type>(state.get_int64("num_columns"));
+  auto const num_elements = static_cast<int64_t>(num_rows) * num_columns;
+
+  std::vector<cudf::type_id> types(num_columns, data_type);
+
+  auto const table         = create_random_table(types, row_count{num_rows});
+  int64_t const size_bytes = estimate_size(table->view());
+
+  state.add_element_count(num_elements, "num_elements");
+  state.add_global_memory_reads<nvbench::int8_t>(size_bytes);
+  state.add_global_memory_writes<nvbench::int8_t>(size_bytes);
+
+  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+    cudf::to_arrow_device(table->view(), rmm::cuda_stream_view{launch.get_stream()});
+  });
+}
+
+template <cudf::type_id data_type>
+void BM_to_arrow_host(nvbench::state& state, nvbench::type_list<nvbench::enum_type<data_type>>)
+{
+  auto const num_rows     = static_cast<cudf::size_type>(state.get_int64("num_rows"));
+  auto const num_columns  = static_cast<cudf::size_type>(state.get_int64("num_columns"));
+  auto const num_elements = static_cast<int64_t>(num_rows) * num_columns;
+
+  std::vector<cudf::type_id> types(num_columns, data_type);
+
+  auto const table         = create_random_table(types, row_count{num_rows});
+  int64_t const size_bytes = estimate_size(table->view());
+
+  state.add_element_count(num_elements, "num_elements");
+  state.add_global_memory_reads<nvbench::int8_t>(size_bytes);
+  state.add_global_memory_writes<nvbench::int8_t>(size_bytes);
+
+  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+    cudf::to_arrow_host(table->view(), rmm::cuda_stream_view{launch.get_stream()});
+  });
+}
+
+template <cudf::type_id data_type>
+void BM_from_arrow_device(nvbench::state& state, nvbench::type_list<nvbench::enum_type<data_type>>)
+{
+  auto const num_rows     = static_cast<cudf::size_type>(state.get_int64("num_rows"));
+  auto const num_columns  = static_cast<cudf::size_type>(state.get_int64("num_columns"));
+  auto const num_elements = static_cast<int64_t>(num_rows) * num_columns;
+
+  std::vector<cudf::type_id> types(num_columns, data_type);
+
+  data_profile profile;
+  profile.set_struct_depth(1);
+  profile.set_list_depth(1);
+
+  auto const table            = create_random_table(types, row_count{num_rows}, profile);
+  cudf::table_view table_view = table->view();
+  int64_t const size_bytes    = estimate_size(table_view);
+
+  std::vector<cudf::column_metadata> table_metadata;
+
+  std::transform(thrust::make_counting_iterator(0),
+                 thrust::make_counting_iterator(num_columns),
+                 std::back_inserter(table_metadata),
+                 [&](auto const column) {
+                   cudf::column_metadata column_metadata{""};
+                   column_metadata.children_meta = std::vector<cudf::column_metadata>(
+                     table->get_column(column).num_children(), cudf::column_metadata{""});
+                   return column_metadata;
+                 });
+
+  cudf::unique_schema_t       schema = cudf::to_arrow_schema(table_view, table_metadata);
+  cudf::unique_device_array_t input  = cudf::to_arrow_device(table_view);
+
+  state.add_element_count(num_elements, "num_elements");
+  state.add_global_memory_reads<nvbench::int8_t>(size_bytes);
+  state.add_global_memory_writes<nvbench::int8_t>(size_bytes);
+
+  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+    cudf::from_arrow_device_column(
+      schema.get(), input.get(), rmm::cuda_stream_view{launch.get_stream()});
+  });
+}
+
+template <cudf::type_id data_type>
+void BM_from_arrow_host(nvbench::state& state, nvbench::type_list<nvbench::enum_type<data_type>>)
+{
+  auto const num_rows     = static_cast<cudf::size_type>(state.get_int64("num_rows"));
+  auto const num_columns  = static_cast<cudf::size_type>(state.get_int64("num_columns"));
+  auto const num_elements = static_cast<int64_t>(num_rows) * num_columns;
+
+  std::vector<cudf::type_id> types(num_columns, data_type);
+
+  data_profile profile;
+  profile.set_struct_depth(1);
+  profile.set_list_depth(1);
+
+  auto const table            = create_random_table(types, row_count{num_rows}, profile);
+  cudf::table_view table_view = table->view();
+  int64_t const size_bytes    = estimate_size(table_view);
+
+  std::vector<cudf::column_metadata> table_metadata;
+
+  std::transform(thrust::make_counting_iterator(0),
+                 thrust::make_counting_iterator(num_columns),
+                 std::back_inserter(table_metadata),
+                 [&](auto const column) {
+                   cudf::column_metadata column_metadata{""};
+                   column_metadata.children_meta = std::vector<cudf::column_metadata>(
+                     table->get_column(column).num_children(), cudf::column_metadata{""});
+                   return column_metadata;
+                 });
+
+  cudf::unique_schema_t       schema = cudf::to_arrow_schema(table_view, table_metadata);
+  cudf::unique_device_array_t input  = cudf::to_arrow_host(table_view);
+
+  state.add_element_count(num_elements, "num_elements");
+  state.add_global_memory_reads<nvbench::int8_t>(size_bytes);
+  state.add_global_memory_writes<nvbench::int8_t>(size_bytes);
+
+  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+    cudf::from_arrow_host_column(
+      schema.get(), input.get(), rmm::cuda_stream_view{launch.get_stream()});
+  });
+}
+
+using data_types = nvbench::enum_type_list<cudf::type_id::INT8,
+                                           cudf::type_id::INT16,
+                                           cudf::type_id::INT32,
+                                           cudf::type_id::INT64,
+                                           cudf::type_id::UINT8,
+                                           cudf::type_id::UINT16,
+                                           cudf::type_id::UINT32,
+                                           cudf::type_id::UINT64,
+                                           cudf::type_id::FLOAT32,
+                                           cudf::type_id::FLOAT64,
+                                           cudf::type_id::BOOL8,
+                                           cudf::type_id::TIMESTAMP_SECONDS,
+                                           cudf::type_id::DURATION_SECONDS,
+                                           cudf::type_id::DECIMAL32,
+                                           cudf::type_id::DECIMAL64,
+                                           cudf::type_id::DECIMAL128,
+                                           cudf::type_id::STRING,
+                                           cudf::type_id::LIST,
+                                           cudf::type_id::STRUCT>;
+
+static char const* stringify_type(cudf::type_id value)
+{
+  switch (value) {
+    case cudf::type_id::INT8: return "INT8";
+    case cudf::type_id::INT16: return "INT16";
+    case cudf::type_id::INT32: return "INT32";
+    case cudf::type_id::INT64: return "INT64";
+    case cudf::type_id::UINT8: return "UINT8";
+    case cudf::type_id::UINT16: return "UINT16";
+    case cudf::type_id::UINT32: return "UINT32";
+    case cudf::type_id::UINT64: return "UINT64";
+    case cudf::type_id::FLOAT32: return "FLOAT32";
+    case cudf::type_id::FLOAT64: return "FLOAT64";
+    case cudf::type_id::BOOL8: return "BOOL8";
+    case cudf::type_id::TIMESTAMP_DAYS: return "TIMESTAMP_DAYS";
+    case cudf::type_id::TIMESTAMP_SECONDS: return "TIMESTAMP_SECONDS";
+    case cudf::type_id::TIMESTAMP_MILLISECONDS: return "TIMESTAMP_MILLISECONDS";
+    case cudf::type_id::TIMESTAMP_MICROSECONDS: return "TIMESTAMP_MICROSECONDS";
+    case cudf::type_id::TIMESTAMP_NANOSECONDS: return "TIMESTAMP_NANOSECONDS";
+    case cudf::type_id::DURATION_DAYS: return "DURATION_DAYS";
+    case cudf::type_id::DURATION_SECONDS: return "DURATION_SECONDS";
+    case cudf::type_id::DURATION_MILLISECONDS: return "DURATION_MILLISECONDS";
+    case cudf::type_id::DURATION_MICROSECONDS: return "DURATION_MICROSECONDS";
+    case cudf::type_id::DURATION_NANOSECONDS: return "DURATION_NANOSECONDS";
+    case cudf::type_id::DICTIONARY32: return "DICTIONARY32";
+    case cudf::type_id::STRING: return "STRING";
+    case cudf::type_id::LIST: return "LIST";
+    case cudf::type_id::DECIMAL32: return "DECIMAL32";
+    case cudf::type_id::DECIMAL64: return "DECIMAL64";
+    case cudf::type_id::DECIMAL128: return "DECIMAL128";
+    case cudf::type_id::STRUCT: return "STRUCT";
+    default: return "unknown";
+  }
+}
+
+NVBENCH_DECLARE_ENUM_TYPE_STRINGS(cudf::type_id, stringify_type, stringify_type)
+
+NVBENCH_BENCH_TYPES(BM_to_arrow_host, NVBENCH_TYPE_AXES(data_types))
+  .set_type_axes_names({"data_type"})
+  .set_name("to_arrow_host")
+  .add_int64_axis("num_rows", {10'000, 100'000, 1'000'000, 10'000'000})
+  .add_int64_axis("num_columns", {1});
+
+NVBENCH_BENCH_TYPES(BM_to_arrow_device, NVBENCH_TYPE_AXES(data_types))
+  .set_type_axes_names({"data_type"})
+  .set_name("to_arrow_device")
+  .add_int64_axis("num_rows", {10'000, 100'000, 1'000'000, 10'000'000})
+  .add_int64_axis("num_columns", {1});
+
+NVBENCH_BENCH_TYPES(BM_from_arrow_host, NVBENCH_TYPE_AXES(data_types))
+  .set_type_axes_names({"data_type"})
+  .set_name("from_arrow_host")
+  .add_int64_axis("num_rows", {10'000, 100'000, 1'000'000, 10'000'000})
+  .add_int64_axis("num_columns", {1});
+
+NVBENCH_BENCH_TYPES(BM_from_arrow_device, NVBENCH_TYPE_AXES(data_types))
+  .set_type_axes_names({"data_type"})
+  .set_name("from_arrow_device")
+  .add_int64_axis("num_rows", {10'000, 100'000, 1'000'000, 10'000'000})
+  .add_int64_axis("num_columns", {1});
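For context, a rough sketch (not part of the PR) of the host-side round trip these benchmarks exercise, assuming a table of flat columns so no per-column children metadata is needed:

#include <cudf/interop.hpp>
#include <cudf/table/table.hpp>
#include <cudf/table/table_view.hpp>

#include <vector>

std::unique_ptr<cudf::table> arrow_host_roundtrip(cudf::table_view tv)
{
  // Export the schema and data to Arrow C Data Interface structures ...
  std::vector<cudf::column_metadata> meta(tv.num_columns(), cudf::column_metadata{""});
  cudf::unique_schema_t       schema = cudf::to_arrow_schema(tv, meta);
  cudf::unique_device_array_t host   = cudf::to_arrow_host(tv);
  // ... then import them back into a new cudf table.
  return cudf::from_arrow_host(schema.get(), host.get());
}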
.set_type_axes_names({"data_type"}) + .set_name("from_arrow_host") + .add_int64_axis("num_rows", {10'000, 100'000, 1'000'000, 10'000'000}) + .add_int64_axis("num_columns", {1}); + +NVBENCH_BENCH_TYPES(BM_from_arrow_device, NVBENCH_TYPE_AXES(data_types)) + .set_type_axes_names({"data_type"}) + .set_name("from_arrow_device") + .add_int64_axis("num_rows", {10'000, 100'000, 1'000'000, 10'000'000}) + .add_int64_axis("num_columns", {1}); diff --git a/cpp/benchmarks/io/cuio_common.cpp b/cpp/benchmarks/io/cuio_common.cpp index fe24fb58728..45b46005c47 100644 --- a/cpp/benchmarks/io/cuio_common.cpp +++ b/cpp/benchmarks/io/cuio_common.cpp @@ -186,7 +186,7 @@ std::string exec_cmd(std::string_view cmd) std::fflush(nullptr); // Switch stderr and stdout to only capture stderr auto const redirected_cmd = std::string{"( "}.append(cmd).append(" 3>&2 2>&1 1>&3) 2>/dev/null"); - std::unique_ptr pipe(popen(redirected_cmd.c_str(), "r"), pclose); + std::unique_ptr pipe(popen(redirected_cmd.c_str(), "r"), pclose); CUDF_EXPECTS(pipe != nullptr, "popen() failed"); std::array buffer; diff --git a/cpp/benchmarks/io/json/json_reader_input.cpp b/cpp/benchmarks/io/json/json_reader_input.cpp index 4366790f208..678f2f1a600 100644 --- a/cpp/benchmarks/io/json/json_reader_input.cpp +++ b/cpp/benchmarks/io/json/json_reader_input.cpp @@ -24,17 +24,19 @@ #include -// Size of the data in the benchmark dataframe; chosen to be low enough to allow benchmarks to -// run on most GPUs, but large enough to allow highest throughput -constexpr size_t data_size = 512 << 20; +// Default size of the data in the benchmark dataframe; chosen to be low enough to allow benchmarks +// to run on most GPUs, but large enough to allow highest throughput +constexpr size_t default_data_size = 512 << 20; constexpr cudf::size_type num_cols = 64; void json_read_common(cuio_source_sink_pair& source_sink, cudf::size_type num_rows_to_read, - nvbench::state& state) + nvbench::state& state, + cudf::io::compression_type comptype = cudf::io::compression_type::NONE, + size_t data_size = default_data_size) { cudf::io::json_reader_options read_opts = - cudf::io::json_reader_options::builder(source_sink.make_source_info()); + cudf::io::json_reader_options::builder(source_sink.make_source_info()).compression(comptype); auto mem_stats_logger = cudf::memory_stats_logger(); state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); @@ -57,15 +59,21 @@ void json_read_common(cuio_source_sink_pair& source_sink, state.add_buffer_size(source_sink.size(), "encoded_file_size", "encoded_file_size"); } -cudf::size_type json_write_bm_data(cudf::io::sink_info sink, - std::vector const& dtypes) +cudf::size_type json_write_bm_data( + cudf::io::sink_info sink, + std::vector const& dtypes, + cudf::io::compression_type comptype = cudf::io::compression_type::NONE, + size_t data_size = default_data_size) { auto const tbl = create_random_table( cycle_dtypes(dtypes, num_cols), table_size_bytes{data_size}, data_profile_builder()); auto const view = tbl->view(); cudf::io::json_writer_options const write_opts = - cudf::io::json_writer_options::builder(sink, view).na_rep("null").rows_per_chunk(100'000); + cudf::io::json_writer_options::builder(sink, view) + .na_rep("null") + .rows_per_chunk(100'000) + .compression(comptype); cudf::io::write_json(write_opts); return view.num_rows(); } @@ -87,6 +95,26 @@ void BM_json_read_io(nvbench::state& state, nvbench::type_list +void BM_json_read_compressed_io( + nvbench::state& state, nvbench::type_list, 
+template <cudf::io::compression_type comptype, io_type IO>
+void BM_json_read_compressed_io(
+  nvbench::state& state, nvbench::type_list<nvbench::enum_type<comptype>, nvbench::enum_type<IO>>)
+{
+  size_t const data_size = state.get_int64("data_size");
+  cuio_source_sink_pair source_sink(IO);
+  auto const d_type = get_type_or_group({static_cast<int32_t>(data_type::INTEGRAL),
+                                         static_cast<int32_t>(data_type::FLOAT),
+                                         static_cast<int32_t>(data_type::DECIMAL),
+                                         static_cast<int32_t>(data_type::TIMESTAMP),
+                                         static_cast<int32_t>(data_type::DURATION),
+                                         static_cast<int32_t>(data_type::STRING),
+                                         static_cast<int32_t>(data_type::LIST),
+                                         static_cast<int32_t>(data_type::STRUCT)});
+  auto const num_rows =
+    json_write_bm_data(source_sink.make_sink_info(), d_type, comptype, data_size);
+
+  json_read_common(source_sink, num_rows, state, comptype, data_size);
+}
+
 template <data_type DataType, io_type IO>
 void BM_json_read_data_type(
   nvbench::state& state, nvbench::type_list<nvbench::enum_type<DataType>, nvbench::enum_type<IO>>)
@@ -110,8 +138,9 @@ using d_type_list = nvbench::enum_type_list<data_type::INTEGRAL,
 using io_list =
   nvbench::enum_type_list<io_type::FILEPATH, io_type::HOST_BUFFER, io_type::DEVICE_BUFFER>;
 
-using compression_list =
-  nvbench::enum_type_list<cudf::io::compression_type::SNAPPY, cudf::io::compression_type::NONE>;
+using compression_list = nvbench::enum_type_list<cudf::io::compression_type::GZIP,
+                                                 cudf::io::compression_type::SNAPPY,
+                                                 cudf::io::compression_type::NONE>;
 
 NVBENCH_BENCH_TYPES(BM_json_read_data_type,
                     NVBENCH_TYPE_AXES(d_type_list, nvbench::enum_type_list<io_type::DEVICE_BUFFER>))
@@ -123,3 +152,10 @@ NVBENCH_BENCH_TYPES(BM_json_read_io, NVBENCH_TYPE_AXES(io_list))
   .set_name("json_read_io")
   .set_type_axes_names({"io"})
   .set_min_samples(4);
+
+NVBENCH_BENCH_TYPES(BM_json_read_compressed_io,
+                    NVBENCH_TYPE_AXES(compression_list, nvbench::enum_type_list<io_type::DEVICE_BUFFER>))
+  .set_name("json_read_compressed_io")
+  .set_type_axes_names({"compression_type", "io"})
+  .add_int64_power_of_two_axis("data_size", nvbench::range(20, 29, 1))
+  .set_min_samples(4);
diff --git a/cpp/benchmarks/io/nvbench_helpers.hpp b/cpp/benchmarks/io/nvbench_helpers.hpp
index 1e3ab2b7b4f..011b2590c6f 100644
--- a/cpp/benchmarks/io/nvbench_helpers.hpp
+++ b/cpp/benchmarks/io/nvbench_helpers.hpp
@@ -28,6 +28,7 @@ enum class data_type : int32_t {
   INTEGRAL        = static_cast<int32_t>(type_group_id::INTEGRAL),
   INTEGRAL_SIGNED = static_cast<int32_t>(type_group_id::INTEGRAL_SIGNED),
   FLOAT           = static_cast<int32_t>(type_group_id::FLOATING_POINT),
+  BOOL8           = static_cast<int32_t>(cudf::type_id::BOOL8),
   DECIMAL         = static_cast<int32_t>(type_group_id::FIXED_POINT),
   TIMESTAMP       = static_cast<int32_t>(type_group_id::TIMESTAMP),
   DURATION        = static_cast<int32_t>(type_group_id::DURATION),
@@ -44,6 +45,7 @@ NVBENCH_DECLARE_ENUM_TYPE_STRINGS(
     case data_type::INTEGRAL: return "INTEGRAL";
     case data_type::INTEGRAL_SIGNED: return "INTEGRAL_SIGNED";
     case data_type::FLOAT: return "FLOAT";
+    case data_type::BOOL8: return "BOOL8";
     case data_type::DECIMAL: return "DECIMAL";
     case data_type::TIMESTAMP: return "TIMESTAMP";
     case data_type::DURATION: return "DURATION";
@@ -74,6 +76,7 @@ NVBENCH_DECLARE_ENUM_TYPE_STRINGS(
   [](auto value) {
     switch (value) {
       case cudf::io::compression_type::SNAPPY: return "SNAPPY";
+      case cudf::io::compression_type::GZIP: return "GZIP";
       case cudf::io::compression_type::NONE: return "NONE";
       default: return "Unknown";
     }
diff --git a/cpp/benchmarks/io/parquet/parquet_reader_input.cpp b/cpp/benchmarks/io/parquet/parquet_reader_input.cpp
index ce115fd7723..b14f9cbb67e 100644
--- a/cpp/benchmarks/io/parquet/parquet_reader_input.cpp
+++ b/cpp/benchmarks/io/parquet/parquet_reader_input.cpp
@@ -114,6 +114,7 @@ void BM_parquet_read_io_compression(nvbench::state& state)
 {
   auto const d_type = get_type_or_group({static_cast<int32_t>(data_type::INTEGRAL),
                                          static_cast<int32_t>(data_type::FLOAT),
+                                         static_cast<int32_t>(data_type::BOOL8),
                                          static_cast<int32_t>(data_type::DECIMAL),
                                          static_cast<int32_t>(data_type::TIMESTAMP),
@@ -298,6 +299,7 @@ void BM_parquet_read_wide_tables_mixed(nvbench::state& state)
 
 using d_type_list = nvbench::enum_type_list<data_type::INTEGRAL,
                                             data_type::FLOAT,
+                                            data_type::BOOL8,
                                             data_type::DECIMAL,
                                             data_type::TIMESTAMP,
diff --git a/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp b/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp
--- a/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp
+++ b/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp
@@ -91,9 +92,10 @@ void BM_parquet_multithreaded_read_common(nvbench::state& state,
                                           std::vector<cudf::type_id> const& d_types,
                                           std::string const& label)
 {
-  size_t const data_size  =
state.get_int64("total_data_size"); - auto const num_threads = state.get_int64("num_threads"); - auto const source_type = retrieve_io_type_enum(state.get_string("io_type")); + size_t const data_size = state.get_int64("total_data_size"); + auto const num_threads = state.get_int64("num_threads"); + auto const num_iterations = state.get_int64("num_iterations"); + auto const source_type = retrieve_io_type_enum(state.get_string("io_type")); auto streams = cudf::detail::fork_streams(cudf::get_default_stream(), num_threads); BS::thread_pool threads(num_threads); @@ -109,12 +111,15 @@ void BM_parquet_multithreaded_read_common(nvbench::state& state, nvtxRangePushA(("(read) " + label).c_str()); state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer, - [&](nvbench::launch& launch, auto& timer) { + [&, num_files = num_files](nvbench::launch& launch, auto& timer) { auto read_func = [&](int index) { auto const stream = streams[index % num_threads]; cudf::io::parquet_reader_options read_opts = cudf::io::parquet_reader_options::builder(source_info_vector[index]); - cudf::io::read_parquet(read_opts, stream, cudf::get_current_device_resource_ref()); + for (int i = 0; i < num_iterations; ++i) { + cudf::io::read_parquet( + read_opts, stream, cudf::get_current_device_resource_ref()); + } }; threads.pause(); @@ -128,7 +133,8 @@ void BM_parquet_multithreaded_read_common(nvbench::state& state, nvtxRangePop(); auto const time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value"); - state.add_element_count(static_cast(data_size) / time, "bytes_per_second"); + state.add_element_count(num_iterations * static_cast(data_size) / time, + "bytes_per_second"); state.add_buffer_size( mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage"); state.add_buffer_size(total_file_size, "encoded_file_size", "encoded_file_size"); @@ -173,6 +179,7 @@ void BM_parquet_multithreaded_read_chunked_common(nvbench::state& state, { size_t const data_size = state.get_int64("total_data_size"); auto const num_threads = state.get_int64("num_threads"); + auto const num_iterations = state.get_int64("num_iterations"); size_t const input_limit = state.get_int64("input_limit"); size_t const output_limit = state.get_int64("output_limit"); auto const source_type = retrieve_io_type_enum(state.get_string("io_type")); @@ -192,22 +199,25 @@ void BM_parquet_multithreaded_read_chunked_common(nvbench::state& state, nvtxRangePushA(("(read) " + label).c_str()); std::vector chunks; state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer, - [&](nvbench::launch& launch, auto& timer) { + [&, num_files = num_files](nvbench::launch& launch, auto& timer) { auto read_func = [&](int index) { auto const stream = streams[index % num_threads]; cudf::io::parquet_reader_options read_opts = cudf::io::parquet_reader_options::builder(source_info_vector[index]); - // divide chunk limits by number of threads so the number of chunks produced is the - // same for all cases. this seems better than the alternative, which is to keep the - // limits the same. if we do that, as the number of threads goes up, the number of - // chunks goes down - so are actually benchmarking the same thing in that case? 
- auto reader = cudf::io::chunked_parquet_reader( - output_limit / num_threads, input_limit / num_threads, read_opts, stream); - - // read all the chunks - do { - auto table = reader.read_chunk(); - } while (reader.has_next()); + for (int i = 0; i < num_iterations; ++i) { + // divide chunk limits by number of threads so the number of chunks produced is + // the same for all cases. this seems better than the alternative, which is to + // keep the limits the same. if we do that, as the number of threads goes up, the + // number of chunks goes down - so are actually benchmarking the same thing in + // that case? + auto reader = cudf::io::chunked_parquet_reader( + output_limit / num_threads, input_limit / num_threads, read_opts, stream); + + // read all the chunks + do { + auto table = reader.read_chunk(); + } while (reader.has_next()); + } }; threads.pause(); @@ -221,7 +231,8 @@ void BM_parquet_multithreaded_read_chunked_common(nvbench::state& state, nvtxRangePop(); auto const time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value"); - state.add_element_count(static_cast(data_size) / time, "bytes_per_second"); + state.add_element_count(num_iterations * static_cast(data_size) / time, + "bytes_per_second"); state.add_buffer_size( mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage"); state.add_buffer_size(total_file_size, "encoded_file_size", "encoded_file_size"); @@ -267,6 +278,7 @@ NVBENCH_BENCH(BM_parquet_multithreaded_read_mixed) .add_int64_axis("cardinality", {1000}) .add_int64_axis("total_data_size", {512 * 1024 * 1024, 1024 * 1024 * 1024}) .add_int64_axis("num_threads", {1, 2, 4, 8}) + .add_int64_axis("num_iterations", {1}) .add_int64_axis("num_cols", {4}) .add_int64_axis("run_length", {8}) .add_string_axis("io_type", {"PINNED_BUFFER"}); @@ -277,6 +289,7 @@ NVBENCH_BENCH(BM_parquet_multithreaded_read_fixed_width) .add_int64_axis("cardinality", {1000}) .add_int64_axis("total_data_size", {512 * 1024 * 1024, 1024 * 1024 * 1024}) .add_int64_axis("num_threads", {1, 2, 4, 8}) + .add_int64_axis("num_iterations", {1}) .add_int64_axis("num_cols", {4}) .add_int64_axis("run_length", {8}) .add_string_axis("io_type", {"PINNED_BUFFER"}); @@ -287,6 +300,7 @@ NVBENCH_BENCH(BM_parquet_multithreaded_read_string) .add_int64_axis("cardinality", {1000}) .add_int64_axis("total_data_size", {512 * 1024 * 1024, 1024 * 1024 * 1024}) .add_int64_axis("num_threads", {1, 2, 4, 8}) + .add_int64_axis("num_iterations", {1}) .add_int64_axis("num_cols", {4}) .add_int64_axis("run_length", {8}) .add_string_axis("io_type", {"PINNED_BUFFER"}); @@ -297,6 +311,7 @@ NVBENCH_BENCH(BM_parquet_multithreaded_read_list) .add_int64_axis("cardinality", {1000}) .add_int64_axis("total_data_size", {512 * 1024 * 1024, 1024 * 1024 * 1024}) .add_int64_axis("num_threads", {1, 2, 4, 8}) + .add_int64_axis("num_iterations", {1}) .add_int64_axis("num_cols", {4}) .add_int64_axis("run_length", {8}) .add_string_axis("io_type", {"PINNED_BUFFER"}); @@ -308,6 +323,7 @@ NVBENCH_BENCH(BM_parquet_multithreaded_read_chunked_mixed) .add_int64_axis("cardinality", {1000}) .add_int64_axis("total_data_size", {512 * 1024 * 1024, 1024 * 1024 * 1024}) .add_int64_axis("num_threads", {1, 2, 4, 8}) + .add_int64_axis("num_iterations", {1}) .add_int64_axis("num_cols", {4}) .add_int64_axis("run_length", {8}) .add_int64_axis("input_limit", {640 * 1024 * 1024}) @@ -320,6 +336,7 @@ NVBENCH_BENCH(BM_parquet_multithreaded_read_chunked_fixed_width) .add_int64_axis("cardinality", {1000}) .add_int64_axis("total_data_size", {512 * 1024 * 
1024, 1024 * 1024 * 1024}) .add_int64_axis("num_threads", {1, 2, 4, 8}) + .add_int64_axis("num_iterations", {1}) .add_int64_axis("num_cols", {4}) .add_int64_axis("run_length", {8}) .add_int64_axis("input_limit", {640 * 1024 * 1024}) @@ -332,6 +349,7 @@ NVBENCH_BENCH(BM_parquet_multithreaded_read_chunked_string) .add_int64_axis("cardinality", {1000}) .add_int64_axis("total_data_size", {512 * 1024 * 1024, 1024 * 1024 * 1024}) .add_int64_axis("num_threads", {1, 2, 4, 8}) + .add_int64_axis("num_iterations", {1}) .add_int64_axis("num_cols", {4}) .add_int64_axis("run_length", {8}) .add_int64_axis("input_limit", {640 * 1024 * 1024}) @@ -344,6 +362,7 @@ NVBENCH_BENCH(BM_parquet_multithreaded_read_chunked_list) .add_int64_axis("cardinality", {1000}) .add_int64_axis("total_data_size", {512 * 1024 * 1024, 1024 * 1024 * 1024}) .add_int64_axis("num_threads", {1, 2, 4, 8}) + .add_int64_axis("num_iterations", {1}) .add_int64_axis("num_cols", {4}) .add_int64_axis("run_length", {8}) .add_int64_axis("input_limit", {640 * 1024 * 1024}) diff --git a/cpp/benchmarks/io/parquet/parquet_reader_options.cpp b/cpp/benchmarks/io/parquet/parquet_reader_options.cpp index 62925e8d315..685e0dea0e8 100644 --- a/cpp/benchmarks/io/parquet/parquet_reader_options.cpp +++ b/cpp/benchmarks/io/parquet/parquet_reader_options.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -66,6 +66,7 @@ void BM_parquet_read_options(nvbench::state& state, auto const data_types = dtypes_for_column_selection(get_type_or_group({static_cast(data_type::INTEGRAL), static_cast(data_type::FLOAT), + static_cast(data_type::BOOL8), static_cast(data_type::DECIMAL), static_cast(data_type::TIMESTAMP), static_cast(data_type::DURATION), diff --git a/cpp/benchmarks/io/parquet/parquet_writer.cpp b/cpp/benchmarks/io/parquet/parquet_writer.cpp index 256e50f0e64..84e4b8b93c0 100644 --- a/cpp/benchmarks/io/parquet/parquet_writer.cpp +++ b/cpp/benchmarks/io/parquet/parquet_writer.cpp @@ -89,6 +89,7 @@ void BM_parq_write_io_compression( { auto const data_types = get_type_or_group({static_cast(data_type::INTEGRAL), static_cast(data_type::FLOAT), + static_cast(data_type::BOOL8), static_cast(data_type::DECIMAL), static_cast(data_type::TIMESTAMP), static_cast(data_type::DURATION), @@ -143,6 +144,7 @@ void BM_parq_write_varying_options( auto const data_types = get_type_or_group({static_cast(data_type::INTEGRAL_SIGNED), static_cast(data_type::FLOAT), + static_cast(data_type::BOOL8), static_cast(data_type::DECIMAL), static_cast(data_type::TIMESTAMP), static_cast(data_type::DURATION), @@ -181,6 +183,7 @@ void BM_parq_write_varying_options( using d_type_list = nvbench::enum_type_list -#include -#include -#include - -#include -#include - -#include - -// Size of the data in the benchmark dataframe; chosen to be low enough to allow benchmarks to -// run on most GPUs, but large enough to allow highest throughput -constexpr size_t data_size = 512 << 20; - -void parquet_read_common(cudf::size_type num_rows_to_read, - cudf::size_type num_cols_to_read, - cuio_source_sink_pair& source_sink, - nvbench::state& state) -{ - cudf::io::parquet_reader_options read_opts = - cudf::io::parquet_reader_options::builder(source_sink.make_source_info()); - - auto mem_stats_logger = cudf::memory_stats_logger(); - 
state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); - state.exec( - nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) { - try_drop_l3_cache(); - - timer.start(); - auto const result = cudf::io::read_parquet(read_opts); - timer.stop(); - - CUDF_EXPECTS(result.tbl->num_columns() == num_cols_to_read, "Unexpected number of columns"); - CUDF_EXPECTS(result.tbl->num_rows() == num_rows_to_read, "Unexpected number of rows"); - }); - - auto const time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value"); - state.add_element_count(static_cast(data_size) / time, "bytes_per_second"); - state.add_buffer_size( - mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage"); - state.add_buffer_size(source_sink.size(), "encoded_file_size", "encoded_file_size"); -} - -template -void bench_batched_memset(nvbench::state& state, nvbench::type_list>) -{ - auto const d_type = get_type_or_group(static_cast(DataType)); - auto const num_cols = static_cast(state.get_int64("num_cols")); - auto const cardinality = static_cast(state.get_int64("cardinality")); - auto const run_length = static_cast(state.get_int64("run_length")); - auto const source_type = retrieve_io_type_enum(state.get_string("io_type")); - auto const compression = cudf::io::compression_type::NONE; - cuio_source_sink_pair source_sink(source_type); - auto const tbl = - create_random_table(cycle_dtypes(d_type, num_cols), - table_size_bytes{data_size}, - data_profile_builder().cardinality(cardinality).avg_run_length(run_length)); - auto const view = tbl->view(); - - cudf::io::parquet_writer_options write_opts = - cudf::io::parquet_writer_options::builder(source_sink.make_sink_info(), view) - .compression(compression); - cudf::io::write_parquet(write_opts); - auto const num_rows = view.num_rows(); - - parquet_read_common(num_rows, num_cols, source_sink, state); -} - -using d_type_list = nvbench::enum_type_list; - -NVBENCH_BENCH_TYPES(bench_batched_memset, NVBENCH_TYPE_AXES(d_type_list)) - .set_name("batched_memset") - .set_type_axes_names({"data_type"}) - .add_int64_axis("num_cols", {1000}) - .add_string_axis("io_type", {"DEVICE_BUFFER"}) - .set_min_samples(4) - .add_int64_axis("cardinality", {0, 1000}) - .add_int64_axis("run_length", {1, 32}); diff --git a/cpp/benchmarks/ndsh/q01.cpp b/cpp/benchmarks/ndsh/q01.cpp index ef709926ae9..485e8e5497c 100644 --- a/cpp/benchmarks/ndsh/q01.cpp +++ b/cpp/benchmarks/ndsh/q01.cpp @@ -104,7 +104,7 @@ } void run_ndsh_q1(nvbench::state& state, - std::unordered_map& sources) + std::unordered_map& sources) { // Define the column projections and filter predicate for `lineitem` table std::vector const lineitem_cols = {"l_returnflag", @@ -124,8 +124,8 @@ void run_ndsh_q1(nvbench::state& state, cudf::ast::ast_operator::LESS_EQUAL, shipdate_ref, shipdate_upper_literal); // Read out the `lineitem` table from parquet file - auto lineitem = - read_parquet(sources["lineitem"].make_source_info(), lineitem_cols, std::move(lineitem_pred)); + auto lineitem = read_parquet( + sources.at("lineitem").make_source_info(), lineitem_cols, std::move(lineitem_pred)); // Calculate the discount price and charge columns and append to lineitem table auto disc_price = @@ -170,7 +170,7 @@ void ndsh_q1(nvbench::state& state) { // Generate the required parquet files in device buffers double const scale_factor = state.get_float64("scale_factor"); - std::unordered_map sources; + std::unordered_map sources; 
generate_parquet_data_sources(scale_factor, {"lineitem"}, sources); auto stream = cudf::get_default_stream(); diff --git a/cpp/benchmarks/ndsh/q05.cpp b/cpp/benchmarks/ndsh/q05.cpp index 522bc4789c2..1c2d657913e 100644 --- a/cpp/benchmarks/ndsh/q05.cpp +++ b/cpp/benchmarks/ndsh/q05.cpp @@ -89,7 +89,7 @@ } void run_ndsh_q5(nvbench::state& state, - std::unordered_map& sources) + std::unordered_map& sources) { // Define the column projection and filter predicate for the `orders` table std::vector const orders_cols = {"o_custkey", "o_orderkey", "o_orderdate"}; @@ -120,17 +120,17 @@ void run_ndsh_q5(nvbench::state& state, // Read out the tables from parquet files // while pushing down the column projections and filter predicates auto const customer = - read_parquet(sources["customer"].make_source_info(), {"c_custkey", "c_nationkey"}); + read_parquet(sources.at("customer").make_source_info(), {"c_custkey", "c_nationkey"}); auto const orders = - read_parquet(sources["orders"].make_source_info(), orders_cols, std::move(orders_pred)); - auto const lineitem = read_parquet(sources["lineitem"].make_source_info(), + read_parquet(sources.at("orders").make_source_info(), orders_cols, std::move(orders_pred)); + auto const lineitem = read_parquet(sources.at("lineitem").make_source_info(), {"l_orderkey", "l_suppkey", "l_extendedprice", "l_discount"}); auto const supplier = - read_parquet(sources["supplier"].make_source_info(), {"s_suppkey", "s_nationkey"}); + read_parquet(sources.at("supplier").make_source_info(), {"s_suppkey", "s_nationkey"}); auto const nation = - read_parquet(sources["nation"].make_source_info(), {"n_nationkey", "n_regionkey", "n_name"}); + read_parquet(sources.at("nation").make_source_info(), {"n_nationkey", "n_regionkey", "n_name"}); auto const region = - read_parquet(sources["region"].make_source_info(), region_cols, std::move(region_pred)); + read_parquet(sources.at("region").make_source_info(), region_cols, std::move(region_pred)); // Perform the joins auto const join_a = apply_inner_join(region, nation, {"r_regionkey"}, {"n_regionkey"}); @@ -165,7 +165,7 @@ void ndsh_q5(nvbench::state& state) { // Generate the required parquet files in device buffers double const scale_factor = state.get_float64("scale_factor"); - std::unordered_map sources; + std::unordered_map sources; generate_parquet_data_sources( scale_factor, {"customer", "orders", "lineitem", "supplier", "nation", "region"}, sources); diff --git a/cpp/benchmarks/ndsh/q06.cpp b/cpp/benchmarks/ndsh/q06.cpp index 04078547973..e1e56c3622e 100644 --- a/cpp/benchmarks/ndsh/q06.cpp +++ b/cpp/benchmarks/ndsh/q06.cpp @@ -64,7 +64,7 @@ } void run_ndsh_q6(nvbench::state& state, - std::unordered_map& sources) + std::unordered_map& sources) { // Read out the `lineitem` table from parquet file std::vector const lineitem_cols = { @@ -83,8 +83,8 @@ void run_ndsh_q6(nvbench::state& state, cudf::ast::operation(cudf::ast::ast_operator::LESS, shipdate_ref, shipdate_upper_literal); auto const lineitem_pred = std::make_unique( cudf::ast::ast_operator::LOGICAL_AND, shipdate_pred_a, shipdate_pred_b); - auto lineitem = - read_parquet(sources["lineitem"].make_source_info(), lineitem_cols, std::move(lineitem_pred)); + auto lineitem = read_parquet( + sources.at("lineitem").make_source_info(), lineitem_cols, std::move(lineitem_pred)); // Cast the discount and quantity columns to float32 and append to lineitem table auto discout_float = @@ -134,7 +134,7 @@ void ndsh_q6(nvbench::state& state) { // Generate the required parquet files in device buffers 
double const scale_factor = state.get_float64("scale_factor"); - std::unordered_map sources; + std::unordered_map sources; generate_parquet_data_sources(scale_factor, {"lineitem"}, sources); auto stream = cudf::get_default_stream(); diff --git a/cpp/benchmarks/ndsh/q09.cpp b/cpp/benchmarks/ndsh/q09.cpp index 59218ab8912..98c951101ed 100644 --- a/cpp/benchmarks/ndsh/q09.cpp +++ b/cpp/benchmarks/ndsh/q09.cpp @@ -112,20 +112,21 @@ } void run_ndsh_q9(nvbench::state& state, - std::unordered_map& sources) + std::unordered_map& sources) { // Read out the table from parquet files auto const lineitem = read_parquet( - sources["lineitem"].make_source_info(), + sources.at("lineitem").make_source_info(), {"l_suppkey", "l_partkey", "l_orderkey", "l_extendedprice", "l_discount", "l_quantity"}); - auto const nation = read_parquet(sources["nation"].make_source_info(), {"n_nationkey", "n_name"}); + auto const nation = + read_parquet(sources.at("nation").make_source_info(), {"n_nationkey", "n_name"}); auto const orders = - read_parquet(sources["orders"].make_source_info(), {"o_orderkey", "o_orderdate"}); - auto const part = read_parquet(sources["part"].make_source_info(), {"p_partkey", "p_name"}); - auto const partsupp = read_parquet(sources["partsupp"].make_source_info(), + read_parquet(sources.at("orders").make_source_info(), {"o_orderkey", "o_orderdate"}); + auto const part = read_parquet(sources.at("part").make_source_info(), {"p_partkey", "p_name"}); + auto const partsupp = read_parquet(sources.at("partsupp").make_source_info(), {"ps_suppkey", "ps_partkey", "ps_supplycost"}); auto const supplier = - read_parquet(sources["supplier"].make_source_info(), {"s_suppkey", "s_nationkey"}); + read_parquet(sources.at("supplier").make_source_info(), {"s_suppkey", "s_nationkey"}); // Generating the `profit` table // Filter the part table using `p_name like '%green%'` @@ -144,7 +145,8 @@ void run_ndsh_q9(nvbench::state& state, // Calculate the `nation`, `o_year`, and `amount` columns auto n_name = std::make_unique(joined_table->column("n_name")); - auto o_year = cudf::datetime::extract_year(joined_table->column("o_orderdate")); + auto o_year = cudf::datetime::extract_datetime_component( + joined_table->column("o_orderdate"), cudf::datetime::datetime_component::YEAR); auto amount = calculate_amount(joined_table->column("l_discount"), joined_table->column("l_extendedprice"), joined_table->column("ps_supplycost"), @@ -178,7 +180,7 @@ void ndsh_q9(nvbench::state& state) { // Generate the required parquet files in device buffers double const scale_factor = state.get_float64("scale_factor"); - std::unordered_map sources; + std::unordered_map sources; generate_parquet_data_sources( scale_factor, {"part", "supplier", "lineitem", "partsupp", "orders", "nation"}, sources); diff --git a/cpp/benchmarks/ndsh/q10.cpp b/cpp/benchmarks/ndsh/q10.cpp index a520480020a..72edd15083d 100644 --- a/cpp/benchmarks/ndsh/q10.cpp +++ b/cpp/benchmarks/ndsh/q10.cpp @@ -94,7 +94,7 @@ } void run_ndsh_q10(nvbench::state& state, - std::unordered_map& sources) + std::unordered_map& sources) { // Define the column projection and filter predicate for the `orders` table std::vector const orders_cols = {"o_custkey", "o_orderkey", "o_orderdate"}; @@ -122,15 +122,16 @@ void run_ndsh_q10(nvbench::state& state, // Read out the tables from parquet files // while pushing down the column projections and filter predicates auto const customer = read_parquet( - sources["customer"].make_source_info(), + sources.at("customer").make_source_info(), {"c_custkey", 
"c_name", "c_nationkey", "c_acctbal", "c_address", "c_phone", "c_comment"}); auto const orders = - read_parquet(sources["orders"].make_source_info(), orders_cols, std::move(orders_pred)); + read_parquet(sources.at("orders").make_source_info(), orders_cols, std::move(orders_pred)); auto const lineitem = - read_parquet(sources["lineitem"].make_source_info(), + read_parquet(sources.at("lineitem").make_source_info(), {"l_extendedprice", "l_discount", "l_orderkey", "l_returnflag"}, std::move(lineitem_pred)); - auto const nation = read_parquet(sources["nation"].make_source_info(), {"n_name", "n_nationkey"}); + auto const nation = + read_parquet(sources.at("nation").make_source_info(), {"n_name", "n_nationkey"}); // Perform the joins auto const join_a = apply_inner_join(customer, nation, {"c_nationkey"}, {"n_nationkey"}); @@ -163,7 +164,7 @@ void ndsh_q10(nvbench::state& state) { // Generate the required parquet files in device buffers double const scale_factor = state.get_float64("scale_factor"); - std::unordered_map sources; + std::unordered_map sources; generate_parquet_data_sources( scale_factor, {"customer", "orders", "lineitem", "nation"}, sources); diff --git a/cpp/benchmarks/ndsh/utilities.cpp b/cpp/benchmarks/ndsh/utilities.cpp index 62116ddf661..9f9849860c9 100644 --- a/cpp/benchmarks/ndsh/utilities.cpp +++ b/cpp/benchmarks/ndsh/utilities.cpp @@ -17,6 +17,8 @@ #include "utilities.hpp" #include "common/ndsh_data_generator/ndsh_data_generator.hpp" +#include "common/table_utilities.hpp" +#include "cudf/detail/utilities/integer_utils.hpp" #include #include @@ -30,8 +32,15 @@ #include #include +#include +#include +#include + +#include #include #include +#include +#include namespace { @@ -85,6 +94,15 @@ std::vector const NATION_SCHEMA = { "n_nationkey", "n_name", "n_regionkey", "n_comment"}; std::vector const REGION_SCHEMA = {"r_regionkey", "r_name", "r_comment"}; +std::unordered_map const> const SCHEMAS = { + {"orders", ORDERS_SCHEMA}, + {"lineitem", LINEITEM_SCHEMA}, + {"part", PART_SCHEMA}, + {"partsupp", PARTSUPP_SCHEMA}, + {"supplier", SUPPLIER_SCHEMA}, + {"customer", CUSTOMER_SCHEMA}, + {"nation", NATION_SCHEMA}, + {"region", REGION_SCHEMA}}; } // namespace cudf::table_view table_with_names::table() const { return tbl->view(); } @@ -337,7 +355,7 @@ int32_t days_since_epoch(int year, int month, int day) void write_to_parquet_device_buffer(std::unique_ptr const& table, std::vector const& col_names, - parquet_device_buffer& source) + cuio_source_sink_pair& source) { CUDF_FUNC_RANGE(); auto const stream = cudf::get_default_stream(); @@ -351,55 +369,124 @@ void write_to_parquet_device_buffer(std::unique_ptr const& table, metadata.schema_info = col_name_infos; auto const table_input_metadata = cudf::io::table_input_metadata{metadata}; - // Declare a host and device buffer - std::vector h_buffer; - + auto est_size = static_cast(estimate_size(table->view())); + constexpr auto PQ_MAX_TABLE_BYTES = 8ul << 30; // 8GB + // TODO: best to get this limit from percent_of_free_device_memory(50) of device memory resource. 
+ if (est_size > PQ_MAX_TABLE_BYTES) { + auto builder = cudf::io::chunked_parquet_writer_options::builder(source.make_sink_info()); + builder.metadata(table_input_metadata); + auto const options = builder.build(); + auto num_splits = static_cast( + std::ceil(static_cast(est_size) / (PQ_MAX_TABLE_BYTES))); + std::vector splits(num_splits - 1); + auto num_rows = table->num_rows(); + auto num_row_per_chunk = cudf::util::div_rounding_up_safe(num_rows, num_splits); + std::generate_n(splits.begin(), splits.size(), [num_row_per_chunk, i = 0]() mutable { + return (i += num_row_per_chunk); + }); + std::vector split_tables = cudf::split(table->view(), splits, stream); + auto writer = cudf::io::parquet_chunked_writer(options, stream); + for (auto const& chunk_table : split_tables) { + writer.write(chunk_table); + } + writer.close(); + return; + } // Write parquet data to host buffer - auto builder = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info(&h_buffer), table->view()); + auto builder = cudf::io::parquet_writer_options::builder(source.make_sink_info(), table->view()); builder.metadata(table_input_metadata); auto const options = builder.build(); - cudf::io::write_parquet(options); + cudf::io::write_parquet(options, stream); +} - // Copy host buffer to device buffer - source.d_buffer.resize(h_buffer.size(), stream); - CUDF_CUDA_TRY(cudaMemcpyAsync( - source.d_buffer.data(), h_buffer.data(), h_buffer.size(), cudaMemcpyDefault, stream.value())); +inline auto make_managed_pool() +{ + return rmm::mr::make_owning_wrapper( + std::make_shared(), rmm::percent_of_free_device_memory(50)); } void generate_parquet_data_sources(double scale_factor, std::vector const& table_names, - std::unordered_map& sources) + std::unordered_map& sources) { CUDF_FUNC_RANGE(); - std::for_each(table_names.begin(), table_names.end(), [&](auto const& table_name) { - sources[table_name] = parquet_device_buffer(); - }); - auto [orders, lineitem, part] = cudf::datagen::generate_orders_lineitem_part( - scale_factor, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + // Set the memory resource to the managed pool + auto old_mr = cudf::get_current_device_resource(); + // if already managed pool or managed, don't create new one. + using managed_pool_mr_t = decltype(make_managed_pool()); + managed_pool_mr_t managed_pool_mr; + bool const is_managed = + dynamic_cast*>(old_mr) or + dynamic_cast(old_mr); + if (!is_managed) { + std::cout << "Creating managed pool just for data generation\n"; + managed_pool_mr = make_managed_pool(); + cudf::set_current_device_resource(managed_pool_mr.get()); + // drawback: if already pool takes 50% of free memory, we are left with 50% of 50% of free + // memory. 
+ } - auto partsupp = cudf::datagen::generate_partsupp( - scale_factor, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + std::unordered_set const requested_table_names = [&table_names]() { + if (table_names.empty()) { + return std::unordered_set{ + "orders", "lineitem", "part", "partsupp", "supplier", "customer", "nation", "region"}; + } + return std::unordered_set(table_names.begin(), table_names.end()); + }(); + std::for_each( + requested_table_names.begin(), requested_table_names.end(), [&](auto const& table_name) { + sources.emplace(table_name, cuio_source_sink_pair(io_type::HOST_BUFFER)); + }); + std::unordered_map> tables; + + if (sources.count("orders") or sources.count("lineitem") or sources.count("part")) { + auto [orders, lineitem, part] = cudf::datagen::generate_orders_lineitem_part( + scale_factor, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + if (sources.count("orders")) { + write_to_parquet_device_buffer(orders, SCHEMAS.at("orders"), sources.at("orders")); + orders = {}; + } + if (sources.count("part")) { + write_to_parquet_device_buffer(part, SCHEMAS.at("part"), sources.at("part")); + part = {}; + } + if (sources.count("lineitem")) { + write_to_parquet_device_buffer(lineitem, SCHEMAS.at("lineitem"), sources.at("lineitem")); + lineitem = {}; + } + } + + if (sources.count("partsupp")) { + auto partsupp = cudf::datagen::generate_partsupp( + scale_factor, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + write_to_parquet_device_buffer(partsupp, SCHEMAS.at("partsupp"), sources.at("partsupp")); + } - auto supplier = cudf::datagen::generate_supplier( - scale_factor, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + if (sources.count("supplier")) { + auto supplier = cudf::datagen::generate_supplier( + scale_factor, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + write_to_parquet_device_buffer(supplier, SCHEMAS.at("supplier"), sources.at("supplier")); + } - auto customer = cudf::datagen::generate_customer( - scale_factor, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + if (sources.count("customer")) { + auto customer = cudf::datagen::generate_customer( + scale_factor, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + write_to_parquet_device_buffer(customer, SCHEMAS.at("customer"), sources.at("customer")); + } - auto nation = cudf::datagen::generate_nation(cudf::get_default_stream(), - cudf::get_current_device_resource_ref()); + if (sources.count("nation")) { + auto nation = cudf::datagen::generate_nation(cudf::get_default_stream(), + cudf::get_current_device_resource_ref()); + write_to_parquet_device_buffer(nation, SCHEMAS.at("nation"), sources.at("nation")); + } - auto region = cudf::datagen::generate_region(cudf::get_default_stream(), - cudf::get_current_device_resource_ref()); + if (sources.count("region")) { + auto region = cudf::datagen::generate_region(cudf::get_default_stream(), + cudf::get_current_device_resource_ref()); + write_to_parquet_device_buffer(region, SCHEMAS.at("region"), sources.at("region")); + } - write_to_parquet_device_buffer(std::move(orders), ORDERS_SCHEMA, sources["orders"]); - write_to_parquet_device_buffer(std::move(lineitem), LINEITEM_SCHEMA, sources["lineitem"]); - write_to_parquet_device_buffer(std::move(part), PART_SCHEMA, sources["part"]); - write_to_parquet_device_buffer(std::move(partsupp), PARTSUPP_SCHEMA, sources["partsupp"]); - write_to_parquet_device_buffer(std::move(customer), 
CUSTOMER_SCHEMA, sources["customer"]); - write_to_parquet_device_buffer(std::move(supplier), SUPPLIER_SCHEMA, sources["supplier"]); - write_to_parquet_device_buffer(std::move(nation), NATION_SCHEMA, sources["nation"]); - write_to_parquet_device_buffer(std::move(region), REGION_SCHEMA, sources["region"]); + // Restore the original memory resource + if (!is_managed) { cudf::set_current_device_resource(old_mr); } } diff --git a/cpp/benchmarks/ndsh/utilities.hpp b/cpp/benchmarks/ndsh/utilities.hpp index 762e43deccf..cae07f86a98 100644 --- a/cpp/benchmarks/ndsh/utilities.hpp +++ b/cpp/benchmarks/ndsh/utilities.hpp @@ -14,6 +14,8 @@ * limitations under the License. */ +#include "io/cuio_common.hpp" + #include #include #include @@ -196,24 +198,15 @@ std::tm make_tm(int year, int month, int day); int32_t days_since_epoch(int year, int month, int day); /** - * @brief Struct representing a parquet device buffer - */ -struct parquet_device_buffer { - parquet_device_buffer() : d_buffer{0, cudf::get_default_stream()} {}; - cudf::io::source_info make_source_info() { return cudf::io::source_info(d_buffer); } - rmm::device_uvector d_buffer; -}; - -/** - * @brief Write a `cudf::table` to a parquet device buffer + * @brief Write a `cudf::table` to a parquet cuio sink * * @param table The `cudf::table` to write * @param col_names The column names of the table - * @param parquet_device_buffer The parquet device buffer to write the table to + * @param source The source sink pair to write the table to */ void write_to_parquet_device_buffer(std::unique_ptr const& table, std::vector const& col_names, - parquet_device_buffer& source); + cuio_source_sink_pair& source); /** * @brief Generate NDS-H tables and write to parquet device buffers @@ -224,4 +217,4 @@ void write_to_parquet_device_buffer(std::unique_ptr const& table, */ void generate_parquet_data_sources(double scale_factor, std::vector const& table_names, - std::unordered_map& sources); + std::unordered_map& sources); diff --git a/cpp/benchmarks/reduction/histogram.cpp b/cpp/benchmarks/reduction/histogram.cpp new file mode 100644 index 00000000000..d0925de5c87 --- /dev/null +++ b/cpp/benchmarks/reduction/histogram.cpp @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#include "cudf/aggregation.hpp"
+#include "cudf/detail/aggregation/aggregation.hpp"
+
+#include <benchmarks/common/generate_input.hpp>
+#include <benchmarks/fixture/benchmark_fixture.hpp>
+#include <benchmarks/synchronization/synchronization.hpp>
+
+#include <cudf/column/column_view.hpp>
+#include <cudf/reduction.hpp>
+#include <cudf/scalar/scalar.hpp>
+#include <cudf/types.hpp>
+#include <rmm/cuda_stream_view.hpp>
+
+#include <nvbench/nvbench.cuh>
+
+template <typename type>
+static void nvbench_reduction_histogram(nvbench::state& state, nvbench::type_list<type>)
+{
+  auto const dtype = cudf::type_to_id<type>();
+
+  auto const cardinality      = static_cast<cudf::size_type>(state.get_int64("cardinality"));
+  auto const num_rows         = static_cast<cudf::size_type>(state.get_int64("num_rows"));
+  auto const null_probability = state.get_float64("null_probability");
+
+  if (cardinality > num_rows) {
+    state.skip("cardinality > num_rows");
+    return;
+  }
+
+  data_profile const profile = data_profile_builder()
+                                 .null_probability(null_probability)
+                                 .cardinality(cardinality)
+                                 .distribution(dtype, distribution_id::UNIFORM, 0, num_rows);
+
+  auto const input = create_random_column(dtype, row_count{num_rows}, profile);
+  auto agg         = cudf::make_histogram_aggregation<cudf::reduce_aggregation>();
+  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+    rmm::cuda_stream_view stream_view{launch.get_stream()};
+    auto result = cudf::reduce(*input, *agg, input->type(), stream_view);
+  });
+
+  state.add_element_count(input->size());
+}
+
+using data_type = nvbench::type_list<int32_t, int64_t>;
+
+NVBENCH_BENCH_TYPES(nvbench_reduction_histogram, NVBENCH_TYPE_AXES(data_type))
+  .set_name("histogram")
+  .add_float64_axis("null_probability", {0.1})
+  .add_int64_axis("cardinality",
+                  {0, 100, 1'000, 10'000, 100'000, 1'000'000, 10'000'000, 50'000'000})
+  .add_int64_axis("num_rows", {10'000, 100'000, 1'000'000, 10'000'000, 100'000'000});
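For context, a rough sketch of how this reduction's output can be consumed; as a sketch it assumes the documented histogram shape, where the reduce returns a list scalar whose view() is a STRUCT column with child 0 holding the distinct values and child 1 their int64 counts:

#include <cudf/aggregation.hpp>
#include <cudf/column/column_view.hpp>
#include <cudf/reduction.hpp>
#include <cudf/scalar/scalar.hpp>

cudf::size_type distinct_count_via_histogram(cudf::column_view input)
{
  auto agg    = cudf::make_histogram_aggregation<cudf::reduce_aggregation>();
  auto result = cudf::reduce(input, *agg, input.type());
  // One STRUCT row per distinct value, so the row count is the distinct count.
  return static_cast<cudf::list_scalar const&>(*result).view().size();
}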
diff --git a/cpp/benchmarks/string/combine.cpp b/cpp/benchmarks/string/combine.cpp
index 7acfb1ffb0d..d6ccfae63e8 100644
--- a/cpp/benchmarks/string/combine.cpp
+++ b/cpp/benchmarks/string/combine.cpp
@@ -14,57 +14,41 @@
  * limitations under the License.
  */
 
-#include "string_bench_args.hpp"
-
+#include <benchmarks/common/generate_input.hpp>
-#include <benchmarks/fixture/benchmark_fixture.hpp>
-#include <benchmarks/synchronization/synchronization.hpp>
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/strings/combine.hpp>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
 
-class StringCombine : public cudf::benchmark {};
+#include <nvbench/nvbench.cuh>
 
-static void BM_combine(benchmark::State& state)
+static void bench_combine(nvbench::state& state)
 {
-  cudf::size_type const n_rows{static_cast<cudf::size_type>(state.range(0))};
-  cudf::size_type const max_str_length{static_cast<cudf::size_type>(state.range(1))};
-  data_profile const table_profile = data_profile_builder().distribution(
-    cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length);
+  auto const num_rows  = static_cast<cudf::size_type>(state.get_int64("num_rows"));
+  auto const row_width = static_cast<cudf::size_type>(state.get_int64("row_width"));
+
+  data_profile const profile = data_profile_builder().distribution(
+    cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width);
   auto const table = create_random_table(
-    {cudf::type_id::STRING, cudf::type_id::STRING}, row_count{n_rows}, table_profile);
+    {cudf::type_id::STRING, cudf::type_id::STRING}, row_count{num_rows}, profile);
   cudf::strings_column_view input1(table->view().column(0));
   cudf::strings_column_view input2(table->view().column(1));
   cudf::string_scalar separator("+");
 
-  for (auto _ : state) {
-    cuda_event_timer raii(state, true, cudf::get_default_stream());
-    cudf::strings::concatenate(table->view(), separator);
-  }
-
-  state.SetBytesProcessed(state.iterations() * (input1.chars_size(cudf::get_default_stream()) +
-                                                input2.chars_size(cudf::get_default_stream())));
-}
+  auto stream = cudf::get_default_stream();
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));
+  auto chars_size =
+    input1.chars_size(stream) + input2.chars_size(stream) + (num_rows * separator.size());
+  state.add_global_memory_reads<nvbench::int8_t>(chars_size);  // all bytes are read;
+  state.add_global_memory_writes<nvbench::int8_t>(chars_size);
 
-static void generate_bench_args(benchmark::internal::Benchmark* b)
-{
-  int const min_rows   = 1 << 12;
-  int const max_rows   = 1 << 24;
-  int const row_mult   = 8;
-  int const min_rowlen = 1 << 4;
-  int const max_rowlen = 1 << 11;
-  int const len_mult   = 4;
-  generate_string_bench_args(b, min_rows, max_rows, row_mult, min_rowlen, max_rowlen, len_mult);
+  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+    auto result = cudf::strings::concatenate(table->view(), separator);
+  });
 }
 
-#define STRINGS_BENCHMARK_DEFINE(name)          \
-  BENCHMARK_DEFINE_F(StringCombine, name)       \
-  (::benchmark::State & st) { BM_combine(st); } \
-  BENCHMARK_REGISTER_F(StringCombine, name)     \
-    ->Apply(generate_bench_args)                \
-    ->UseManualTime()                           \
-    ->Unit(benchmark::kMillisecond);
-
-STRINGS_BENCHMARK_DEFINE(concat)
+NVBENCH_BENCH(bench_combine)
+  .set_name("concat")
+  .add_int64_axis("row_width", {32, 64, 128, 256})
+  .add_int64_axis("num_rows", {32768, 262144, 2097152});
"AbcéDEFGHIJKLMNOPQRSTUVWXYZ 01", - "9876543210,abcdefghijklmnopqrstU", - "9876543210,abcdefghijklmnopqrstU", - "123 édf 4567890 DéFG 0987 X5", - "1", - }) - .release(); - - if (row_width / 32 > 1) { - std::vector columns; - for (int i = 0; i < row_width / 32; ++i) { - columns.push_back(raw_data->view()); - } - raw_data = cudf::strings::concatenate(cudf::table_view(columns)); - } - auto data_view = raw_data->view(); - - // compute number of rows in n_rows that should match - auto matches = static_cast(n_rows * hit_rate) / 100; - - // Create a randomized gather-map to build a column out of the strings in data. - data_profile gather_profile = - data_profile_builder().cardinality(0).null_probability(0.0).distribution( - cudf::type_id::INT32, distribution_id::UNIFORM, 1, data_view.size() - 1); - auto gather_table = - create_random_table({cudf::type_id::INT32}, row_count{n_rows}, gather_profile); - gather_table->get_column(0).set_null_mask(rmm::device_buffer{}, 0); - - // Create scatter map by placing 0-index values throughout the gather-map - auto scatter_data = cudf::sequence( - matches, cudf::numeric_scalar(0), cudf::numeric_scalar(n_rows / matches)); - auto zero_scalar = cudf::numeric_scalar(0); - auto table = cudf::scatter({zero_scalar}, scatter_data->view(), gather_table->view()); - auto gather_map = table->view().column(0); - table = cudf::gather(cudf::table_view({data_view}), gather_map); - - return std::move(table->release().front()); -} - // longer pattern lengths demand more working memory per string std::string patterns[] = {"^\\d+ [a-z]+", "[A-Z ]+\\d+ +\\d+[A-Z]+\\d+$", "5W43"}; @@ -94,7 +39,7 @@ static void bench_contains(nvbench::state& state) state.skip("Skip benchmarks greater than size_type limit"); } - auto col = build_input_column(n_rows, row_width, hit_rate); + auto col = create_string_column(n_rows, row_width, hit_rate); auto input = cudf::strings_column_view(col->view()); auto pattern = patterns[pattern_index]; diff --git a/cpp/benchmarks/string/convert_datetime.cpp b/cpp/benchmarks/string/convert_datetime.cpp index 5deca3664b7..288aa6029d3 100644 --- a/cpp/benchmarks/string/convert_datetime.cpp +++ b/cpp/benchmarks/string/convert_datetime.cpp @@ -16,62 +16,59 @@ #include #include -#include -#include #include #include +#include #include -class StringDateTime : public cudf::benchmark {}; +#include -enum class direction { to, from }; +NVBENCH_DECLARE_TYPE_STRINGS(cudf::timestamp_D, "cudf::timestamp_D", "cudf::timestamp_D"); +NVBENCH_DECLARE_TYPE_STRINGS(cudf::timestamp_s, "cudf::timestamp_s", "cudf::timestamp_s"); +NVBENCH_DECLARE_TYPE_STRINGS(cudf::timestamp_ms, "cudf::timestamp_ms", "cudf::timestamp_ms"); +NVBENCH_DECLARE_TYPE_STRINGS(cudf::timestamp_us, "cudf::timestamp_us", "cudf::timestamp_us"); +NVBENCH_DECLARE_TYPE_STRINGS(cudf::timestamp_ns, "cudf::timestamp_ns", "cudf::timestamp_ns"); -template -void BM_convert_datetime(benchmark::State& state, direction dir) +using Types = nvbench::type_list; + +template +void bench_convert_datetime(nvbench::state& state, nvbench::type_list) { - auto const n_rows = static_cast(state.range(0)); - auto const data_type = cudf::data_type(cudf::type_to_id()); + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const from_ts = state.get_string("dir") == "from"; - auto const column = create_random_column(data_type.id(), row_count{n_rows}); - cudf::column_view input(column->view()); + auto const data_type = cudf::data_type(cudf::type_to_id()); + auto const ts_col = create_random_column(data_type.id(), 
diff --git a/cpp/benchmarks/string/convert_datetime.cpp b/cpp/benchmarks/string/convert_datetime.cpp
index 5deca3664b7..288aa6029d3 100644
--- a/cpp/benchmarks/string/convert_datetime.cpp
+++ b/cpp/benchmarks/string/convert_datetime.cpp
@@ -16,62 +16,59 @@
 
 #include <benchmarks/common/generate_input.hpp>
 #include <benchmarks/fixture/benchmark_fixture.hpp>
-#include <benchmarks/synchronization/synchronization.hpp>
-#include <cudf_test/column_wrapper.hpp>
 #include <cudf/strings/convert/convert_datetime.hpp>
 #include <cudf/strings/strings_column_view.hpp>
+#include <cudf/utilities/default_stream.hpp>
 #include <cudf/wrappers/timestamps.hpp>
 
-class StringDateTime : public cudf::benchmark {};
+#include <nvbench/nvbench.cuh>
 
-enum class direction { to, from };
+NVBENCH_DECLARE_TYPE_STRINGS(cudf::timestamp_D, "cudf::timestamp_D", "cudf::timestamp_D");
+NVBENCH_DECLARE_TYPE_STRINGS(cudf::timestamp_s, "cudf::timestamp_s", "cudf::timestamp_s");
+NVBENCH_DECLARE_TYPE_STRINGS(cudf::timestamp_ms, "cudf::timestamp_ms", "cudf::timestamp_ms");
+NVBENCH_DECLARE_TYPE_STRINGS(cudf::timestamp_us, "cudf::timestamp_us", "cudf::timestamp_us");
+NVBENCH_DECLARE_TYPE_STRINGS(cudf::timestamp_ns, "cudf::timestamp_ns", "cudf::timestamp_ns");
 
-template <class TypeParam>
-void BM_convert_datetime(benchmark::State& state, direction dir)
+using Types = nvbench::type_list<cudf::timestamp_D,
+                                 cudf::timestamp_s,
+                                 cudf::timestamp_ms,
+                                 cudf::timestamp_us,
+                                 cudf::timestamp_ns>;
+
+template <typename DataType>
+void bench_convert_datetime(nvbench::state& state, nvbench::type_list<DataType>)
 {
-  auto const n_rows    = static_cast<cudf::size_type>(state.range(0));
-  auto const data_type = cudf::data_type(cudf::type_to_id<TypeParam>());
+  auto const num_rows = static_cast<cudf::size_type>(state.get_int64("num_rows"));
+  auto const from_ts  = state.get_string("dir") == "from";
 
-  auto const column = create_random_column(data_type.id(), row_count{n_rows});
-  cudf::column_view input(column->view());
+  auto const data_type = cudf::data_type(cudf::type_to_id<DataType>());
+  auto const ts_col    = create_random_column(data_type.id(), row_count{num_rows});
 
-  auto source = dir == direction::to ? cudf::strings::from_timestamps(input, "%Y-%m-%d %H:%M:%S")
-                                     : make_empty_column(cudf::data_type{cudf::type_id::STRING});
-  cudf::strings_column_view source_string(source->view());
+  auto format = std::string{"%Y-%m-%d %H:%M:%S"};
+  auto s_col  = cudf::strings::from_timestamps(ts_col->view(), format);
+  auto sv     = cudf::strings_column_view(s_col->view());
 
-  for (auto _ : state) {
-    cuda_event_timer raii(state, true);
-    if (dir == direction::to)
-      cudf::strings::to_timestamps(source_string, data_type, "%Y-%m-%d %H:%M:%S");
-    else
-      cudf::strings::from_timestamps(input, "%Y-%m-%d %H:%M:%S");
-  }
+  auto stream = cudf::get_default_stream();
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));
 
-  auto const bytes = dir == direction::to ? source_string.chars_size(cudf::get_default_stream())
-                                          : n_rows * sizeof(TypeParam);
-  state.SetBytesProcessed(state.iterations() * bytes);
+  if (from_ts) {
+    state.add_global_memory_reads<DataType>(num_rows);
+    state.add_global_memory_writes<nvbench::int8_t>(sv.chars_size(stream));
+    state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+      cudf::strings::from_timestamps(ts_col->view(), format);
+    });
+  } else {
+    state.add_global_memory_reads<nvbench::int8_t>(sv.chars_size(stream));
+    state.add_global_memory_writes<DataType>(num_rows);
+    state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+      cudf::strings::to_timestamps(sv, data_type, format);
+    });
+  }
 }
 
-#define STR_BENCHMARK_DEFINE(name, type, dir)                             \
-  BENCHMARK_DEFINE_F(StringDateTime, name)(::benchmark::State & state)    \
-  {                                                                       \
-    BM_convert_datetime<type>(state, dir);                                \
-  }                                                                       \
-  BENCHMARK_REGISTER_F(StringDateTime, name)                              \
-    ->RangeMultiplier(1 << 5)                                             \
-    ->Range(1 << 10, 1 << 25)                                             \
-    ->UseManualTime()                                                     \
-    ->Unit(benchmark::kMicrosecond);
-
-STR_BENCHMARK_DEFINE(from_days, cudf::timestamp_D, direction::from);
-STR_BENCHMARK_DEFINE(from_seconds, cudf::timestamp_s, direction::from);
-STR_BENCHMARK_DEFINE(from_mseconds, cudf::timestamp_ms, direction::from);
-STR_BENCHMARK_DEFINE(from_useconds, cudf::timestamp_us, direction::from);
-STR_BENCHMARK_DEFINE(from_nseconds, cudf::timestamp_ns, direction::from);
-
-STR_BENCHMARK_DEFINE(to_days, cudf::timestamp_D, direction::to);
-STR_BENCHMARK_DEFINE(to_seconds, cudf::timestamp_s, direction::to);
-STR_BENCHMARK_DEFINE(to_mseconds, cudf::timestamp_ms, direction::to);
-STR_BENCHMARK_DEFINE(to_useconds, cudf::timestamp_us, direction::to);
-STR_BENCHMARK_DEFINE(to_nseconds, cudf::timestamp_ns, direction::to);
+NVBENCH_BENCH_TYPES(bench_convert_datetime, NVBENCH_TYPE_AXES(Types))
+  .set_name("datetime")
+  .set_type_axes_names({"DataType"})
+  .add_string_axis("dir", {"to", "from"})
+  .add_int64_axis("num_rows", {1 << 16, 1 << 18, 1 << 20, 1 << 22});
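As a worked example of the two directions being measured, a small sketch (assuming an existing timestamp column) that formats and then re-parses the same data:

#include <cudf/column/column_view.hpp>
#include <cudf/strings/convert/convert_datetime.hpp>
#include <cudf/strings/strings_column_view.hpp>

#include <string>

std::unique_ptr<cudf::column> roundtrip_timestamps(cudf::column_view timestamps)
{
  auto const format = std::string{"%Y-%m-%d %H:%M:%S"};
  // "from" direction: timestamps -> strings
  auto s = cudf::strings::from_timestamps(timestamps, format);
  // "to" direction: strings -> timestamps of the original type
  return cudf::strings::to_timestamps(
    cudf::strings_column_view(s->view()), timestamps.type(), format);
}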
*/ +#include #include -#include - -#include #include #include -#include +#include #include -#include -#include - -class DurationsToString : public cudf::benchmark {}; -template -void BM_convert_from_durations(benchmark::State& state) -{ - cudf::size_type const source_size = state.range(0); - - // Every element is valid - auto data = cudf::detail::make_counting_transform_iterator( - 0, [source_size](auto i) { return TypeParam{i - source_size / 2}; }); +#include - cudf::test::fixed_width_column_wrapper source_durations(data, data + source_size); +NVBENCH_DECLARE_TYPE_STRINGS(cudf::duration_D, "cudf::duration_D", "cudf::duration_D"); +NVBENCH_DECLARE_TYPE_STRINGS(cudf::duration_s, "cudf::duration_s", "cudf::duration_s"); +NVBENCH_DECLARE_TYPE_STRINGS(cudf::duration_ms, "cudf::duration_ms", "cudf::duration_ms"); +NVBENCH_DECLARE_TYPE_STRINGS(cudf::duration_us, "cudf::duration_us", "cudf::duration_us"); +NVBENCH_DECLARE_TYPE_STRINGS(cudf::duration_ns, "cudf::duration_ns", "cudf::duration_ns"); - for (auto _ : state) { - cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0 - cudf::strings::from_durations(source_durations, "%D days %H:%M:%S"); - } - - state.SetBytesProcessed(state.iterations() * source_size * sizeof(TypeParam)); -} +using Types = nvbench::type_list; -class StringToDurations : public cudf::benchmark {}; -template -void BM_convert_to_durations(benchmark::State& state) +template +void bench_convert_duration(nvbench::state& state, nvbench::type_list) { - cudf::size_type const source_size = state.range(0); - - // Every element is valid - auto data = cudf::detail::make_counting_transform_iterator( - 0, [source_size](auto i) { return TypeParam{i - source_size / 2}; }); - - cudf::test::fixed_width_column_wrapper source_durations(data, data + source_size); - auto results = cudf::strings::from_durations(source_durations, "%D days %H:%M:%S"); - cudf::strings_column_view source_string(*results); - auto output_type = cudf::data_type(cudf::type_to_id()); - - for (auto _ : state) { - cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0 - cudf::strings::to_durations(source_string, output_type, "%D days %H:%M:%S"); + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const data_type = cudf::data_type(cudf::type_to_id()); + auto const from_dur = state.get_string("dir") == "from"; + + auto const ts_col = create_random_column(data_type.id(), row_count{num_rows}); + cudf::column_view input(ts_col->view()); + + auto format = std::string{"%D days %H:%M:%S"}; + auto stream = cudf::get_default_stream(); + state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); + + if (from_dur) { + state.add_global_memory_reads(num_rows); + state.add_global_memory_writes(format.size() * num_rows); + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { cudf::strings::from_durations(input, format); }); + } else { + auto source = cudf::strings::from_durations(input, format); + auto view = cudf::strings_column_view(source->view()); + state.add_global_memory_reads(view.chars_size(stream)); + state.add_global_memory_writes(num_rows); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + cudf::strings::to_durations(view, data_type, format); + }); } - - state.SetBytesProcessed(state.iterations() * source_size * sizeof(TypeParam)); } -#define DSBM_BENCHMARK_DEFINE(name, type) \ - BENCHMARK_DEFINE_F(DurationsToString, name)(::benchmark::State & state) \ - { \ - BM_convert_from_durations(state); \ - } \ - 
BENCHMARK_REGISTER_F(DurationsToString, name) \ - ->RangeMultiplier(1 << 5) \ - ->Range(1 << 10, 1 << 25) \ - ->UseManualTime() \ - ->Unit(benchmark::kMicrosecond); - -#define SDBM_BENCHMARK_DEFINE(name, type) \ - BENCHMARK_DEFINE_F(StringToDurations, name)(::benchmark::State & state) \ - { \ - BM_convert_to_durations(state); \ - } \ - BENCHMARK_REGISTER_F(StringToDurations, name) \ - ->RangeMultiplier(1 << 5) \ - ->Range(1 << 10, 1 << 25) \ - ->UseManualTime() \ - ->Unit(benchmark::kMicrosecond); - -DSBM_BENCHMARK_DEFINE(from_durations_D, cudf::duration_D); -DSBM_BENCHMARK_DEFINE(from_durations_s, cudf::duration_s); -DSBM_BENCHMARK_DEFINE(from_durations_ms, cudf::duration_ms); -DSBM_BENCHMARK_DEFINE(from_durations_us, cudf::duration_us); -DSBM_BENCHMARK_DEFINE(from_durations_ns, cudf::duration_ns); - -SDBM_BENCHMARK_DEFINE(to_durations_D, cudf::duration_D); -SDBM_BENCHMARK_DEFINE(to_durations_s, cudf::duration_s); -SDBM_BENCHMARK_DEFINE(to_durations_ms, cudf::duration_ms); -SDBM_BENCHMARK_DEFINE(to_durations_us, cudf::duration_us); -SDBM_BENCHMARK_DEFINE(to_durations_ns, cudf::duration_ns); +NVBENCH_BENCH_TYPES(bench_convert_duration, NVBENCH_TYPE_AXES(Types)) + .set_name("duration") + .set_type_axes_names({"DataType"}) + .add_string_axis("dir", {"to", "from"}) + .add_int64_axis("num_rows", {1 << 10, 1 << 15, 1 << 20, 1 << 25}); diff --git a/cpp/benchmarks/string/convert_fixed_point.cpp b/cpp/benchmarks/string/convert_fixed_point.cpp index e5bd794e405..97e114c0795 100644 --- a/cpp/benchmarks/string/convert_fixed_point.cpp +++ b/cpp/benchmarks/string/convert_fixed_point.cpp @@ -16,93 +16,48 @@ #include #include -#include #include #include #include -namespace { +#include -std::unique_ptr get_strings_column(cudf::size_type rows) -{ - auto result = - create_random_column(cudf::type_id::FLOAT32, row_count{static_cast(rows)}); - return cudf::strings::from_floats(result->view()); -} - -} // anonymous namespace - -class StringsToFixedPoint : public cudf::benchmark {}; - -template -void convert_to_fixed_point(benchmark::State& state) -{ - auto const rows = static_cast(state.range(0)); - auto const strings_col = get_strings_column(rows); - auto const strings_view = cudf::strings_column_view(strings_col->view()); - auto const dtype = cudf::data_type{cudf::type_to_id(), numeric::scale_type{-2}}; - - for (auto _ : state) { - cuda_event_timer raii(state, true); - auto volatile results = cudf::strings::to_fixed_point(strings_view, dtype); - } +using Types = nvbench::type_list; - // bytes_processed = bytes_input + bytes_output - state.SetBytesProcessed( - state.iterations() * - (strings_view.chars_size(cudf::get_default_stream()) + rows * cudf::size_of(dtype))); -} - -class StringsFromFixedPoint : public cudf::benchmark {}; +NVBENCH_DECLARE_TYPE_STRINGS(numeric::decimal32, "decimal32", "decimal32"); +NVBENCH_DECLARE_TYPE_STRINGS(numeric::decimal64, "decimal64", "decimal64"); -template -void convert_from_fixed_point(benchmark::State& state) +template +void bench_convert_fixed_point(nvbench::state& state, nvbench::type_list) { - auto const rows = static_cast(state.range(0)); - auto const strings_col = get_strings_column(rows); - auto const dtype = cudf::data_type{cudf::type_to_id(), numeric::scale_type{-2}}; - auto const fp_col = - cudf::strings::to_fixed_point(cudf::strings_column_view(strings_col->view()), dtype); - - std::unique_ptr results = nullptr; - - for (auto _ : state) { - cuda_event_timer raii(state, true); - results = cudf::strings::from_fixed_point(fp_col->view()); + auto const num_rows = 
static_cast(state.get_int64("num_rows")); + auto const from_num = state.get_string("dir") == "from"; + + auto const data_type = cudf::data_type{cudf::type_to_id(), numeric::scale_type{-2}}; + auto const fp_col = create_random_column(data_type.id(), row_count{num_rows}); + + auto const strings_col = cudf::strings::from_fixed_point(fp_col->view()); + auto const sv = cudf::strings_column_view(strings_col->view()); + + auto stream = cudf::get_default_stream(); + state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); + + if (from_num) { + state.add_global_memory_reads(num_rows * cudf::size_of(data_type)); + state.add_global_memory_writes(sv.chars_size(stream)); + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { cudf::strings::to_fixed_point(sv, data_type); }); + } else { + state.add_global_memory_reads(sv.chars_size(stream)); + state.add_global_memory_writes(num_rows * cudf::size_of(data_type)); + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { cudf::strings::from_fixed_point(fp_col->view()); }); } - - // bytes_processed = bytes_input + bytes_output - state.SetBytesProcessed( - state.iterations() * - (cudf::strings_column_view(results->view()).chars_size(cudf::get_default_stream()) + - rows * cudf::size_of(dtype))); } -#define CONVERT_TO_FIXED_POINT_BMD(name, fixed_point_type) \ - BENCHMARK_DEFINE_F(StringsToFixedPoint, name)(::benchmark::State & state) \ - { \ - convert_to_fixed_point(state); \ - } \ - BENCHMARK_REGISTER_F(StringsToFixedPoint, name) \ - ->RangeMultiplier(4) \ - ->Range(1 << 12, 1 << 24) \ - ->UseManualTime() \ - ->Unit(benchmark::kMicrosecond); - -#define CONVERT_FROM_FIXED_POINT_BMD(name, fixed_point_type) \ - BENCHMARK_DEFINE_F(StringsFromFixedPoint, name)(::benchmark::State & state) \ - { \ - convert_from_fixed_point(state); \ - } \ - BENCHMARK_REGISTER_F(StringsFromFixedPoint, name) \ - ->RangeMultiplier(4) \ - ->Range(1 << 12, 1 << 24) \ - ->UseManualTime() \ - ->Unit(benchmark::kMicrosecond); - -CONVERT_TO_FIXED_POINT_BMD(strings_to_decimal32, numeric::decimal32); -CONVERT_TO_FIXED_POINT_BMD(strings_to_decimal64, numeric::decimal64); - -CONVERT_FROM_FIXED_POINT_BMD(strings_from_decimal32, numeric::decimal32); -CONVERT_FROM_FIXED_POINT_BMD(strings_from_decimal64, numeric::decimal64); +NVBENCH_BENCH_TYPES(bench_convert_fixed_point, NVBENCH_TYPE_AXES(Types)) + .set_name("fixed_point") + .set_type_axes_names({"DataType"}) + .add_string_axis("dir", {"to", "from"}) + .add_int64_axis("num_rows", {1 << 16, 1 << 18, 1 << 20, 1 << 22}); diff --git a/cpp/benchmarks/string/convert_numerics.cpp b/cpp/benchmarks/string/convert_numerics.cpp index 8f875c5c80f..e1f650dd6cd 100644 --- a/cpp/benchmarks/string/convert_numerics.cpp +++ b/cpp/benchmarks/string/convert_numerics.cpp @@ -16,117 +16,67 @@ #include #include -#include #include #include #include -namespace { +#include -template -std::unique_ptr get_numerics_column(cudf::size_type rows) -{ - return create_random_column(cudf::type_to_id(), row_count{rows}); -} +namespace { template -std::unique_ptr get_strings_column(cudf::size_type rows) +std::unique_ptr get_strings_column(cudf::column_view const& nv) { - auto const numerics_col = get_numerics_column(rows); if constexpr (std::is_floating_point_v) { - return cudf::strings::from_floats(numerics_col->view()); + return cudf::strings::from_floats(nv); } else { - return cudf::strings::from_integers(numerics_col->view()); - } -} -} // anonymous namespace - -class StringsToNumeric : public cudf::benchmark {}; - -template -void 
convert_to_number(benchmark::State& state) -{ - auto const rows = static_cast(state.range(0)); - - auto const strings_col = get_strings_column(rows); - auto const strings_view = cudf::strings_column_view(strings_col->view()); - auto const col_type = cudf::type_to_id(); - - for (auto _ : state) { - cuda_event_timer raii(state, true); - if constexpr (std::is_floating_point_v) { - cudf::strings::to_floats(strings_view, cudf::data_type{col_type}); - } else { - cudf::strings::to_integers(strings_view, cudf::data_type{col_type}); - } + return cudf::strings::from_integers(nv); } - - // bytes_processed = bytes_input + bytes_output - state.SetBytesProcessed( - state.iterations() * - (strings_view.chars_size(cudf::get_default_stream()) + rows * sizeof(NumericType))); } +} // namespace -class StringsFromNumeric : public cudf::benchmark {}; +using Types = nvbench::type_list; template -void convert_from_number(benchmark::State& state) +void bench_convert_number(nvbench::state& state, nvbench::type_list) { - auto const rows = static_cast(state.range(0)); - - auto const numerics_col = get_numerics_column(rows); - auto const numerics_view = numerics_col->view(); - - std::unique_ptr results = nullptr; - - for (auto _ : state) { - cuda_event_timer raii(state, true); - if constexpr (std::is_floating_point_v) - results = cudf::strings::from_floats(numerics_view); - else - results = cudf::strings::from_integers(numerics_view); + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const from_num = state.get_string("dir") == "from"; + + auto const data_type = cudf::data_type(cudf::type_to_id()); + auto const num_col = create_random_column(data_type.id(), row_count{num_rows}); + + auto const strings_col = get_strings_column(num_col->view()); + auto const sv = cudf::strings_column_view(strings_col->view()); + + auto stream = cudf::get_default_stream(); + state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); + + if (from_num) { + state.add_global_memory_reads(num_rows); + state.add_global_memory_writes(sv.chars_size(stream)); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + if constexpr (std::is_floating_point_v) { + cudf::strings::to_floats(sv, data_type); + } else { + cudf::strings::to_integers(sv, data_type); + } + }); + } else { + state.add_global_memory_reads(sv.chars_size(stream)); + state.add_global_memory_writes(num_rows); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + if constexpr (std::is_floating_point_v) + cudf::strings::from_floats(num_col->view()); + else + cudf::strings::from_integers(num_col->view()); + }); } - - // bytes_processed = bytes_input + bytes_output - state.SetBytesProcessed( - state.iterations() * - (cudf::strings_column_view(results->view()).chars_size(cudf::get_default_stream()) + - rows * sizeof(NumericType))); } -#define CONVERT_TO_NUMERICS_BD(name, type) \ - BENCHMARK_DEFINE_F(StringsToNumeric, name)(::benchmark::State & state) \ - { \ - convert_to_number(state); \ - } \ - BENCHMARK_REGISTER_F(StringsToNumeric, name) \ - ->RangeMultiplier(4) \ - ->Range(1 << 10, 1 << 17) \ - ->UseManualTime() \ - ->Unit(benchmark::kMicrosecond); - -#define CONVERT_FROM_NUMERICS_BD(name, type) \ - BENCHMARK_DEFINE_F(StringsFromNumeric, name)(::benchmark::State & state) \ - { \ - convert_from_number(state); \ - } \ - BENCHMARK_REGISTER_F(StringsFromNumeric, name) \ - ->RangeMultiplier(4) \ - ->Range(1 << 10, 1 << 17) \ - ->UseManualTime() \ - ->Unit(benchmark::kMicrosecond); - -CONVERT_TO_NUMERICS_BD(strings_to_float32, 
float); -CONVERT_TO_NUMERICS_BD(strings_to_float64, double); -CONVERT_TO_NUMERICS_BD(strings_to_int32, int32_t); -CONVERT_TO_NUMERICS_BD(strings_to_int64, int64_t); -CONVERT_TO_NUMERICS_BD(strings_to_uint8, uint8_t); -CONVERT_TO_NUMERICS_BD(strings_to_uint16, uint16_t); - -CONVERT_FROM_NUMERICS_BD(strings_from_float32, float); -CONVERT_FROM_NUMERICS_BD(strings_from_float64, double); -CONVERT_FROM_NUMERICS_BD(strings_from_int32, int32_t); -CONVERT_FROM_NUMERICS_BD(strings_from_int64, int64_t); -CONVERT_FROM_NUMERICS_BD(strings_from_uint8, uint8_t); -CONVERT_FROM_NUMERICS_BD(strings_from_uint16, uint16_t); +NVBENCH_BENCH_TYPES(bench_convert_number, NVBENCH_TYPE_AXES(Types)) + .set_name("numeric") + .set_type_axes_names({"NumericType"}) + .add_string_axis("dir", {"to", "from"}) + .add_int64_axis("num_rows", {1 << 16, 1 << 18, 1 << 20, 1 << 22}); diff --git a/cpp/benchmarks/string/copy.cpp b/cpp/benchmarks/string/copy.cpp new file mode 100644 index 00000000000..2baccd4fad1 --- /dev/null +++ b/cpp/benchmarks/string/copy.cpp @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include + +#include + +static void bench_copy(nvbench::state& state) +{ + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); + auto const api = state.get_string("api"); + + data_profile const table_profile = data_profile_builder().distribution( + cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width); + auto const source = + create_random_table({cudf::type_id::STRING}, row_count{num_rows}, table_profile); + + data_profile const map_profile = data_profile_builder().no_validity().distribution( + cudf::type_to_id(), distribution_id::UNIFORM, 0, num_rows); + auto const map_table = + create_random_table({cudf::type_to_id()}, row_count{num_rows}, map_profile); + auto const map_view = map_table->view().column(0); + + auto stream = cudf::get_default_stream(); + state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); + + if (api == "gather") { + auto result = + cudf::gather(source->view(), map_view, cudf::out_of_bounds_policy::NULLIFY, stream); + auto chars_size = cudf::strings_column_view(result->view().column(0)).chars_size(stream); + state.add_global_memory_reads(chars_size + + (map_view.size() * sizeof(cudf::size_type))); + state.add_global_memory_writes(chars_size); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + cudf::gather(source->view(), map_view, cudf::out_of_bounds_policy::NULLIFY, stream); + }); + } else if (api == "scatter") { + auto const target = + create_random_table({cudf::type_id::STRING}, row_count{num_rows}, table_profile); + auto result = cudf::scatter(source->view(), map_view, target->view(), stream); + auto chars_size = cudf::strings_column_view(result->view().column(0)).chars_size(stream); + 
state.add_global_memory_reads(chars_size + + (map_view.size() * sizeof(cudf::size_type))); + state.add_global_memory_writes(chars_size); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + cudf::scatter(source->view(), map_view, target->view(), stream); + }); + } +} + +NVBENCH_BENCH(bench_copy) + .set_name("copy") + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}) + .add_string_axis("api", {"gather", "scatter"}); diff --git a/cpp/benchmarks/string/copy.cu b/cpp/benchmarks/string/copy.cu deleted file mode 100644 index 6b2f6c3a0a7..00000000000 --- a/cpp/benchmarks/string/copy.cu +++ /dev/null @@ -1,95 +0,0 @@ -/* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "string_bench_args.hpp" - -#include -#include -#include - -#include -#include -#include -#include - -#include -#include -#include -#include - -class StringCopy : public cudf::benchmark {}; - -enum copy_type { gather, scatter }; - -static void BM_copy(benchmark::State& state, copy_type ct) -{ - cudf::size_type const n_rows{static_cast(state.range(0))}; - cudf::size_type const max_str_length{static_cast(state.range(1))}; - data_profile const table_profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length); - - auto const source = - create_random_table({cudf::type_id::STRING}, row_count{n_rows}, table_profile); - auto const target = - create_random_table({cudf::type_id::STRING}, row_count{n_rows}, table_profile); - - // scatter indices - auto index_map_col = make_numeric_column( - cudf::data_type{cudf::type_id::INT32}, n_rows, cudf::mask_state::UNALLOCATED); - auto index_map = index_map_col->mutable_view(); - thrust::shuffle_copy(thrust::device, - thrust::counting_iterator(0), - thrust::counting_iterator(n_rows), - index_map.begin(), - thrust::default_random_engine()); - - for (auto _ : state) { - cuda_event_timer raii(state, true, cudf::get_default_stream()); - switch (ct) { - case gather: cudf::gather(source->view(), index_map); break; - case scatter: cudf::scatter(source->view(), index_map, target->view()); break; - } - } - - state.SetBytesProcessed( - state.iterations() * - cudf::strings_column_view(source->view().column(0)).chars_size(cudf::get_default_stream())); -} - -static void generate_bench_args(benchmark::internal::Benchmark* b) -{ - int const min_rows = 1 << 12; - int const max_rows = 1 << 24; - int const row_mult = 8; - int const min_rowlen = 1 << 5; - int const max_rowlen = 1 << 13; - int const len_mult = 4; - generate_string_bench_args(b, min_rows, max_rows, row_mult, min_rowlen, max_rowlen, len_mult); - - // Benchmark for very small strings - b->Args({67108864, 2}); -} - -#define COPY_BENCHMARK_DEFINE(name) \ - BENCHMARK_DEFINE_F(StringCopy, name) \ - (::benchmark::State & st) { BM_copy(st, copy_type::name); } \ - BENCHMARK_REGISTER_F(StringCopy, name) \ - ->Apply(generate_bench_args) 
\ - ->UseManualTime() \ - ->Unit(benchmark::kMillisecond); - -COPY_BENCHMARK_DEFINE(gather) -COPY_BENCHMARK_DEFINE(scatter) diff --git a/cpp/benchmarks/string/filter.cpp b/cpp/benchmarks/string/filter.cpp index 613834b1f3e..9233db6716d 100644 --- a/cpp/benchmarks/string/filter.cpp +++ b/cpp/benchmarks/string/filter.cpp @@ -14,13 +14,7 @@ * limitations under the License. */ -#include "string_bench_args.hpp" - #include -#include -#include - -#include #include #include @@ -29,57 +23,58 @@ #include #include -#include - -enum FilterAPI { filter, filter_chars, strip }; +#include -class StringFilterChars : public cudf::benchmark {}; +#include -static void BM_filter_chars(benchmark::State& state, FilterAPI api) +static void bench_filter(nvbench::state& state) { - cudf::size_type const n_rows{static_cast(state.range(0))}; - cudf::size_type const max_str_length{static_cast(state.range(1))}; + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); + auto const api = state.get_string("api"); + data_profile const profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length); - auto const column = create_random_column(cudf::type_id::STRING, row_count{n_rows}, profile); - cudf::strings_column_view input(column->view()); + cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width); + auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile); + auto const input = cudf::strings_column_view(column->view()); - auto const types = cudf::strings::string_character_types::SPACE; - std::vector> filter_table{ - {cudf::char_utf8{'a'}, cudf::char_utf8{'c'}}}; + auto stream = cudf::get_default_stream(); + state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); + auto chars_size = input.chars_size(stream); + state.add_global_memory_reads(chars_size); - for (auto _ : state) { - cuda_event_timer raii(state, true, cudf::get_default_stream()); - switch (api) { - case filter: cudf::strings::filter_characters_of_type(input, types); break; - case filter_chars: cudf::strings::filter_characters(input, filter_table); break; - case strip: cudf::strings::strip(input); break; + if (api == "filter") { + auto const types = cudf::strings::string_character_types::SPACE; + { + auto result = cudf::strings::filter_characters_of_type(input, types); + auto sv = cudf::strings_column_view(result->view()); + state.add_global_memory_writes(sv.chars_size(stream)); } + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + cudf::strings::filter_characters_of_type(input, types); + }); + } else if (api == "chars") { + state.add_global_memory_writes(chars_size); + std::vector> filter_table{ + {cudf::char_utf8{'a'}, cudf::char_utf8{'c'}}}; + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + cudf::strings::filter_characters(input, filter_table); + }); + } else if (api == "strip") { + { + auto result = cudf::strings::strip(input); + auto sv = cudf::strings_column_view(result->view()); + state.add_global_memory_writes(sv.chars_size(stream)); + } + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { cudf::strings::strip(input); }); } - - state.SetBytesProcessed(state.iterations() * input.chars_size(cudf::get_default_stream())); -} - -static void generate_bench_args(benchmark::internal::Benchmark* b) -{ - int const min_rows = 1 << 12; - int const 
max_rows = 1 << 24; - int const row_multiplier = 8; - int const min_length = 1 << 5; - int const max_length = 1 << 13; - int const length_multiplier = 2; - generate_string_bench_args( - b, min_rows, max_rows, row_multiplier, min_length, max_length, length_multiplier); } -#define STRINGS_BENCHMARK_DEFINE(name) \ - BENCHMARK_DEFINE_F(StringFilterChars, name) \ - (::benchmark::State & st) { BM_filter_chars(st, FilterAPI::name); } \ - BENCHMARK_REGISTER_F(StringFilterChars, name) \ - ->Apply(generate_bench_args) \ - ->UseManualTime() \ - ->Unit(benchmark::kMillisecond); - -STRINGS_BENCHMARK_DEFINE(filter) -STRINGS_BENCHMARK_DEFINE(filter_chars) -STRINGS_BENCHMARK_DEFINE(strip) +NVBENCH_BENCH(bench_filter) + .set_name("filter") + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}) + .add_string_axis("api", {"filter", "chars", "strip"}); diff --git a/cpp/benchmarks/string/find.cpp b/cpp/benchmarks/string/find.cpp index a9c620e4bf0..3ea3ff13a2f 100644 --- a/cpp/benchmarks/string/find.cpp +++ b/cpp/benchmarks/string/find.cpp @@ -19,20 +19,13 @@ #include -#include #include -#include #include -#include #include #include #include -std::unique_ptr build_input_column(cudf::size_type n_rows, - cudf::size_type row_width, - int32_t hit_rate); - static void bench_find_string(nvbench::state& state) { auto const n_rows = static_cast(state.get_int64("num_rows")); @@ -46,18 +39,16 @@ static void bench_find_string(nvbench::state& state) } auto const stream = cudf::get_default_stream(); - auto const col = build_input_column(n_rows, row_width, hit_rate); + auto const col = create_string_column(n_rows, row_width, hit_rate); auto const input = cudf::strings_column_view(col->view()); - std::vector h_targets({"5W", "5W43", "0987 5W43"}); - cudf::string_scalar target(h_targets[2]); - cudf::test::strings_column_wrapper targets(h_targets.begin(), h_targets.end()); + cudf::string_scalar target("0987 5W43"); state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); auto const chars_size = input.chars_size(stream); state.add_element_count(chars_size, "chars_size"); state.add_global_memory_reads(chars_size); - if (api.substr(0, 4) == "find") { + if (api == "find") { state.add_global_memory_writes(input.size()); } else { state.add_global_memory_writes(input.size()); @@ -66,10 +57,6 @@ static void bench_find_string(nvbench::state& state) if (api == "find") { state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { cudf::strings::find(input, target); }); - } else if (api == "find_multi") { - state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - cudf::strings::find_multiple(input, cudf::strings_column_view(targets)); - }); } else if (api == "contains") { state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { cudf::strings::contains(input, target); }); @@ -84,7 +71,7 @@ static void bench_find_string(nvbench::state& state) NVBENCH_BENCH(bench_find_string) .set_name("find_string") - .add_string_axis("api", {"find", "find_multi", "contains", "starts_with", "ends_with"}) + .add_string_axis("api", {"find", "contains", "starts_with", "ends_with"}) .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024}) .add_int64_axis("num_rows", {260'000, 1'953'000, 16'777'216}) .add_int64_axis("hit_rate", {20, 80}); // percentage diff --git a/cpp/benchmarks/string/find_multiple.cpp b/cpp/benchmarks/string/find_multiple.cpp new file mode 100644 index 00000000000..0e780fdb302 --- /dev/null +++ 
b/cpp/benchmarks/string/find_multiple.cpp
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <benchmarks/common/generate_input.hpp>
+#include <benchmarks/fixture/benchmark_fixture.hpp>
+
+#include <cudf_test/column_wrapper.hpp>
+
+#include <cudf/strings/find_multiple.hpp>
+#include <cudf/strings/strings_column_view.hpp>
+#include <cudf/types.hpp>
+#include <cudf/utilities/default_stream.hpp>
+
+#include <nvbench/nvbench.cuh>
+
+static void bench_find_string(nvbench::state& state)
+{
+  auto const n_rows       = static_cast<cudf::size_type>(state.get_int64("num_rows"));
+  auto const row_width    = static_cast<cudf::size_type>(state.get_int64("row_width"));
+  auto const hit_rate     = static_cast<int32_t>(state.get_int64("hit_rate"));
+  auto const target_count = static_cast<cudf::size_type>(state.get_int64("targets"));
+  auto const api          = state.get_string("api");
+
+  auto const stream = cudf::get_default_stream();
+  auto const col    = create_string_column(n_rows, row_width, hit_rate);
+  auto const input  = cudf::strings_column_view(col->view());
+
+  // Note that these all match the first row of the raw_data in create_string_column.
+  // This is so the hit_rate can be properly accounted for.
+  std::vector<std::string> const target_data(
+    {" abc", "W43", "0987 5W43", "123 abc", "23 abc", "3 abc", "7 5W43", "87 5W43", "987 5W43"});
+  auto h_targets = std::vector<std::string>{};
+  for (cudf::size_type i = 0; i < target_count; i++) {
+    h_targets.emplace_back(target_data[i % target_data.size()]);
+  }
+  cudf::test::strings_column_wrapper targets(h_targets.begin(), h_targets.end());
+
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));
+  auto const chars_size = input.chars_size(stream);
+  state.add_global_memory_reads<nvbench::int8_t>(chars_size);
+  if (api == "find") {
+    state.add_global_memory_writes<cudf::size_type>(input.size());
+  } else {
+    state.add_global_memory_writes<bool>(input.size());
+  }
+
+  if (api == "find") {
+    state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+      cudf::strings::find_multiple(input, cudf::strings_column_view(targets));
+    });
+  } else if (api == "contains") {
+    state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+      cudf::strings::contains_multiple(input, cudf::strings_column_view(targets));
+    });
+  }
+}
+
+NVBENCH_BENCH(bench_find_string)
+  .set_name("find_multiple")
+  .add_string_axis("api", {"find", "contains"})
+  .add_int64_axis("targets", {10, 20, 40})
+  .add_int64_axis("row_width", {32, 64, 128, 256})
+  .add_int64_axis("num_rows", {32768, 262144, 2097152})
+  .add_int64_axis("hit_rate", {20, 80});  // percentage
diff --git a/cpp/benchmarks/string/gather.cpp b/cpp/benchmarks/string/gather.cpp
deleted file mode 100644
index 5b1c679be7d..00000000000
--- a/cpp/benchmarks/string/gather.cpp
+++ /dev/null
@@ -1,60 +0,0 @@
-/*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include - -#include -#include -#include - -#include - -static void bench_gather(nvbench::state& state) -{ - auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); - - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - - data_profile const table_profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); - auto const input_table = - create_random_table({cudf::type_id::STRING}, row_count{num_rows}, table_profile); - - data_profile const map_profile = data_profile_builder().no_validity().distribution( - cudf::type_id::INT32, distribution_id::UNIFORM, 0, num_rows); - auto const map_table = - create_random_table({cudf::type_id::INT32}, row_count{num_rows}, map_profile); - - state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); - auto chars_size = - cudf::strings_column_view(input_table->view().column(0)).chars_size(cudf::get_default_stream()); - state.add_global_memory_reads(chars_size); // all bytes are read; - state.add_global_memory_writes(chars_size); - - state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - auto result = cudf::gather( - input_table->view(), map_table->view().column(0), cudf::out_of_bounds_policy::NULLIFY); - }); -} - -NVBENCH_BENCH(bench_gather) - .set_name("gather") - .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048, 4096}) - .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}); diff --git a/cpp/benchmarks/string/like.cpp b/cpp/benchmarks/string/like.cpp index 99cef640dc3..105ae65cbe8 100644 --- a/cpp/benchmarks/string/like.cpp +++ b/cpp/benchmarks/string/like.cpp @@ -18,68 +18,12 @@ #include -#include -#include -#include #include #include #include #include -namespace { -std::unique_ptr build_input_column(cudf::size_type n_rows, - cudf::size_type row_width, - int32_t hit_rate) -{ - // build input table using the following data - auto raw_data = cudf::test::strings_column_wrapper( - { - "123 abc 4567890 DEFGHI 0987 5W43", // matches always; - "012345 6789 01234 56789 0123 456", // the rest do not match - "abc 4567890 DEFGHI 0987 Wxyz 123", - "abcdefghijklmnopqrstuvwxyz 01234", - "", - "AbcéDEFGHIJKLMNOPQRSTUVWXYZ 01", - "9876543210,abcdefghijklmnopqrstU", - "9876543210,abcdefghijklmnopqrstU", - "123 édf 4567890 DéFG 0987 X5", - "1", - }) - .release(); - if (row_width / 32 > 1) { - std::vector columns; - for (int i = 0; i < row_width / 32; ++i) { - columns.push_back(raw_data->view()); - } - raw_data = cudf::strings::concatenate(cudf::table_view(columns)); - } - auto data_view = raw_data->view(); - - // compute number of rows in n_rows that should match - auto matches = static_cast(n_rows * hit_rate) / 100; - - // Create a randomized gather-map to build a column out of the strings in data. 
- data_profile gather_profile = - data_profile_builder().cardinality(0).null_probability(0.0).distribution( - cudf::type_id::INT32, distribution_id::UNIFORM, 1, data_view.size() - 1); - auto gather_table = - create_random_table({cudf::type_id::INT32}, row_count{n_rows}, gather_profile); - gather_table->get_column(0).set_null_mask(rmm::device_buffer{}, 0); - - // Create scatter map by placing 0-index values throughout the gather-map - auto scatter_data = cudf::sequence( - matches, cudf::numeric_scalar(0), cudf::numeric_scalar(n_rows / matches)); - auto zero_scalar = cudf::numeric_scalar(0); - auto table = cudf::scatter({zero_scalar}, scatter_data->view(), gather_table->view()); - auto gather_map = table->view().column(0); - table = cudf::gather(cudf::table_view({data_view}), gather_map); - - return std::move(table->release().front()); -} - -} // namespace - static void bench_like(nvbench::state& state) { auto const n_rows = static_cast(state.get_int64("num_rows")); @@ -91,7 +35,7 @@ static void bench_like(nvbench::state& state) state.skip("Skip benchmarks greater than size_type limit"); } - auto col = build_input_column(n_rows, row_width, hit_rate); + auto col = create_string_column(n_rows, row_width, hit_rate); auto input = cudf::strings_column_view(col->view()); // This pattern forces reading the entire target string (when matched expected) diff --git a/cpp/benchmarks/string/make_strings_column.cu b/cpp/benchmarks/string/make_strings_column.cu new file mode 100644 index 00000000000..e86824b9f40 --- /dev/null +++ b/cpp/benchmarks/string/make_strings_column.cu @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include + +#include +#include + +#include +#include + +#include + +#include + +namespace { + +constexpr int min_row_width = 0; +constexpr int max_row_width = 50; + +using string_index_pair = thrust::pair; + +template +std::vector> make_strings_columns( + std::vector> const& input, + rmm::cuda_stream_view stream) +{ + if constexpr (batch_construction) { + return cudf::make_strings_column_batch(input, stream); + } else { + std::vector> output; + output.reserve(input.size()); + for (auto const& column_input : input) { + output.emplace_back(cudf::make_strings_column(column_input, stream)); + } + return output; + } +} + +} // namespace + +static void BM_make_strings_column_batch(nvbench::state& state) +{ + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const batch_size = static_cast(state.get_int64("batch_size")); + auto const has_nulls = true; + + data_profile const table_profile = + data_profile_builder() + .distribution(cudf::type_id::STRING, distribution_id::NORMAL, min_row_width, max_row_width) + .null_probability(has_nulls ? 
std::optional{0.1} : std::nullopt); + auto const data_table = create_random_table( + cycle_dtypes({cudf::type_id::STRING}, batch_size), row_count{num_rows}, table_profile); + + auto const stream = cudf::get_default_stream(); + auto input_data = std::vector>{}; + auto input = std::vector>{}; + input_data.reserve(batch_size); + input.reserve(batch_size); + for (auto const& cv : data_table->view()) { + auto const d_data_ptr = cudf::column_device_view::create(cv, stream); + auto batch_input = rmm::device_uvector(cv.size(), stream); + thrust::tabulate(rmm::exec_policy(stream), + batch_input.begin(), + batch_input.end(), + [data_col = *d_data_ptr] __device__(auto const idx) { + if (data_col.is_null(idx)) { return string_index_pair{nullptr, 0}; } + auto const row = data_col.element(idx); + return string_index_pair{row.data(), row.size_bytes()}; + }); + input_data.emplace_back(std::move(batch_input)); + input.emplace_back(input_data.back()); + } + + state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + [[maybe_unused]] auto const output = make_strings_columns(input, stream); + }); +} + +NVBENCH_BENCH(BM_make_strings_column_batch) + .set_name("make_strings_column_batch") + .add_int64_axis("num_rows", {100'000, 500'000, 1'000'000, 2'000'000}) + .add_int64_axis("batch_size", {10, 20, 50, 100}); diff --git a/cpp/benchmarks/string/repeat_strings.cpp b/cpp/benchmarks/string/repeat_strings.cpp index f1d1516f248..29012e2cbf9 100644 --- a/cpp/benchmarks/string/repeat_strings.cpp +++ b/cpp/benchmarks/string/repeat_strings.cpp @@ -14,99 +14,58 @@ * limitations under the License. */ -#include "string_bench_args.hpp" - #include #include -#include #include #include #include -static constexpr cudf::size_type default_repeat_times = 16; -static constexpr cudf::size_type min_repeat_times = -16; -static constexpr cudf::size_type max_repeat_times = 16; +#include -static std::unique_ptr create_data_table(cudf::size_type n_cols, - cudf::size_type n_rows, - cudf::size_type max_str_length) +static void bench_repeat(nvbench::state& state) { - CUDF_EXPECTS(n_cols == 1 || n_cols == 2, "Invalid number of columns."); + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); + auto const min_repeat = static_cast(state.get_int64("min_repeat")); + auto const max_repeat = static_cast(state.get_int64("max_repeat")); + auto const api = state.get_string("api"); - std::vector dtype_ids{cudf::type_id::STRING}; auto builder = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length); - - if (n_cols == 2) { - dtype_ids.push_back(cudf::type_id::INT32); - builder.distribution( - cudf::type_id::INT32, distribution_id::NORMAL, min_repeat_times, max_repeat_times); + cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width); + builder.distribution(cudf::type_id::INT32, distribution_id::NORMAL, min_repeat, max_repeat); + + auto const table = create_random_table( + {cudf::type_id::STRING, cudf::type_id::INT32}, row_count{num_rows}, data_profile{builder}); + auto const input = cudf::strings_column_view(table->view().column(0)); + + auto stream = cudf::get_default_stream(); + state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); + auto chars_size = input.chars_size(stream); + state.add_global_memory_reads(chars_size); + + if (api == "scalar") 
{ + state.add_global_memory_writes(chars_size * max_repeat); + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { cudf::strings::repeat_strings(input, max_repeat); }); + } else if (api == "column") { + auto repeats = table->view().column(1); + { + auto result = cudf::strings::repeat_strings(input, repeats); + auto output = cudf::strings_column_view(result->view()); + state.add_global_memory_writes(output.chars_size(stream)); + } + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { cudf::strings::repeat_strings(input, repeats); }); } - - return create_random_table(dtype_ids, row_count{n_rows}, data_profile{builder}); } -static void BM_repeat_strings_scalar_times(benchmark::State& state) -{ - auto const n_rows = static_cast(state.range(0)); - auto const max_str_length = static_cast(state.range(1)); - auto const table = create_data_table(1, n_rows, max_str_length); - auto const strings_col = cudf::strings_column_view(table->view().column(0)); - - for ([[maybe_unused]] auto _ : state) { - [[maybe_unused]] cuda_event_timer raii(state, true, cudf::get_default_stream()); - cudf::strings::repeat_strings(strings_col, default_repeat_times); - } - - state.SetBytesProcessed(state.iterations() * strings_col.chars_size(cudf::get_default_stream())); -} - -static void BM_repeat_strings_column_times(benchmark::State& state) -{ - auto const n_rows = static_cast(state.range(0)); - auto const max_str_length = static_cast(state.range(1)); - auto const table = create_data_table(2, n_rows, max_str_length); - auto const strings_col = cudf::strings_column_view(table->view().column(0)); - auto const repeat_times_col = table->view().column(1); - - for ([[maybe_unused]] auto _ : state) { - [[maybe_unused]] cuda_event_timer raii(state, true, cudf::get_default_stream()); - cudf::strings::repeat_strings(strings_col, repeat_times_col); - } - - state.SetBytesProcessed(state.iterations() * (strings_col.chars_size(cudf::get_default_stream()) + - repeat_times_col.size() * sizeof(int32_t))); -} - -static void generate_bench_args(benchmark::internal::Benchmark* b) -{ - int const min_rows = 1 << 8; - int const max_rows = 1 << 18; - int const row_mult = 4; - int const min_strlen = 1 << 4; - int const max_strlen = 1 << 8; - int const len_mult = 4; - generate_string_bench_args(b, min_rows, max_rows, row_mult, min_strlen, max_strlen, len_mult); -} - -class RepeatStrings : public cudf::benchmark {}; - -#define REPEAT_STRINGS_SCALAR_TIMES_BENCHMARK_DEFINE(name) \ - BENCHMARK_DEFINE_F(RepeatStrings, name) \ - (::benchmark::State & st) { BM_repeat_strings_scalar_times(st); } \ - BENCHMARK_REGISTER_F(RepeatStrings, name) \ - ->Apply(generate_bench_args) \ - ->UseManualTime() \ - ->Unit(benchmark::kMillisecond); - -#define REPEAT_STRINGS_COLUMN_TIMES_BENCHMARK_DEFINE(name) \ - BENCHMARK_DEFINE_F(RepeatStrings, name) \ - (::benchmark::State & st) { BM_repeat_strings_column_times(st); } \ - BENCHMARK_REGISTER_F(RepeatStrings, name) \ - ->Apply(generate_bench_args) \ - ->UseManualTime() \ - ->Unit(benchmark::kMillisecond); - -REPEAT_STRINGS_SCALAR_TIMES_BENCHMARK_DEFINE(scalar_times) -REPEAT_STRINGS_COLUMN_TIMES_BENCHMARK_DEFINE(column_times) +NVBENCH_BENCH(bench_repeat) + .set_name("repeat") + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("min_repeat", {0}) + .add_int64_axis("max_repeat", {16}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}) + .add_string_axis("api", {"scalar", "column"}); diff --git a/cpp/benchmarks/string/replace.cpp 
b/cpp/benchmarks/string/replace.cpp index 3d9d51bfd6d..643e857f356 100644 --- a/cpp/benchmarks/string/replace.cpp +++ b/cpp/benchmarks/string/replace.cpp @@ -14,11 +14,8 @@ * limitations under the License. */ -#include "string_bench_args.hpp" - #include #include -#include #include @@ -27,59 +24,51 @@ #include #include -#include - -class StringReplace : public cudf::benchmark {}; +#include enum replace_type { scalar, slice, multi }; -static void BM_replace(benchmark::State& state, replace_type rt) +static void bench_replace(nvbench::state& state) { - cudf::size_type const n_rows{static_cast(state.range(0))}; - cudf::size_type const max_str_length{static_cast(state.range(1))}; + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); + auto const api = state.get_string("api"); + data_profile const profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length); - auto const column = create_random_column(cudf::type_id::STRING, row_count{n_rows}, profile); - cudf::strings_column_view input(column->view()); - cudf::string_scalar target("+"); - cudf::string_scalar repl(""); - cudf::test::strings_column_wrapper targets({"+", "-"}); - cudf::test::strings_column_wrapper repls({"", ""}); + cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width); + auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile); - for (auto _ : state) { - cuda_event_timer raii(state, true, cudf::get_default_stream()); - switch (rt) { - case scalar: cudf::strings::replace(input, target, repl); break; - case slice: cudf::strings::replace_slice(input, repl, 1, 10); break; - case multi: - cudf::strings::replace_multiple( - input, cudf::strings_column_view(targets), cudf::strings_column_view(repls)); - break; - } - } + cudf::strings_column_view input(column->view()); - state.SetBytesProcessed(state.iterations() * input.chars_size(cudf::get_default_stream())); -} + auto stream = cudf::get_default_stream(); + state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); + auto const chars_size = input.chars_size(stream); + state.add_global_memory_reads(chars_size); + state.add_global_memory_writes(chars_size); -static void generate_bench_args(benchmark::internal::Benchmark* b) -{ - int const min_rows = 1 << 12; - int const max_rows = 1 << 24; - int const row_mult = 8; - int const min_rowlen = 1 << 5; - int const max_rowlen = 1 << 13; - int const len_mult = 2; - generate_string_bench_args(b, min_rows, max_rows, row_mult, min_rowlen, max_rowlen, len_mult); + if (api == "scalar") { + cudf::string_scalar target("+"); + cudf::string_scalar repl("-"); + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { cudf::strings::replace(input, target, repl); }); + } else if (api == "multi") { + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + cudf::test::strings_column_wrapper targets({"+", " "}); + cudf::test::strings_column_wrapper repls({"-", "_"}); + cudf::strings::replace_multiple( + input, cudf::strings_column_view(targets), cudf::strings_column_view(repls)); + }); + } else if (api == "slice") { + cudf::string_scalar repl("0123456789"); + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { cudf::strings::replace_slice(input, repl, 1, 10); }); + } } -#define STRINGS_BENCHMARK_DEFINE(name) \ - BENCHMARK_DEFINE_F(StringReplace, name) \ - 
(::benchmark::State & st) { BM_replace(st, replace_type::name); } \ - BENCHMARK_REGISTER_F(StringReplace, name) \ - ->Apply(generate_bench_args) \ - ->UseManualTime() \ - ->Unit(benchmark::kMillisecond); - -STRINGS_BENCHMARK_DEFINE(scalar) -STRINGS_BENCHMARK_DEFINE(slice) -STRINGS_BENCHMARK_DEFINE(multi) +NVBENCH_BENCH(bench_replace) + .set_name("replace") + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}) + .add_string_axis("api", {"scalar", "multi", "slice"}); diff --git a/cpp/benchmarks/string/translate.cpp b/cpp/benchmarks/string/translate.cpp index dc3c8c71488..020ab3ca965 100644 --- a/cpp/benchmarks/string/translate.cpp +++ b/cpp/benchmarks/string/translate.cpp @@ -14,13 +14,7 @@ * limitations under the License. */ -#include "string_bench_args.hpp" - #include -#include -#include - -#include #include #include @@ -28,20 +22,24 @@ #include -#include +#include -class StringTranslate : public cudf::benchmark {}; +#include +#include using entry_type = std::pair; -static void BM_translate(benchmark::State& state, int entry_count) +static void bench_translate(nvbench::state& state) { - cudf::size_type const n_rows{static_cast(state.range(0))}; - cudf::size_type const max_str_length{static_cast(state.range(1))}; + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); + auto const entry_count = static_cast(state.get_int64("entries")); + data_profile const profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length); - auto const column = create_random_column(cudf::type_id::STRING, row_count{n_rows}, profile); - cudf::strings_column_view input(column->view()); + cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width); + auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile); + auto const input = cudf::strings_column_view(column->view()); std::vector entries(entry_count); std::transform(thrust::counting_iterator(0), @@ -51,33 +49,19 @@ static void BM_translate(benchmark::State& state, int entry_count) return entry_type{'!' 
+ idx, '~' - idx}; }); - for (auto _ : state) { - cuda_event_timer raii(state, true, cudf::get_default_stream()); - cudf::strings::translate(input, entries); - } + auto stream = cudf::get_default_stream(); + state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); + auto chars_size = input.chars_size(stream); + state.add_global_memory_reads(chars_size); + state.add_global_memory_writes(chars_size); - state.SetBytesProcessed(state.iterations() * input.chars_size(cudf::get_default_stream())); + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { cudf::strings::translate(input, entries); }); } -static void generate_bench_args(benchmark::internal::Benchmark* b) -{ - int const min_rows = 1 << 12; - int const max_rows = 1 << 24; - int const row_mult = 8; - int const min_rowlen = 1 << 5; - int const max_rowlen = 1 << 13; - int const len_mult = 4; - generate_string_bench_args(b, min_rows, max_rows, row_mult, min_rowlen, max_rowlen, len_mult); -} - -#define STRINGS_BENCHMARK_DEFINE(name, entries) \ - BENCHMARK_DEFINE_F(StringTranslate, name) \ - (::benchmark::State & st) { BM_translate(st, entries); } \ - BENCHMARK_REGISTER_F(StringTranslate, name) \ - ->Apply(generate_bench_args) \ - ->UseManualTime() \ - ->Unit(benchmark::kMillisecond); - -STRINGS_BENCHMARK_DEFINE(translate_small, 5) -STRINGS_BENCHMARK_DEFINE(translate_medium, 25) -STRINGS_BENCHMARK_DEFINE(translate_large, 50) +NVBENCH_BENCH(bench_translate) + .set_name("translate") + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}) + .add_int64_axis("entries", {5, 25, 50}); diff --git a/cpp/benchmarks/string/url_decode.cu b/cpp/benchmarks/string/url_decode.cu index 7720e585023..cee2a246838 100644 --- a/cpp/benchmarks/string/url_decode.cu +++ b/cpp/benchmarks/string/url_decode.cu @@ -15,48 +15,40 @@ */ #include -#include - -#include +#include +#include #include -#include #include +#include #include #include -#include #include #include #include #include -#include #include -#include + +#include struct url_string_generator { - char* chars; + cudf::column_device_view d_strings; double esc_seq_chance; thrust::minstd_rand engine; - thrust::uniform_real_distribution esc_seq_dist; - url_string_generator(char* c, double esc_seq_chance, thrust::minstd_rand& engine) - : chars(c), esc_seq_chance(esc_seq_chance), engine(engine), esc_seq_dist(0, 1) - { - } + thrust::uniform_real_distribution esc_seq_dist{0, 1}; - __device__ void operator()(thrust::tuple str_begin_end) + __device__ void operator()(cudf::size_type idx) { - auto begin = thrust::get<0>(str_begin_end); - auto end = thrust::get<1>(str_begin_end); - engine.discard(begin); - for (auto i = begin; i < end; ++i) { - if (esc_seq_dist(engine) < esc_seq_chance and i < end - 3) { + engine.discard(idx); + auto d_str = d_strings.element(idx); + auto chars = const_cast(d_str.data()); + for (auto i = 0; i < d_str.size_bytes() - 3; ++i) { + if (esc_seq_dist(engine) < esc_seq_chance) { chars[i] = '%'; chars[i + 1] = '2'; chars[i + 2] = '0'; i += 2; - } else { - chars[i] = 'a'; } } } @@ -64,50 +56,44 @@ struct url_string_generator { auto generate_column(cudf::size_type num_rows, cudf::size_type chars_per_row, double esc_seq_chance) { - std::vector strings{std::string(chars_per_row, 'a')}; - auto col_1a = cudf::test::strings_column_wrapper(strings.begin(), strings.end()); - auto table_a = cudf::repeat(cudf::table_view{{col_1a}}, num_rows); - auto result_col = std::move(table_a->release()[0]); // 
string column with num_rows aaa... - auto chars_data = static_cast(result_col->mutable_view().head()); - auto offset_col = result_col->child(cudf::strings_column_view::offsets_column_index).view(); - auto offset_itr = cudf::detail::offsetalator_factory::make_input_iterator(offset_col); + auto str_row = std::string(chars_per_row, 'a'); + auto result_col = cudf::make_column_from_scalar(cudf::string_scalar(str_row), num_rows); + auto d_strings = cudf::column_device_view::create(result_col->view()); auto engine = thrust::default_random_engine{}; thrust::for_each_n(thrust::device, - thrust::make_zip_iterator(offset_itr, offset_itr + 1), + thrust::counting_iterator(0), num_rows, - url_string_generator{chars_data, esc_seq_chance, engine}); + url_string_generator{*d_strings, esc_seq_chance, engine}); return result_col; } -class UrlDecode : public cudf::benchmark {}; - -void BM_url_decode(benchmark::State& state, int esc_seq_pct) +static void bench_url_decode(nvbench::state& state) { - cudf::size_type const num_rows = state.range(0); - cudf::size_type const chars_per_row = state.range(1); + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const row_width = static_cast(state.get_int64("row_width")); + auto const esc_seq_pct = static_cast(state.get_int64("esc_seq_pct")); + + auto column = generate_column(num_rows, row_width, esc_seq_pct / 100.0); + auto input = cudf::strings_column_view(column->view()); - auto column = generate_column(num_rows, chars_per_row, esc_seq_pct / 100.0); - auto strings_view = cudf::strings_column_view(column->view()); + auto stream = cudf::get_default_stream(); + state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); + auto chars_size = input.chars_size(stream); + state.add_global_memory_reads(chars_size); - for (auto _ : state) { - cuda_event_timer raii(state, true, cudf::get_default_stream()); - auto result = cudf::strings::url_decode(strings_view); + { + auto result = cudf::strings::url_decode(input); + auto sv = cudf::strings_column_view(result->view()); + state.add_global_memory_writes(sv.chars_size(stream)); } - state.SetBytesProcessed(state.iterations() * num_rows * - (chars_per_row + sizeof(cudf::size_type))); + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { cudf::strings::url_decode(input); }); } -#define URLD_BENCHMARK_DEFINE(esc_seq_pct) \ - BENCHMARK_DEFINE_F(UrlDecode, esc_seq_pct) \ - (::benchmark::State & st) { BM_url_decode(st, esc_seq_pct); } \ - BENCHMARK_REGISTER_F(UrlDecode, esc_seq_pct) \ - ->Args({100000000, 10}) \ - ->Args({10000000, 100}) \ - ->Args({1000000, 1000}) \ - ->Unit(benchmark::kMillisecond) \ - ->UseManualTime(); - -URLD_BENCHMARK_DEFINE(10) -URLD_BENCHMARK_DEFINE(50) +NVBENCH_BENCH(bench_url_decode) + .set_name("url_decode") + .add_int64_axis("row_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}) + .add_int64_axis("esc_seq_pct", {10, 50}); diff --git a/cpp/benchmarks/text/minhash.cpp b/cpp/benchmarks/text/minhash.cpp index 31ce60d8f9a..a80d0dcbdb8 100644 --- a/cpp/benchmarks/text/minhash.cpp +++ b/cpp/benchmarks/text/minhash.cpp @@ -20,8 +20,6 @@ #include -#include - #include static void bench_minhash(nvbench::state& state) @@ -29,26 +27,25 @@ static void bench_minhash(nvbench::state& state) auto const num_rows = static_cast(state.get_int64("num_rows")); auto const row_width = static_cast(state.get_int64("row_width")); auto const hash_width = static_cast(state.get_int64("hash_width")); - auto const seed_count = 
+  auto const parameters = static_cast<cudf::size_type>(state.get_int64("parameters"));
   auto const base64     = state.get_int64("hash_type") == 64;

-  if (static_cast<std::size_t>(num_rows) * static_cast<std::size_t>(row_width) >=
-      static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max())) {
-    state.skip("Skip benchmarks greater than size_type limit");
-  }
-
   data_profile const strings_profile = data_profile_builder().distribution(
     cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width);
   auto const strings_table =
     create_random_table({cudf::type_id::STRING}, row_count{num_rows}, strings_profile);
   cudf::strings_column_view input(strings_table->view().column(0));

-  data_profile const seeds_profile = data_profile_builder().null_probability(0).distribution(
-    cudf::type_to_id<cudf::hash_value_type>(), distribution_id::NORMAL, 0, row_width);
-  auto const seed_type   = base64 ? cudf::type_id::UINT64 : cudf::type_id::UINT32;
-  auto const seeds_table = create_random_table({seed_type}, row_count{seed_count}, seeds_profile);
-  auto seeds             = seeds_table->get_column(0);
-  seeds.set_null_mask(rmm::device_buffer{}, 0);
+  data_profile const param_profile = data_profile_builder().no_validity().distribution(
+    cudf::type_to_id<cudf::hash_value_type>(),
+    distribution_id::NORMAL,
+    0u,
+    std::numeric_limits<cudf::hash_value_type>::max());
+  auto const param_type = base64 ? cudf::type_id::UINT64 : cudf::type_id::UINT32;
+  auto const param_table =
+    create_random_table({param_type, param_type}, row_count{parameters}, param_profile);
+  auto const parameters_a = param_table->view().column(0);
+  auto const parameters_b = param_table->view().column(1);

   state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
@@ -57,15 +54,16 @@
   state.add_global_memory_writes(num_rows);  // output are hashes

   state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
-    auto result = base64 ? nvtext::minhash64(input, seeds.view(), hash_width)
-                         : nvtext::minhash(input, seeds.view(), hash_width);
+    auto result = base64
+                    ? nvtext::minhash64_permuted(input, 0, parameters_a, parameters_b, hash_width)
+                    : nvtext::minhash_permuted(input, 0, parameters_a, parameters_b, hash_width);
   });
 }

 NVBENCH_BENCH(bench_minhash)
   .set_name("minhash")
-  .add_int64_axis("num_rows", {1024, 8192, 16364, 131072})
-  .add_int64_axis("row_width", {128, 512, 2048})
-  .add_int64_axis("hash_width", {5, 10})
-  .add_int64_axis("seed_count", {2, 26})
+  .add_int64_axis("num_rows", {15000, 30000, 60000})
+  .add_int64_axis("row_width", {6000, 28000, 50000})
+  .add_int64_axis("hash_width", {12, 24})
+  .add_int64_axis("parameters", {26, 260})
   .add_int64_axis("hash_type", {32, 64});
diff --git a/cpp/benchmarks/text/ngrams.cpp b/cpp/benchmarks/text/ngrams.cpp
index 8e48f8e9a05..43d57201b20 100644
--- a/cpp/benchmarks/text/ngrams.cpp
+++ b/cpp/benchmarks/text/ngrams.cpp
@@ -15,58 +15,45 @@
  */

 #include
-#include
-#include
-#include

 #include

 #include

-class TextNGrams : public cudf::benchmark {};
+#include

-enum class ngrams_type { tokens, characters };
-
-static void BM_ngrams(benchmark::State& state, ngrams_type nt)
+static void bench_ngrams(nvbench::state& state)
 {
-  auto const n_rows         = static_cast<cudf::size_type>(state.range(0));
-  auto const max_str_length = static_cast<cudf::size_type>(state.range(1));
+  auto const num_rows   = static_cast<cudf::size_type>(state.get_int64("num_rows"));
+  auto const row_width  = static_cast<cudf::size_type>(state.get_int64("row_width"));
+  auto const ngram_type = state.get_string("type");
+
   data_profile const profile = data_profile_builder().distribution(
-    cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length);
-  auto const column = create_random_column(cudf::type_id::STRING, row_count{n_rows}, profile);
+    cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width);
+  auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile);
   cudf::strings_column_view input(column->view());
   auto const separator = cudf::string_scalar("_");

-  for (auto _ : state) {
-    cuda_event_timer raii(state, true);
-    switch (nt) {
-      case ngrams_type::tokens: nvtext::generate_ngrams(input, 2, separator); break;
-      case ngrams_type::characters: nvtext::generate_character_ngrams(input); break;
-    }
-  }
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));

-  state.SetBytesProcessed(state.iterations() * input.chars_size(cudf::get_default_stream()));
-}
+  auto chars_size = input.chars_size(cudf::get_default_stream());
+  state.add_global_memory_reads<nvbench::int8_t>(chars_size);
+  state.add_global_memory_writes<nvbench::int8_t>(chars_size * 2);

-static void generate_bench_args(benchmark::internal::Benchmark* b)
-{
-  int const min_rows   = 1 << 12;
-  int const max_rows   = 1 << 24;
-  int const row_mult   = 8;
-  int const min_rowlen = 5;
-  int const max_rowlen = 40;
-  int const len_mult   = 2;
-  generate_string_bench_args(b, min_rows, max_rows, row_mult, min_rowlen, max_rowlen, len_mult);
+  if (ngram_type == "chars") {
+    state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+      auto result = nvtext::generate_character_ngrams(input);
+    });
+  } else {
+    state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+      auto result = nvtext::generate_ngrams(input, 2, separator);
+    });
+  }
 }

-#define NVTEXT_BENCHMARK_DEFINE(name)                             \
-  BENCHMARK_DEFINE_F(TextNGrams, name)                            \
-  (::benchmark::State & st) { BM_ngrams(st, ngrams_type::name); } \
-  BENCHMARK_REGISTER_F(TextNGrams, name)                          \
-    ->Apply(generate_bench_args)                                  \
-    ->UseManualTime()                                             \
-    ->Unit(benchmark::kMillisecond);
-
-NVTEXT_BENCHMARK_DEFINE(tokens)
-NVTEXT_BENCHMARK_DEFINE(characters)
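The new nvbench registration for ngrams follows. For context on how such converted sources are built at all: each nvbench benchmark is registered as an executable in cpp/benchmarks/CMakeLists.txt rather than through the removed Google Benchmark macros. A hedged sketch of that registration, assuming cudf's ConfigureNVBench helper (which creates the target and links it against nvbench::main); the target name and source grouping below are illustrative assumptions, not part of this diff:

# Hedged sketch (not in this diff): registering converted nvbench sources.
# ConfigureNVBench is cudf's benchmark helper; TEXT_NVBENCH and this exact
# source list are assumptions for illustration only.
ConfigureNVBench(TEXT_NVBENCH text/minhash.cpp text/ngrams.cpp)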
+NVBENCH_BENCH(bench_ngrams) + .set_name("ngrams") + .add_int64_axis("num_rows", {131072, 262144, 524288, 1048578}) + .add_int64_axis("row_width", {10, 20, 40, 100}) + .add_string_axis("type", {"chars", "tokens"}); diff --git a/cpp/cmake/Modules/FindCUDAToolkit.cmake b/cpp/cmake/Modules/FindCUDAToolkit.cmake new file mode 100644 index 00000000000..6f0272aa2d7 --- /dev/null +++ b/cpp/cmake/Modules/FindCUDAToolkit.cmake @@ -0,0 +1,1437 @@ +# CMake - Cross Platform Makefile Generator +# Copyright 2000-2024 Kitware, Inc. and Contributors +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# * Neither the name of Kitware, Inc. nor the names of Contributors +# may be used to endorse or promote products derived from this +# software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#[=======================================================================[.rst: +FindCUDAToolkit +--------------- + +.. versionadded:: 3.17 + +This script locates the NVIDIA CUDA toolkit and the associated libraries, but +does not require the ``CUDA`` language be enabled for a given project. This +module does not search for the NVIDIA CUDA Samples. + +.. versionadded:: 3.19 + QNX support. + +Search Behavior +^^^^^^^^^^^^^^^ + +The CUDA Toolkit search behavior uses the following order: + +1. If the ``CUDA`` language has been enabled we will use the directory + containing the compiler as the first search location for ``nvcc``. + +2. If the variable :variable:`CMAKE_CUDA_COMPILER _COMPILER>` or + the environment variable :envvar:`CUDACXX` is defined, it will be used + as the path to the ``nvcc`` executable. + +3. If the ``CUDAToolkit_ROOT`` cmake configuration variable (e.g., + ``-DCUDAToolkit_ROOT=/some/path``) *or* environment variable is defined, it + will be searched. If both an environment variable **and** a + configuration variable are specified, the *configuration* variable takes + precedence. + + The directory specified here must be such that the executable ``nvcc`` or + the appropriate ``version.txt`` or ``version.json`` file can be found + underneath the specified directory. + +4. If the CUDA_PATH environment variable is defined, it will be searched + for ``nvcc``. + +5. The user's path is searched for ``nvcc`` using :command:`find_program`. 
If + this is found, no subsequent search attempts are performed. Users are + responsible for ensuring that the first ``nvcc`` to show up in the path is + the desired path in the event that multiple CUDA Toolkits are installed. + +6. On Unix systems, if the symbolic link ``/usr/local/cuda`` exists, this is + used. No subsequent search attempts are performed. No default symbolic link + location exists for the Windows platform. + +7. The platform specific default install locations are searched. If exactly one + candidate is found, this is used. The default CUDA Toolkit install locations + searched are: + + +-------------+-------------------------------------------------------------+ + | Platform | Search Pattern | + +=============+=============================================================+ + | macOS | ``/Developer/NVIDIA/CUDA-X.Y`` | + +-------------+-------------------------------------------------------------+ + | Other Unix | ``/usr/local/cuda-X.Y`` | + +-------------+-------------------------------------------------------------+ + | Windows | ``C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\vX.Y`` | + +-------------+-------------------------------------------------------------+ + + Where ``X.Y`` would be a specific version of the CUDA Toolkit, such as + ``/usr/local/cuda-9.0`` or + ``C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.0`` + + .. note:: + + When multiple CUDA Toolkits are installed in the default location of a + system (e.g., both ``/usr/local/cuda-9.0`` and ``/usr/local/cuda-10.0`` + exist but the ``/usr/local/cuda`` symbolic link does **not** exist), this + package is marked as **not** found. + + There are too many factors involved in making an automatic decision in + the presence of multiple CUDA Toolkits being installed. In this + situation, users are encouraged to either (1) set ``CUDAToolkit_ROOT`` or + (2) ensure that the correct ``nvcc`` executable shows up in ``$PATH`` for + :command:`find_program` to find. + +Arguments +^^^^^^^^^ + +``[]`` + The ``[]`` argument requests a version with which the package found + should be compatible. See :ref:`find_package version format ` + for more details. + +Options +^^^^^^^ + +``REQUIRED`` + If specified, configuration will error if a suitable CUDA Toolkit is not + found. + +``QUIET`` + If specified, the search for a suitable CUDA Toolkit will not produce any + messages. + +``EXACT`` + If specified, the CUDA Toolkit is considered found only if the exact + ``VERSION`` specified is recovered. + +Imported targets +^^^^^^^^^^^^^^^^ + +An :ref:`imported target ` named ``CUDA::toolkit`` is provided. + +This module defines :prop_tgt:`IMPORTED` targets for each +of the following libraries that are part of the CUDAToolkit: + +- :ref:`CUDA Runtime Library` +- :ref:`CUDA Driver Library` +- :ref:`cuBLAS` +- :ref:`cuDLA` +- :ref:`cuFile` +- :ref:`cuFFT` +- :ref:`cuRAND` +- :ref:`cuSOLVER` +- :ref:`cuSPARSE` +- :ref:`cuPTI` +- :ref:`NPP` +- :ref:`nvBLAS` +- :ref:`nvGRAPH` +- :ref:`nvJPEG` +- :ref:`nvidia-ML` +- :ref:`nvPTX Compiler` +- :ref:`nvRTC` +- :ref:`nvJitLink` +- :ref:`nvFatBin` +- :ref:`nvToolsExt` +- :ref:`nvtx3` +- :ref:`OpenCL` +- :ref:`cuLIBOS` + +.. _`cuda_toolkit_rt_lib`: + +CUDA Runtime Library +"""""""""""""""""""" + +The CUDA Runtime library (cudart) are what most applications will typically +need to link against to make any calls such as `cudaMalloc`, and `cudaFree`. + +Targets Created: + +- ``CUDA::cudart`` +- ``CUDA::cudart_static`` + +.. 
_`cuda_toolkit_driver_lib`: + +CUDA Driver Library +"""""""""""""""""""" + +The CUDA Driver library (cuda) are used by applications that use calls +such as `cuMemAlloc`, and `cuMemFree`. + +Targets Created: + +- ``CUDA::cuda_driver`` + +.. _`cuda_toolkit_cuBLAS`: + +cuBLAS +"""""" + +The `cuBLAS `_ library. + +Targets Created: + +- ``CUDA::cublas`` +- ``CUDA::cublas_static`` +- ``CUDA::cublasLt`` starting in CUDA 10.1 +- ``CUDA::cublasLt_static`` starting in CUDA 10.1 + +.. _`cuda_toolkit_cuDLA`: + +cuDLA +"""""" + +.. versionadded:: 3.27 + +The NVIDIA Tegra Deep Learning Accelerator `cuDLA `_ library. + +Targets Created: + +- ``CUDA::cudla`` starting in CUDA 11.6 + +.. _`cuda_toolkit_cuFile`: + +cuFile +"""""" + +.. versionadded:: 3.25 + +The NVIDIA GPUDirect Storage `cuFile `_ library. + +Targets Created: + +- ``CUDA::cuFile`` starting in CUDA 11.4 +- ``CUDA::cuFile_static`` starting in CUDA 11.4 +- ``CUDA::cuFile_rdma`` starting in CUDA 11.4 +- ``CUDA::cuFile_rdma_static`` starting in CUDA 11.4 + +.. _`cuda_toolkit_cuFFT`: + +cuFFT +""""" + +The `cuFFT `_ library. + +Targets Created: + +- ``CUDA::cufft`` +- ``CUDA::cufftw`` +- ``CUDA::cufft_static`` +- ``CUDA::cufft_static_nocallback`` starting in CUDA 9.2, requires CMake 3.23+ +- ``CUDA::cufftw_static`` + +cuRAND +"""""" + +The `cuRAND `_ library. + +Targets Created: + +- ``CUDA::curand`` +- ``CUDA::curand_static`` + +.. _`cuda_toolkit_cuSOLVER`: + +cuSOLVER +"""""""" + +The `cuSOLVER `_ library. + +Targets Created: + +- ``CUDA::cusolver`` +- ``CUDA::cusolver_static`` + +.. _`cuda_toolkit_cuSPARSE`: + +cuSPARSE +"""""""" + +The `cuSPARSE `_ library. + +Targets Created: + +- ``CUDA::cusparse`` +- ``CUDA::cusparse_static`` + +.. _`cuda_toolkit_cupti`: + +cupti +""""" + +The `NVIDIA CUDA Profiling Tools Interface `_. + +Targets Created: + +- ``CUDA::cupti`` +- ``CUDA::cupti_static`` + +.. versionadded:: 3.27 + + - ``CUDA::nvperf_host`` starting in CUDA 10.2 + - ``CUDA::nvperf_host_static`` starting in CUDA 10.2 + - ``CUDA::nvperf_target`` starting in CUDA 10.2 + - ``CUDA::pcsamplingutil`` starting in CUDA 11.3 + +.. _`cuda_toolkit_NPP`: + +NPP +""" + +The `NPP `_ libraries. + +Targets Created: + +- `nppc`: + + - ``CUDA::nppc`` + - ``CUDA::nppc_static`` + +- `nppial`: Arithmetic and logical operation functions in `nppi_arithmetic_and_logical_operations.h` + + - ``CUDA::nppial`` + - ``CUDA::nppial_static`` + +- `nppicc`: Color conversion and sampling functions in `nppi_color_conversion.h` + + - ``CUDA::nppicc`` + - ``CUDA::nppicc_static`` + +- `nppicom`: JPEG compression and decompression functions in `nppi_compression_functions.h` + Removed starting in CUDA 11.0, use :ref:`nvJPEG` instead. 
+ + - ``CUDA::nppicom`` + - ``CUDA::nppicom_static`` + +- `nppidei`: Data exchange and initialization functions in `nppi_data_exchange_and_initialization.h` + + - ``CUDA::nppidei`` + - ``CUDA::nppidei_static`` + +- `nppif`: Filtering and computer vision functions in `nppi_filter_functions.h` + + - ``CUDA::nppif`` + - ``CUDA::nppif_static`` + +- `nppig`: Geometry transformation functions found in `nppi_geometry_transforms.h` + + - ``CUDA::nppig`` + - ``CUDA::nppig_static`` + +- `nppim`: Morphological operation functions found in `nppi_morphological_operations.h` + + - ``CUDA::nppim`` + - ``CUDA::nppim_static`` + +- `nppist`: Statistics and linear transform in `nppi_statistics_functions.h` and `nppi_linear_transforms.h` + + - ``CUDA::nppist`` + - ``CUDA::nppist_static`` + +- `nppisu`: Memory support functions in `nppi_support_functions.h` + + - ``CUDA::nppisu`` + - ``CUDA::nppisu_static`` + +- `nppitc`: Threshold and compare operation functions in `nppi_threshold_and_compare_operations.h` + + - ``CUDA::nppitc`` + - ``CUDA::nppitc_static`` + +- `npps`: + + - ``CUDA::npps`` + - ``CUDA::npps_static`` + +.. _`cuda_toolkit_nvBLAS`: + +nvBLAS +"""""" + +The `nvBLAS `_ libraries. +This is a shared library only. + +Targets Created: + +- ``CUDA::nvblas`` + +.. _`cuda_toolkit_nvGRAPH`: + +nvGRAPH +""""""" + +The `nvGRAPH `_ library. +Removed starting in CUDA 11.0 + +Targets Created: + +- ``CUDA::nvgraph`` +- ``CUDA::nvgraph_static`` + + +.. _`cuda_toolkit_nvJPEG`: + +nvJPEG +"""""" + +The `nvJPEG `_ library. +Introduced in CUDA 10. + +Targets Created: + +- ``CUDA::nvjpeg`` +- ``CUDA::nvjpeg_static`` + +.. _`cuda_toolkit_nvPTX`: + +nvPTX Compiler +"""""""""""""" + +.. versionadded:: 3.25 + +The `nvPTX `_ (PTX Compilation) library. +The PTX Compiler APIs are a set of APIs which can be used to compile a PTX program into GPU assembly code. +Introduced in CUDA 11.1 +This is a static library only. + +Targets Created: + +- ``CUDA::nvptxcompiler_static`` starting in CUDA 11.1 + +.. _`cuda_toolkit_nvRTC`: + +nvRTC +""""" + +The `nvRTC `_ (Runtime Compilation) library. + +Targets Created: + +- ``CUDA::nvrtc`` + +.. versionadded:: 3.26 + + - ``CUDA::nvrtc_builtins`` + - ``CUDA::nvrtc_static`` starting in CUDA 11.5 + - ``CUDA::nvrtc_builtins_static`` starting in CUDA 11.5 + +.. _`cuda_toolkit_nvjitlink`: + +nvJitLink +""""""""" + +The `nvJItLink `_ (Runtime LTO Linking) library. + +Targets Created: + +- ``CUDA::nvJitLink`` starting in CUDA 12.0 +- ``CUDA::nvJitLink_static`` starting in CUDA 12.0 + +.. _`cuda_toolkit_nvfatbin`: + +nvFatBin +""""""""" + +.. versionadded:: 3.30 + +The `nvFatBin `_ (Runtime fatbin creation) library. + +Targets Created: + +- ``CUDA::nvfatbin`` starting in CUDA 12.4 +- ``CUDA::nvfatbin_static`` starting in CUDA 12.4 + +.. _`cuda_toolkit_nvml`: + +nvidia-ML +""""""""" + +The `NVIDIA Management Library `_. + +Targets Created: + +- ``CUDA::nvml`` +- ``CUDA::nvml_static`` starting in CUDA 12.4 + +.. versionadded:: 3.31 + Added ``CUDA::nvml_static``. + +.. _`cuda_toolkit_nvToolsExt`: + +nvToolsExt +"""""""""" + +.. deprecated:: 3.25 With CUDA 10.0+, use :ref:`nvtx3 `. + +The `NVIDIA Tools Extension `_. +This is a shared library only. + +Targets Created: + +- ``CUDA::nvToolsExt`` + +.. _`cuda_toolkit_nvtx3`: + +nvtx3 +""""" + +.. versionadded:: 3.25 + +The header-only `NVIDIA Tools Extension Library `_. +Introduced in CUDA 10.0. + +Targets created: + +- ``CUDA::nvtx3`` + +.. _`cuda_toolkit_opencl`: + +OpenCL +"""""" + +The `NVIDIA OpenCL Library `_. +This is a shared library only. 
+ +Targets Created: + +- ``CUDA::OpenCL`` + +.. _`cuda_toolkit_cuLIBOS`: + +cuLIBOS +""""""" + +The cuLIBOS library is a backend thread abstraction layer library which is +static only. The ``CUDA::cublas_static``, ``CUDA::cusparse_static``, +``CUDA::cufft_static``, ``CUDA::curand_static``, and (when implemented) NPP +libraries all automatically have this dependency linked. + +Target Created: + +- ``CUDA::culibos`` + +**Note**: direct usage of this target by consumers should not be necessary. + +.. _`cuda_toolkit_cuRAND`: + + + +Result variables +^^^^^^^^^^^^^^^^ + +``CUDAToolkit_FOUND`` + A boolean specifying whether or not the CUDA Toolkit was found. + +``CUDAToolkit_VERSION`` + The exact version of the CUDA Toolkit found (as reported by + ``nvcc --version``, ``version.txt``, or ``version.json``). + +``CUDAToolkit_VERSION_MAJOR`` + The major version of the CUDA Toolkit. + +``CUDAToolkit_VERSION_MINOR`` + The minor version of the CUDA Toolkit. + +``CUDAToolkit_VERSION_PATCH`` + The patch version of the CUDA Toolkit. + +``CUDAToolkit_BIN_DIR`` + The path to the CUDA Toolkit library directory that contains the CUDA + executable ``nvcc``. + +``CUDAToolkit_INCLUDE_DIRS`` + List of paths to all the CUDA Toolkit folders containing header files + required to compile a project linking against CUDA. + +``CUDAToolkit_LIBRARY_DIR`` + The path to the CUDA Toolkit library directory that contains the CUDA + Runtime library ``cudart``. + +``CUDAToolkit_LIBRARY_ROOT`` + .. versionadded:: 3.18 + + The path to the CUDA Toolkit directory containing the nvvm directory and + either version.txt or version.json. + +``CUDAToolkit_TARGET_DIR`` + The path to the CUDA Toolkit directory including the target architecture + when cross-compiling. When not cross-compiling this will be equivalent to + the parent directory of ``CUDAToolkit_BIN_DIR``. + +``CUDAToolkit_NVCC_EXECUTABLE`` + The path to the NVIDIA CUDA compiler ``nvcc``. Note that this path may + **not** be the same as + :variable:`CMAKE_CUDA_COMPILER _COMPILER>`. ``nvcc`` must be + found to determine the CUDA Toolkit version as well as determining other + features of the Toolkit. This variable is set for the convenience of + modules that depend on this one. + + +#]=======================================================================] + +# NOTE: much of this was simply extracted from FindCUDA.cmake. + +# James Bigler, NVIDIA Corp (nvidia.com - jbigler) +# Abe Stephens, SCI Institute -- http://www.sci.utah.edu/~abe/FindCuda.html +# +# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved. +# +# Copyright (c) 2007-2009 +# Scientific Computing and Imaging Institute, University of Utah +# +# This code is licensed under the MIT License. See the FindCUDA.cmake script +# for the text of the license. + +# The MIT License +# +# License for the specific language governing rights and limitations under +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. 
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. +# +############################################################################### + +function(_CUDAToolkit_build_include_dirs result_variable default_paths_variable) + set(content "${${default_paths_variable}}") + set(${result_variable} "${content}" PARENT_SCOPE) +endfunction() + +function(_CUDAToolkit_build_library_dirs result_variable default_paths_variable) + set(content "${${default_paths_variable}}") + set(${result_variable} "${content}" PARENT_SCOPE) +endfunction() + +# The toolkit is located during compiler detection for CUDA and stored in CMakeCUDACompiler.cmake as +# - CMAKE_CUDA_COMPILER_TOOLKIT_ROOT +# - CMAKE_CUDA_COMPILER_LIBRARY_ROOT +# - CMAKE_CUDA_COMPILER_LIBRARY_DIRECTORIES_FROM_IMPLICIT_LIBRARIES +# - CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES +# We compute the rest based on those here to avoid re-searching and to avoid finding a possibly +# different installation. +if(CMAKE_CUDA_COMPILER_TOOLKIT_ROOT) + set(CUDAToolkit_ROOT_DIR "${CMAKE_CUDA_COMPILER_TOOLKIT_ROOT}") + set(CUDAToolkit_LIBRARY_ROOT "${CMAKE_CUDA_COMPILER_LIBRARY_ROOT}") + _CUDAToolkit_build_library_dirs(CUDAToolkit_IMPLICIT_LIBRARY_DIRECTORIES CMAKE_CUDA_HOST_IMPLICIT_LINK_DIRECTORIES) + _CUDAToolkit_build_include_dirs(CUDAToolkit_INCLUDE_DIRECTORIES CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES) + set(CUDAToolkit_BIN_DIR "${CUDAToolkit_ROOT_DIR}/bin") + set(CUDAToolkit_NVCC_EXECUTABLE "${CUDAToolkit_BIN_DIR}/nvcc${CMAKE_EXECUTABLE_SUFFIX}") + set(CUDAToolkit_VERSION "${CMAKE_CUDA_COMPILER_TOOLKIT_VERSION}") + + if(CUDAToolkit_VERSION MATCHES [=[([0-9]+)\.([0-9]+)\.([0-9]+)]=]) + set(CUDAToolkit_VERSION_MAJOR "${CMAKE_MATCH_1}") + set(CUDAToolkit_VERSION_MINOR "${CMAKE_MATCH_2}") + set(CUDAToolkit_VERSION_PATCH "${CMAKE_MATCH_3}") + endif() +else() + function(_CUDAToolkit_find_root_dir ) + cmake_parse_arguments(arg "COMPILER_PATHS" "" "SEARCH_PATHS;FIND_FLAGS" ${ARGN}) + + if(NOT CUDAToolkit_BIN_DIR) + if(arg_COMPILER_PATHS) + # need to find parent dir, since this could clang and not nvcc + if(EXISTS "${CMAKE_CUDA_COMPILER}") + get_filename_component(possible_nvcc_path "${CMAKE_CUDA_COMPILER}" PROGRAM PROGRAM_ARGS CUDAToolkit_compiler_args) + get_filename_component(possible_nvcc_path "${possible_nvcc_path}" DIRECTORY) + elseif(EXISTS "$ENV{CUDACXX}") + get_filename_component(possible_nvcc_path "$ENV{CUDACXX}" PROGRAM PROGRAM_ARGS CUDAToolkit_compiler_args) + get_filename_component(possible_nvcc_path "${possible_nvcc_path}" DIRECTORY) + endif() + if(possible_nvcc_path) + find_program(CUDAToolkit_NVCC_EXECUTABLE + NAMES nvcc nvcc.exe + NO_DEFAULT_PATH + PATHS ${possible_nvcc_path} + ) + endif() + endif() + + if(NOT CUDAToolkit_SENTINEL_FILE) + find_program(CUDAToolkit_NVCC_EXECUTABLE + NAMES nvcc nvcc.exe + PATHS ${arg_SEARCH_PATHS} + ${arg_FIND_FLAGS} + ) + endif() + + if(NOT CUDAToolkit_NVCC_EXECUTABLE) + find_file(CUDAToolkit_SENTINEL_FILE + NAMES version.txt version.json + PATHS ${arg_SEARCH_PATHS} + NO_DEFAULT_PATH + ) + endif() + + if(EXISTS "${CUDAToolkit_NVCC_EXECUTABLE}") + # If NVCC exists then invoke it to find the toolkit 
location. + # This allows us to support wrapper scripts (e.g. ccache or colornvcc), CUDA Toolkit, + # NVIDIA HPC SDK, and distro's splayed layouts + execute_process(COMMAND ${CUDAToolkit_NVCC_EXECUTABLE} "-v" "__cmake_determine_cuda" + OUTPUT_VARIABLE _CUDA_NVCC_OUT ERROR_VARIABLE _CUDA_NVCC_OUT) + message(CONFIGURE_LOG + "Executed nvcc to extract CUDAToolkit information:\n${_CUDA_NVCC_OUT}\n\n") + if(_CUDA_NVCC_OUT MATCHES "\\#\\$ TOP=([^\r\n]*)") + get_filename_component(CUDAToolkit_BIN_DIR "${CMAKE_MATCH_1}/bin" ABSOLUTE) + message(CONFIGURE_LOG + "Parsed CUDAToolkit nvcc location:\n${CUDAToolkit_BIN_DIR}\n\n") + else() + get_filename_component(CUDAToolkit_BIN_DIR "${CUDAToolkit_NVCC_EXECUTABLE}" DIRECTORY) + endif() + if(_CUDA_NVCC_OUT MATCHES "\\#\\$ INCLUDES=([^\r\n]*)") + separate_arguments(_nvcc_output NATIVE_COMMAND "${CMAKE_MATCH_1}") + foreach(line IN LISTS _nvcc_output) + string(REGEX REPLACE "^-I" "" line "${line}") + get_filename_component(line "${line}" ABSOLUTE) + list(APPEND _cmake_CUDAToolkit_include_directories "${line}") + endforeach() + message(CONFIGURE_LOG + "Parsed CUDAToolkit nvcc implicit include information:\n${_cmake_CUDAToolkit_include_directories}\n\n") + + set(_cmake_CUDAToolkit_include_directories "${_cmake_CUDAToolkit_include_directories}" CACHE INTERNAL "CUDAToolkit internal list of include directories") + endif() + if(_CUDA_NVCC_OUT MATCHES "\\#\\$ LIBRARIES=([^\r\n]*)") + include(${CMAKE_ROOT}/Modules/CMakeParseImplicitLinkInfo.cmake) + set(_nvcc_link_line "cuda-fake-ld ${CMAKE_MATCH_1}") + CMAKE_PARSE_IMPLICIT_LINK_INFO("${_nvcc_link_line}" + _cmake_CUDAToolkit_implicit_link_libs + _cmake_CUDAToolkit_implicit_link_directories + _cmake_CUDAToolkit_implicit_frameworks + _nvcc_log + "${CMAKE_CUDA_IMPLICIT_OBJECT_REGEX}" + LANGUAGE CUDA) + message(CONFIGURE_LOG + "Parsed CUDAToolkit nvcc implicit link information:\n${_nvcc_log}\n${_cmake_CUDAToolkit_implicit_link_directories}\n\n") + unset(_nvcc_link_line) + unset(_cmake_CUDAToolkit_implicit_link_libs) + unset(_cmake_CUDAToolkit_implicit_frameworks) + + set(_cmake_CUDAToolkit_implicit_link_directories "${_cmake_CUDAToolkit_implicit_link_directories}" CACHE INTERNAL "CUDAToolkit internal list of implicit link directories") + endif() + unset(_CUDA_NVCC_OUT) + + set(CUDAToolkit_BIN_DIR "${CUDAToolkit_BIN_DIR}" CACHE PATH "" FORCE) + mark_as_advanced(CUDAToolkit_BIN_DIR) + endif() + + if(CUDAToolkit_SENTINEL_FILE) + get_filename_component(CUDAToolkit_BIN_DIR ${CUDAToolkit_SENTINEL_FILE} DIRECTORY ABSOLUTE) + set(CUDAToolkit_BIN_DIR "${CUDAToolkit_BIN_DIR}/bin") + + set(CUDAToolkit_BIN_DIR "${CUDAToolkit_BIN_DIR}" CACHE PATH "" FORCE) + mark_as_advanced(CUDAToolkit_BIN_DIR) + endif() + endif() + + if(DEFINED _cmake_CUDAToolkit_include_directories) + _CUDAToolkit_build_include_dirs(_cmake_CUDAToolkit_contents _cmake_CUDAToolkit_include_directories) + set(CUDAToolkit_INCLUDE_DIRECTORIES "${_cmake_CUDAToolkit_contents}" PARENT_SCOPE) + endif() + if(DEFINED _cmake_CUDAToolkit_implicit_link_directories) + _CUDAToolkit_build_library_dirs(_cmake_CUDAToolkit_contents _cmake_CUDAToolkit_implicit_link_directories) + set(CUDAToolkit_IMPLICIT_LIBRARY_DIRECTORIES "${_cmake_CUDAToolkit_contents}" PARENT_SCOPE) + endif() + + if(CUDAToolkit_BIN_DIR) + get_filename_component(CUDAToolkit_ROOT_DIR ${CUDAToolkit_BIN_DIR} DIRECTORY ABSOLUTE) + set(CUDAToolkit_ROOT_DIR "${CUDAToolkit_ROOT_DIR}" PARENT_SCOPE) + endif() + + endfunction() + + function(_CUDAToolkit_find_version_file result_variable) + # We first check for a non-scattered 
installation to prefer it over a scattered installation. + set(version_files version.txt version.json) + foreach(vf IN LISTS version_files) + if(CUDAToolkit_ROOT AND EXISTS "${CUDAToolkit_ROOT}/${vf}") + set(${result_variable} "${CUDAToolkit_ROOT}/${vf}" PARENT_SCOPE) + break() + elseif(CUDAToolkit_ROOT_DIR AND EXISTS "${CUDAToolkit_ROOT_DIR}/${vf}") + set(${result_variable} "${CUDAToolkit_ROOT_DIR}/${vf}" PARENT_SCOPE) + break() + elseif(CMAKE_SYSROOT_LINK AND EXISTS "${CMAKE_SYSROOT_LINK}/usr/lib/cuda/${vf}") + set(${result_variable} "${CMAKE_SYSROOT_LINK}/usr/lib/cuda/${vf}" PARENT_SCOPE) + break() + elseif(EXISTS "${CMAKE_SYSROOT}/usr/lib/cuda/${vf}") + set(${result_variable} "${CMAKE_SYSROOT}/usr/lib/cuda/${vf}" PARENT_SCOPE) + break() + endif() + endforeach() + endfunction() + + function(_CUDAToolkit_parse_version_file version_file) + if(version_file) + file(READ "${version_file}" file_conents) + cmake_path(GET version_file EXTENSION LAST_ONLY version_ext) + if(version_ext STREQUAL ".json") + string(JSON cuda_version_info GET "${file_conents}" "cuda" "version") + set(cuda_version_match_regex [=[([0-9]+)\.([0-9]+)\.([0-9]+)]=]) + elseif(version_ext STREQUAL ".txt") + set(cuda_version_info "${file_conents}") + set(cuda_version_match_regex [=[CUDA Version ([0-9]+)\.([0-9]+)\.([0-9]+)]=]) + endif() + + if(cuda_version_info MATCHES "${cuda_version_match_regex}") + set(CUDAToolkit_VERSION_MAJOR "${CMAKE_MATCH_1}" PARENT_SCOPE) + set(CUDAToolkit_VERSION_MINOR "${CMAKE_MATCH_2}" PARENT_SCOPE) + set(CUDAToolkit_VERSION_PATCH "${CMAKE_MATCH_3}" PARENT_SCOPE) + set(CUDAToolkit_VERSION "${CMAKE_MATCH_1}.${CMAKE_MATCH_2}.${CMAKE_MATCH_3}" PARENT_SCOPE) + endif() + endif() + endfunction() + + # For NVCC we can easily deduce the SDK binary directory from the compiler path. + if(CMAKE_CUDA_COMPILER_LOADED AND NOT CUDAToolkit_BIN_DIR AND CMAKE_CUDA_COMPILER_ID STREQUAL "NVIDIA") + get_filename_component(CUDAToolkit_BIN_DIR "${CMAKE_CUDA_COMPILER}" DIRECTORY) + set(CUDAToolkit_BIN_DIR "${CUDAToolkit_BIN_DIR}" CACHE PATH "") + # Try language provided path first. + _CUDAToolkit_find_root_dir(SEARCH_PATHS "${CUDAToolkit_BIN_DIR}" FIND_FLAGS NO_DEFAULT_PATH) + mark_as_advanced(CUDAToolkit_BIN_DIR) + endif() + + # Try user provided path + _CUDAToolkit_find_root_dir(COMPILER_PATHS) + if(NOT CUDAToolkit_ROOT_DIR AND CUDAToolkit_ROOT) + _CUDAToolkit_find_root_dir(SEARCH_PATHS "${CUDAToolkit_ROOT}" FIND_FLAGS PATH_SUFFIXES bin NO_DEFAULT_PATH) + endif() + if(NOT CUDAToolkit_ROOT_DIR) + _CUDAToolkit_find_root_dir(FIND_FLAGS PATHS ENV CUDA_PATH PATH_SUFFIXES bin) + endif() + + # If the user specified CUDAToolkit_ROOT but the toolkit could not be found, this is an error. + if(NOT CUDAToolkit_ROOT_DIR AND (DEFINED CUDAToolkit_ROOT OR DEFINED ENV{CUDAToolkit_ROOT})) + # Declare error messages now, print later depending on find_package args. 
+ set(fail_base "Could not find nvcc executable in path specified by") + set(cuda_root_fail "${fail_base} CUDAToolkit_ROOT=${CUDAToolkit_ROOT}") + set(env_cuda_root_fail "${fail_base} environment variable CUDAToolkit_ROOT=$ENV{CUDAToolkit_ROOT}") + + if(CUDAToolkit_FIND_REQUIRED) + if(DEFINED CUDAToolkit_ROOT) + message(FATAL_ERROR ${cuda_root_fail}) + elseif(DEFINED ENV{CUDAToolkit_ROOT}) + message(FATAL_ERROR ${env_cuda_root_fail}) + endif() + else() + if(NOT CUDAToolkit_FIND_QUIETLY) + if(DEFINED CUDAToolkit_ROOT) + message(STATUS ${cuda_root_fail}) + elseif(DEFINED ENV{CUDAToolkit_ROOT}) + message(STATUS ${env_cuda_root_fail}) + endif() + endif() + set(CUDAToolkit_FOUND FALSE) + unset(fail_base) + unset(cuda_root_fail) + unset(env_cuda_root_fail) + return() + endif() + endif() + + # CUDAToolkit_ROOT cmake / env variable not specified, try platform defaults. + # + # - Linux: /usr/local/cuda-X.Y + # - macOS: /Developer/NVIDIA/CUDA-X.Y + # - Windows: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\vX.Y + # + # We will also search the default symlink location /usr/local/cuda first since + # if CUDAToolkit_ROOT is not specified, it is assumed that the symlinked + # directory is the desired location. + if(NOT CUDAToolkit_ROOT_DIR) + if(UNIX) + if(NOT APPLE) + set(platform_base "/usr/local/cuda-") + else() + set(platform_base "/Developer/NVIDIA/CUDA-") + endif() + else() + set(platform_base "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v") + endif() + + # Build out a descending list of possible cuda installations, e.g. + file(GLOB possible_paths "${platform_base}*") + # Iterate the glob results and create a descending list. + set(versions) + foreach(p ${possible_paths}) + # Extract version number from end of string + string(REGEX MATCH "[0-9][0-9]?\\.[0-9]$" p_version ${p}) + if(IS_DIRECTORY ${p} AND p_version) + list(APPEND versions ${p_version}) + endif() + endforeach() + + # Sort numerically in descending order, so we try the newest versions first. + list(SORT versions COMPARE NATURAL ORDER DESCENDING) + + # With a descending list of versions, populate possible paths to search. + set(search_paths) + foreach(v ${versions}) + list(APPEND search_paths "${platform_base}${v}") + endforeach() + + # Force the global default /usr/local/cuda to the front on Unix. + if(UNIX) + list(INSERT search_paths 0 "/usr/local/cuda") + endif() + + # Now search for the toolkit again using the platform default search paths. + _CUDAToolkit_find_root_dir(SEARCH_PATHS "${search_paths}" FIND_FLAGS PATH_SUFFIXES bin) + + # We are done with these variables now, cleanup for caller. + unset(platform_base) + unset(possible_paths) + unset(versions) + unset(search_paths) + + if(NOT CUDAToolkit_ROOT_DIR) + if(CUDAToolkit_FIND_REQUIRED) + message(FATAL_ERROR "Could not find nvcc, please set CUDAToolkit_ROOT.") + elseif(NOT CUDAToolkit_FIND_QUIETLY) + message(STATUS "Could not find nvcc, please set CUDAToolkit_ROOT.") + endif() + + set(CUDAToolkit_FOUND FALSE) + return() + endif() + endif() + + _CUDAToolkit_find_version_file( _CUDAToolkit_version_file ) + if(_CUDAToolkit_version_file) + # CUDAToolkit_LIBRARY_ROOT contains the device library and version file. 
+ get_filename_component(CUDAToolkit_LIBRARY_ROOT "${_CUDAToolkit_version_file}" DIRECTORY ABSOLUTE) + endif() + unset(_CUDAToolkit_version_file) + + if(CUDAToolkit_NVCC_EXECUTABLE AND + CMAKE_CUDA_COMPILER_VERSION AND + CUDAToolkit_NVCC_EXECUTABLE STREQUAL CMAKE_CUDA_COMPILER) + # Need to set these based off the already computed CMAKE_CUDA_COMPILER_VERSION value + # This if statement will always match, but is used to provide variables for MATCH 1,2,3... + if(CMAKE_CUDA_COMPILER_VERSION MATCHES [=[([0-9]+)\.([0-9]+)\.([0-9]+)]=]) + set(CUDAToolkit_VERSION_MAJOR "${CMAKE_MATCH_1}") + set(CUDAToolkit_VERSION_MINOR "${CMAKE_MATCH_2}") + set(CUDAToolkit_VERSION_PATCH "${CMAKE_MATCH_3}") + set(CUDAToolkit_VERSION "${CMAKE_CUDA_COMPILER_VERSION}") + endif() + elseif(CUDAToolkit_NVCC_EXECUTABLE) + # Compute the version by invoking nvcc + execute_process(COMMAND ${CUDAToolkit_NVCC_EXECUTABLE} "--version" OUTPUT_VARIABLE NVCC_OUT) + if(NVCC_OUT MATCHES [=[ V([0-9]+)\.([0-9]+)\.([0-9]+)]=]) + set(CUDAToolkit_VERSION_MAJOR "${CMAKE_MATCH_1}") + set(CUDAToolkit_VERSION_MINOR "${CMAKE_MATCH_2}") + set(CUDAToolkit_VERSION_PATCH "${CMAKE_MATCH_3}") + set(CUDAToolkit_VERSION "${CMAKE_MATCH_1}.${CMAKE_MATCH_2}.${CMAKE_MATCH_3}") + endif() + unset(NVCC_OUT) + else() + _CUDAToolkit_find_version_file(version_file) + _CUDAToolkit_parse_version_file("${version_file}") + endif() +endif() + +# Find target directory when crosscompiling. +if(CMAKE_CROSSCOMPILING) + if(CMAKE_SYSTEM_PROCESSOR STREQUAL "armv7-a") + # Support for NVPACK + set(CUDAToolkit_TARGET_NAMES "armv7-linux-androideabi") + elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "arm") + set(CUDAToolkit_TARGET_NAMES "armv7-linux-gnueabihf") + elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") + if(ANDROID_ARCH_NAME STREQUAL "arm64") + set(CUDAToolkit_TARGET_NAMES "aarch64-linux-androideabi") + elseif (CMAKE_SYSTEM_NAME STREQUAL "QNX") + set(CUDAToolkit_TARGET_NAMES "aarch64-qnx") + else() + set(CUDAToolkit_TARGET_NAMES "aarch64-linux" "sbsa-linux") + endif() + elseif(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64") + set(CUDAToolkit_TARGET_NAMES "x86_64-linux") + endif() + + foreach(CUDAToolkit_TARGET_NAME IN LISTS CUDAToolkit_TARGET_NAMES) + if(EXISTS "${CUDAToolkit_ROOT_DIR}/targets/${CUDAToolkit_TARGET_NAME}") + set(CUDAToolkit_TARGET_DIR "${CUDAToolkit_ROOT_DIR}/targets/${CUDAToolkit_TARGET_NAME}") + # add known CUDA target root path to the set of directories we search for programs, libraries and headers + list(PREPEND CMAKE_FIND_ROOT_PATH "${CUDAToolkit_TARGET_DIR}") + + # Mark that we need to pop the root search path changes after we have + # found all cuda libraries so that searches for our cross-compilation + # libraries work when another cuda sdk is in CMAKE_PREFIX_PATH or + # PATh + set(_CUDAToolkit_Pop_ROOT_PATH True) + break() + endif() + endforeach() +endif() + +# Determine windows search path suffix for libraries +if(CMAKE_HOST_SYSTEM_NAME STREQUAL "Windows") + if(CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL "AMD64") + set(_CUDAToolkit_win_search_dirs lib/x64) + set(_CUDAToolkit_win_stub_search_dirs lib/x64/stubs) + endif() +endif() + +# If not already set we can simply use the toolkit root or it's a scattered installation. +if(NOT CUDAToolkit_TARGET_DIR) + # Not cross compiling + set(CUDAToolkit_TARGET_DIR "${CUDAToolkit_ROOT_DIR}") + # Now that we have the real ROOT_DIR, find components inside it. + list(APPEND CMAKE_PREFIX_PATH ${CUDAToolkit_ROOT_DIR}) + + # Mark that we need to pop the prefix path changes after we have + # found the cudart library. 
+ set(_CUDAToolkit_Pop_Prefix True) +endif() + + +# We don't need to verify the cuda_runtime header when we are using `nvcc` include paths +# as the compiler being enabled means the header was found +if(NOT CUDAToolkit_INCLUDE_DIRECTORIES) + # Otherwise use CUDAToolkit_TARGET_DIR to guess where the `cuda_runtime.h` is located + # On a scattered installation /usr, on a non-scattered something like /usr/local/cuda or /usr/local/cuda-10.2/targets/aarch64-linux. + if(EXISTS "${CUDAToolkit_TARGET_DIR}/include/cuda_runtime.h") + set(CUDAToolkit_INCLUDE_DIRECTORIES "${CUDAToolkit_TARGET_DIR}/include") + else() + message(STATUS "Unable to find cuda_runtime.h in \"${CUDAToolkit_TARGET_DIR}/include\" for CUDAToolkit_INCLUDE_DIRECTORIES.") + endif() +endif() + +# The NVHPC layout moves math library headers and libraries to a sibling directory and it could be nested under +# the version of the CUDA toolchain +# Create a separate variable so this directory can be selectively added to math targets. +find_path(CUDAToolkit_CUBLAS_INCLUDE_DIR cublas_v2.h PATHS + ${CUDAToolkit_INCLUDE_DIRECTORIES} + NO_DEFAULT_PATH) + +if(NOT CUDAToolkit_CUBLAS_INCLUDE_DIR) + file(REAL_PATH "${CUDAToolkit_TARGET_DIR}" CUDAToolkit_MATH_INCLUDE_DIR) + cmake_path(APPEND CUDAToolkit_MATH_INCLUDE_DIR "../../math_libs/") + if(EXISTS "${CUDAToolkit_MATH_INCLUDE_DIR}/${CUDAToolkit_VERSION_MAJOR}.${CUDAToolkit_VERSION_MINOR}/") + cmake_path(APPEND CUDAToolkit_MATH_INCLUDE_DIR "${CUDAToolkit_VERSION_MAJOR}.${CUDAToolkit_VERSION_MINOR}/") + endif() + cmake_path(APPEND CUDAToolkit_MATH_INCLUDE_DIR "include") + cmake_path(NORMAL_PATH CUDAToolkit_MATH_INCLUDE_DIR) + + find_path(CUDAToolkit_CUBLAS_INCLUDE_DIR cublas_v2.h PATHS + ${CUDAToolkit_INCLUDE_DIRECTORIES} + ) + if(CUDAToolkit_CUBLAS_INCLUDE_DIR) + list(APPEND CUDAToolkit_INCLUDE_DIRECTORIES "${CUDAToolkit_CUBLAS_INCLUDE_DIR}") + endif() +endif() +unset(CUDAToolkit_CUBLAS_INCLUDE_DIR CACHE) +unset(CUDAToolkit_CUBLAS_INCLUDE_DIR) + +# Find the CUDA Runtime Library libcudart +find_library(CUDA_CUDART + NAMES cudart + PATHS ${CUDAToolkit_IMPLICIT_LIBRARY_DIRECTORIES} + PATH_SUFFIXES lib64 ${_CUDAToolkit_win_search_dirs} +) +find_library(CUDA_CUDART + NAMES cudart + PATHS ${CUDAToolkit_IMPLICIT_LIBRARY_DIRECTORIES} + PATH_SUFFIXES lib64/stubs ${_CUDAToolkit_win_stub_search_dirs} lib/stubs stubs +) + +if(NOT CUDA_CUDART AND NOT CUDAToolkit_FIND_QUIETLY) + message(STATUS "Unable to find cudart library.") +endif() + +if(_CUDAToolkit_Pop_Prefix) + list(REMOVE_AT CMAKE_PREFIX_PATH -1) + unset(_CUDAToolkit_Pop_Prefix) +endif() + +#----------------------------------------------------------------------------- +# Perform version comparison and validate all required variables are set. 
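Before the module hands its results to find_package_handle_standard_args below, it is worth seeing the consumer-side contract that this validation serves: a project requests a minimum toolkit version and gets back CUDAToolkit_FOUND plus the result variables documented above. A minimal consumer sketch, assuming a hypothetical target and source (my_app, main.cpp):

# Hedged consumer-side sketch; my_app and main.cpp are assumed names.
cmake_minimum_required(VERSION 3.17)  # FindCUDAToolkit was added in 3.17
project(demo LANGUAGES CXX)

# Request at least CUDA 11.4; configuration fails if it cannot be found.
find_package(CUDAToolkit 11.4 REQUIRED)
message(STATUS "CUDA Toolkit ${CUDAToolkit_VERSION} at ${CUDAToolkit_BIN_DIR}")

add_executable(my_app main.cpp)
target_link_libraries(my_app PRIVATE CUDA::cudart)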
+include(${CMAKE_ROOT}/Modules/FindPackageHandleStandardArgs.cmake) +find_package_handle_standard_args(CUDAToolkit + REQUIRED_VARS + CUDAToolkit_INCLUDE_DIRECTORIES + CUDA_CUDART + CUDAToolkit_BIN_DIR + VERSION_VAR + CUDAToolkit_VERSION +) + +unset(CUDAToolkit_ROOT_DIR) +mark_as_advanced(CUDA_CUDART + CUDAToolkit_NVCC_EXECUTABLE + CUDAToolkit_SENTINEL_FILE + ) + +#----------------------------------------------------------------------------- +# Construct result variables +if(CUDAToolkit_FOUND) + set(CUDAToolkit_INCLUDE_DIRS "${CUDAToolkit_INCLUDE_DIRECTORIES}") + get_filename_component(CUDAToolkit_LIBRARY_DIR ${CUDA_CUDART} DIRECTORY ABSOLUTE) + + # Build search paths without any symlinks + file(REAL_PATH "${CUDAToolkit_LIBRARY_DIR}" _cmake_search_dir) + set(CUDAToolkit_LIBRARY_SEARCH_DIRS "${_cmake_search_dir}") + + # Detect we are in a splayed nvhpc toolkit layout and add extra + # search paths without symlinks + if(CUDAToolkit_LIBRARY_DIR MATCHES ".*/cuda/${CUDAToolkit_VERSION_MAJOR}.${CUDAToolkit_VERSION_MINOR}/lib64$") + # Search location for math_libs/ + block(SCOPE_FOR POLICIES) + cmake_policy(SET CMP0152 NEW) + file(REAL_PATH "${CUDAToolkit_LIBRARY_DIR}/../../../../../" _cmake_search_dir) + list(APPEND CUDAToolkit_LIBRARY_SEARCH_DIRS "${_cmake_search_dir}") + + # Search location for extras like cupti + file(REAL_PATH "${CUDAToolkit_LIBRARY_DIR}/../../../" _cmake_search_dir) + list(APPEND CUDAToolkit_LIBRARY_SEARCH_DIRS "${_cmake_search_dir}") + endblock() + endif() + + if(DEFINED CUDAToolkit_IMPLICIT_LIBRARY_DIRECTORIES) + list(APPEND CUDAToolkit_LIBRARY_SEARCH_DIRS "${CUDAToolkit_IMPLICIT_LIBRARY_DIRECTORIES}") + endif() + + # If no `CUDAToolkit_LIBRARY_ROOT` exists set it based on CUDAToolkit_LIBRARY_DIR + if(NOT DEFINED CUDAToolkit_LIBRARY_ROOT) + foreach(CUDAToolkit_search_loc IN LISTS CUDAToolkit_LIBRARY_DIR CUDAToolkit_BIN_DIR) + get_filename_component(CUDAToolkit_possible_lib_root "${CUDAToolkit_search_loc}" DIRECTORY ABSOLUTE) + if(EXISTS "${CUDAToolkit_possible_lib_root}/nvvm/") + set(CUDAToolkit_LIBRARY_ROOT "${CUDAToolkit_possible_lib_root}") + break() + endif() + endforeach() + unset(CUDAToolkit_search_loc) + unset(CUDAToolkit_possible_lib_root) + endif() +else() + # clear cache results when we fail + unset(_cmake_CUDAToolkit_implicit_link_directories CACHE) + unset(_cmake_CUDAToolkit_include_directories CACHE) + unset(CUDA_CUDART CACHE) + unset(CUDAToolkit_BIN_DIR CACHE) + unset(CUDAToolkit_NVCC_EXECUTABLE CACHE) + unset(CUDAToolkit_SENTINEL_FILE CACHE) +endif() +unset(CUDAToolkit_IMPLICIT_LIBRARY_DIRECTORIES) +unset(CUDAToolkit_INCLUDE_DIRECTORIES) + +#----------------------------------------------------------------------------- +# Construct import targets +if(CUDAToolkit_FOUND) + + function(_CUDAToolkit_find_and_add_import_lib lib_name) + cmake_parse_arguments(arg "" "" "ALT;DEPS;EXTRA_PATH_SUFFIXES;EXTRA_INCLUDE_DIRS;ONLY_SEARCH_FOR" ${ARGN}) + + if(arg_ONLY_SEARCH_FOR) + set(search_names ${arg_ONLY_SEARCH_FOR}) + else() + set(search_names ${lib_name} ${arg_ALT}) + endif() + + find_library(CUDA_${lib_name}_LIBRARY + NAMES ${search_names} + HINTS ${CUDAToolkit_LIBRARY_SEARCH_DIRS} + ENV CUDA_PATH + PATH_SUFFIXES nvidia/current lib64 ${_CUDAToolkit_win_search_dirs} lib + # Support NVHPC splayed math library layout + math_libs/${CUDAToolkit_VERSION_MAJOR}.${CUDAToolkit_VERSION_MINOR}/lib64 + math_libs/lib64 + ${arg_EXTRA_PATH_SUFFIXES} + ) + # Don't try any stub directories until we have exhausted all other + # search locations. 
+ set(CUDA_IMPORT_PROPERTY IMPORTED_LOCATION) + set(CUDA_IMPORT_TYPE UNKNOWN) + if(NOT CUDA_${lib_name}_LIBRARY) + find_library(CUDA_${lib_name}_LIBRARY + NAMES ${search_names} + HINTS ${CUDAToolkit_LIBRARY_SEARCH_DIRS} + ENV CUDA_PATH + PATH_SUFFIXES lib64/stubs ${_CUDAToolkit_win_stub_search_dirs} lib/stubs stubs + ) + endif() + if(CUDA_${lib_name}_LIBRARY MATCHES "/stubs/" AND NOT CUDA_${lib_name}_LIBRARY MATCHES "\\.a$" AND NOT WIN32) + # Use a SHARED library with IMPORTED_IMPLIB, but not IMPORTED_LOCATION, + # to indicate that the stub is for linkers but not dynamic loaders. + # It will not contribute any RPATH entry. When encountered as + # a private transitive dependency of another shared library, + # it will be passed explicitly to linkers so they can find it + # even when the runtime library file does not exist on disk. + set(CUDA_IMPORT_PROPERTY IMPORTED_IMPLIB) + set(CUDA_IMPORT_TYPE SHARED) + endif() + + mark_as_advanced(CUDA_${lib_name}_LIBRARY) + + if (NOT TARGET CUDA::${lib_name} AND CUDA_${lib_name}_LIBRARY) + add_library(CUDA::${lib_name} ${CUDA_IMPORT_TYPE} IMPORTED) + target_include_directories(CUDA::${lib_name} SYSTEM INTERFACE "${CUDAToolkit_INCLUDE_DIRS}") + if(DEFINED CUDAToolkit_MATH_INCLUDE_DIR) + string(FIND ${CUDA_${lib_name}_LIBRARY} "math_libs" math_libs) + if(NOT ${math_libs} EQUAL -1) + target_include_directories(CUDA::${lib_name} SYSTEM INTERFACE "${CUDAToolkit_MATH_INCLUDE_DIR}") + endif() + endif() + set_property(TARGET CUDA::${lib_name} PROPERTY ${CUDA_IMPORT_PROPERTY} "${CUDA_${lib_name}_LIBRARY}") + foreach(dep ${arg_DEPS}) + if(TARGET CUDA::${dep}) + target_link_libraries(CUDA::${lib_name} INTERFACE CUDA::${dep}) + endif() + endforeach() + if(arg_EXTRA_INCLUDE_DIRS) + target_include_directories(CUDA::${lib_name} SYSTEM INTERFACE "${arg_EXTRA_INCLUDE_DIRS}") + endif() + endif() + endfunction() + + if(NOT TARGET CUDA::toolkit) + add_library(CUDA::toolkit IMPORTED INTERFACE) + target_include_directories(CUDA::toolkit SYSTEM INTERFACE "${CUDAToolkit_INCLUDE_DIRS}") + target_link_directories(CUDA::toolkit INTERFACE "${CUDAToolkit_LIBRARY_DIR}") + endif() + + # setup dependencies that are required for cudart/cudart_static when building + # on linux. These are generally only required when using the CUDA toolkit + # when CUDA language is disabled + if(NOT TARGET CUDA::cudart_static_deps) + add_library(CUDA::cudart_static_deps IMPORTED INTERFACE) + if(UNIX AND (CMAKE_C_COMPILER OR CMAKE_CXX_COMPILER)) + find_package(Threads REQUIRED) + target_link_libraries(CUDA::cudart_static_deps INTERFACE Threads::Threads ${CMAKE_DL_LIBS}) + endif() + + if(UNIX AND NOT APPLE AND NOT (CMAKE_SYSTEM_NAME STREQUAL "QNX")) + # On Linux, you must link against librt when using the static cuda runtime. 
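The find_library(rt) call that follows implements that requirement, which is why linking the static runtime stays a one-liner for consumers: CUDA::cudart_static depends on CUDA::cudart_static_deps, which carries Threads::Threads, ${CMAKE_DL_LIBS}, and librt transitively on Linux. A hedged sketch of the consuming side (my_kernel_lib and kernels.cu are assumed names):

# Hedged sketch; assumes a project with the CUDA language enabled.
find_package(CUDAToolkit REQUIRED)
add_library(my_kernel_lib STATIC kernels.cu)
# librt, dl, and pthreads arrive transitively via CUDA::cudart_static_deps.
target_link_libraries(my_kernel_lib PRIVATE CUDA::cudart_static)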
+ find_library(CUDAToolkit_rt_LIBRARY rt) + mark_as_advanced(CUDAToolkit_rt_LIBRARY) + if(NOT CUDAToolkit_rt_LIBRARY) + message(WARNING "Could not find librt library, needed by CUDA::cudart_static") + else() + target_link_libraries(CUDA::cudart_static_deps INTERFACE ${CUDAToolkit_rt_LIBRARY}) + endif() + endif() + endif() + + _CUDAToolkit_find_and_add_import_lib(cuda_driver ALT cuda DEPS cudart_static_deps) + _CUDAToolkit_find_and_add_import_lib(cudart DEPS cudart_static_deps) + _CUDAToolkit_find_and_add_import_lib(cudart_static DEPS cudart_static_deps) + + if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 12.0.0) + _CUDAToolkit_find_and_add_import_lib(nvJitLink) + _CUDAToolkit_find_and_add_import_lib(nvJitLink_static DEPS cudart_static_deps) + endif() + + if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 12.4.0) + _CUDAToolkit_find_and_add_import_lib(nvfatbin DEPS cudart_static_deps) + _CUDAToolkit_find_and_add_import_lib(nvfatbin_static DEPS cudart_static_deps) + endif() + + _CUDAToolkit_find_and_add_import_lib(culibos) # it's a static library + foreach (cuda_lib cublasLt cufft nvjpeg) + _CUDAToolkit_find_and_add_import_lib(${cuda_lib}) + _CUDAToolkit_find_and_add_import_lib(${cuda_lib}_static DEPS cudart_static_deps culibos) + endforeach() + foreach (cuda_lib curand nppc) + _CUDAToolkit_find_and_add_import_lib(${cuda_lib}) + _CUDAToolkit_find_and_add_import_lib(${cuda_lib}_static DEPS culibos) + endforeach() + + _CUDAToolkit_find_and_add_import_lib(cusparse DEPS nvJitLink) + _CUDAToolkit_find_and_add_import_lib(cusparse_static DEPS nvJitLink_static culibos) + + if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 11.0.0) + # cublas depends on cublasLt + # https://docs.nvidia.com/cuda/archive/11.0/cublas#static-library + _CUDAToolkit_find_and_add_import_lib(cublas DEPS cublasLt culibos) + _CUDAToolkit_find_and_add_import_lib(cublas_static DEPS cublasLt_static culibos) + else() + _CUDAToolkit_find_and_add_import_lib(cublas DEPS culibos) + _CUDAToolkit_find_and_add_import_lib(cublas_static DEPS culibos) + endif() + + if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 11.4) + _CUDAToolkit_find_and_add_import_lib(cuFile ALT cufile DEPS culibos) + _CUDAToolkit_find_and_add_import_lib(cuFile_static ALT cufile_static DEPS culibos) + + _CUDAToolkit_find_and_add_import_lib(cuFile_rdma ALT cufile_rdma DEPS cuFile culibos) + _CUDAToolkit_find_and_add_import_lib(cuFile_rdma_static ALT cufile_rdma_static DEPS cuFile_static culibos) + endif() + + if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 11.6) + _CUDAToolkit_find_and_add_import_lib(cudla) + endif() + + + # cuFFTW depends on cuFFT + _CUDAToolkit_find_and_add_import_lib(cufftw DEPS cufft) + _CUDAToolkit_find_and_add_import_lib(cufftw_static DEPS cufft_static) + if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 9.2) + _CUDAToolkit_find_and_add_import_lib(cufft_static_nocallback DEPS culibos) + endif() + + # cuSOLVER depends on cuBLAS, and cuSPARSE + set(cusolver_deps cublas cusparse) + set(cusolver_static_deps cublas_static cusparse_static culibos) + if(CUDAToolkit_VERSION VERSION_GREATER 11.2.1) + # cusolver depends on libcusolver_metis and cublasLt + # https://docs.nvidia.com/cuda/archive/11.2.2/cusolver#link-dependency + list(APPEND cusolver_deps cublasLt) + _CUDAToolkit_find_and_add_import_lib(cusolver_metis_static ALT metis_static) # implementation detail static lib + list(APPEND cusolver_static_deps cusolver_metis_static cublasLt_static) + endif() + if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 10.1.2) + # cusolver depends on liblapack_static.a starting with CUDA 10.1 update 
2, + # https://docs.nvidia.com/cuda/archive/11.5.0/cusolver#static-link-lapack + _CUDAToolkit_find_and_add_import_lib(cusolver_lapack_static ALT lapack_static) # implementation detail static lib + list(APPEND cusolver_static_deps cusolver_lapack_static) + endif() + _CUDAToolkit_find_and_add_import_lib(cusolver DEPS ${cusolver_deps}) + _CUDAToolkit_find_and_add_import_lib(cusolver_static DEPS ${cusolver_static_deps}) + unset(cusolver_deps) + unset(cusolver_static_deps) + + # nvGRAPH depends on cuRAND, and cuSOLVER. + _CUDAToolkit_find_and_add_import_lib(nvgraph DEPS curand cusolver) + _CUDAToolkit_find_and_add_import_lib(nvgraph_static DEPS curand_static cusolver_static) + + # Process the majority of the NPP libraries. + foreach (cuda_lib nppial nppicc nppidei nppif nppig nppim nppist nppitc npps nppicom nppisu) + _CUDAToolkit_find_and_add_import_lib(${cuda_lib} DEPS nppc) + _CUDAToolkit_find_and_add_import_lib(${cuda_lib}_static DEPS nppc_static) + endforeach() + + find_path(CUDAToolkit_CUPTI_INCLUDE_DIR cupti.h PATHS + "${CUDAToolkit_ROOT_DIR}/extras/CUPTI/include" + ${CUDAToolkit_INCLUDE_DIRS} + PATH_SUFFIXES "../extras/CUPTI/include" + "../../../extras/CUPTI/include" + NO_DEFAULT_PATH) + mark_as_advanced(CUDAToolkit_CUPTI_INCLUDE_DIR) + + if(CUDAToolkit_CUPTI_INCLUDE_DIR) + set(_cmake_cupti_extra_paths extras/CUPTI/lib64/ + extras/CUPTI/lib/ + ../extras/CUPTI/lib64/ + ../extras/CUPTI/lib/) + _CUDAToolkit_find_and_add_import_lib(cupti + EXTRA_PATH_SUFFIXES ${_cmake_cupti_extra_paths} + EXTRA_INCLUDE_DIRS "${CUDAToolkit_CUPTI_INCLUDE_DIR}") + _CUDAToolkit_find_and_add_import_lib(cupti_static + EXTRA_PATH_SUFFIXES ${_cmake_cupti_extra_paths} + EXTRA_INCLUDE_DIRS "${CUDAToolkit_CUPTI_INCLUDE_DIR}") + if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 10.2.0) + _CUDAToolkit_find_and_add_import_lib(nvperf_host + EXTRA_PATH_SUFFIXES ${_cmake_cupti_extra_paths} + EXTRA_INCLUDE_DIRS "${CUDAToolkit_CUPTI_INCLUDE_DIR}") + _CUDAToolkit_find_and_add_import_lib(nvperf_host_static + EXTRA_PATH_SUFFIXES ${_cmake_cupti_extra_paths} + EXTRA_INCLUDE_DIRS "${CUDAToolkit_CUPTI_INCLUDE_DIR}") + _CUDAToolkit_find_and_add_import_lib(nvperf_target + EXTRA_PATH_SUFFIXES ${_cmake_cupti_extra_paths} + EXTRA_INCLUDE_DIRS "${CUDAToolkit_CUPTI_INCLUDE_DIR}") + endif() + if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 11.3.0) + _CUDAToolkit_find_and_add_import_lib(pcsamplingutil + EXTRA_PATH_SUFFIXES ${_cmake_cupti_extra_paths} + EXTRA_INCLUDE_DIRS "${CUDAToolkit_CUPTI_INCLUDE_DIR}") + endif() + endif() + + if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 11.1.0) + if(NOT TARGET CUDA::nvptxcompiler_static) + _CUDAToolkit_find_and_add_import_lib(nvptxcompiler_static) + if(TARGET CUDA::nvptxcompiler_static) + target_link_libraries(CUDA::nvptxcompiler_static INTERFACE CUDA::cudart_static_deps) + endif() + endif() + endif() + + _CUDAToolkit_find_and_add_import_lib(nvrtc_builtins ALT nvrtc-builtins) + _CUDAToolkit_find_and_add_import_lib(nvrtc) + if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 11.5.0) + _CUDAToolkit_find_and_add_import_lib(nvrtc_builtins_static ALT nvrtc-builtins_static) + if(NOT TARGET CUDA::nvrtc_static) + _CUDAToolkit_find_and_add_import_lib(nvrtc_static DEPS nvrtc_builtins_static nvptxcompiler_static) + if(TARGET CUDA::nvrtc_static AND WIN32 AND NOT (BORLAND OR MINGW OR CYGWIN)) + target_link_libraries(CUDA::nvrtc_static INTERFACE Ws2_32.lib) + endif() + endif() + endif() + + _CUDAToolkit_find_and_add_import_lib(nvml ALT nvidia-ml nvml) + _CUDAToolkit_find_and_add_import_lib(nvml_static ONLY_SEARCH_FOR libnvidia-ml.a 
libnvml.a) + + if(WIN32) + # nvtools can be installed outside the CUDA toolkit directory + # so prefer the NVTOOLSEXT_PATH windows only environment variable + # In addition on windows the most common name is nvToolsExt64_1 + find_library(CUDA_nvToolsExt_LIBRARY + NAMES nvToolsExt64_1 nvToolsExt64 nvToolsExt + PATHS ENV NVTOOLSEXT_PATH + ENV CUDA_PATH + PATH_SUFFIXES lib/x64 lib + ) + endif() + _CUDAToolkit_find_and_add_import_lib(nvToolsExt ALT nvToolsExt64) + + if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 10.0) + # nvToolsExt is deprecated since nvtx3 introduction. + # Warn only if the project requires a sufficiently new CMake to make migration possible. + if(TARGET CUDA::nvToolsExt AND CMAKE_MINIMUM_REQUIRED_VERSION VERSION_GREATER_EQUAL 3.25) + set_property(TARGET CUDA::nvToolsExt PROPERTY DEPRECATION "nvToolsExt has been superseded by nvtx3 since CUDA 10.0 and CMake 3.25. Use CUDA::nvtx3 and include instead.") + endif() + + # Header-only variant. Uses dlopen(). + if(NOT TARGET CUDA::nvtx3) + add_library(CUDA::nvtx3 INTERFACE IMPORTED) + target_include_directories(CUDA::nvtx3 SYSTEM INTERFACE "${CUDAToolkit_INCLUDE_DIRS}") + target_link_libraries(CUDA::nvtx3 INTERFACE ${CMAKE_DL_LIBS}) + endif() + endif() + + _CUDAToolkit_find_and_add_import_lib(OpenCL) +endif() + +if(_CUDAToolkit_Pop_ROOT_PATH) + list(REMOVE_AT CMAKE_FIND_ROOT_PATH 0) + unset(_CUDAToolkit_Pop_ROOT_PATH) +endif() + +unset(_CUDAToolkit_win_search_dirs) +unset(_CUDAToolkit_win_stub_search_dirs) diff --git a/cpp/cmake/Modules/FindcuFile.cmake b/cpp/cmake/Modules/FindcuFile.cmake deleted file mode 100644 index 1df4f12d230..00000000000 --- a/cpp/cmake/Modules/FindcuFile.cmake +++ /dev/null @@ -1,120 +0,0 @@ -# ============================================================================= -# Copyright (c) 2020-2022, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. See the License for the specific language governing permissions and limitations under -# the License. - -#[=======================================================================[.rst: -FindcuFile ----------- - -Find cuFile headers and libraries. - -Imported Targets -^^^^^^^^^^^^^^^^ - -``cufile::cuFile`` - The cuFile library, if found. -``cufile::cuFileRDMA`` - The cuFile RDMA library, if found. - -Result Variables -^^^^^^^^^^^^^^^^ - -This will define the following variables in your project: - -``cuFile_FOUND`` - true if (the requested version of) cuFile is available. -``cuFile_VERSION`` - the version of cuFile. -``cuFile_LIBRARIES`` - the libraries to link against to use cuFile. -``cuFileRDMA_LIBRARIES`` - the libraries to link against to use cuFile RDMA. -``cuFile_INCLUDE_DIRS`` - where to find the cuFile headers. 
-``cuFile_COMPILE_OPTIONS`` - this should be passed to target_compile_options(), if the - target is not used for linking - -#]=======================================================================] - -# use pkg-config to get the directories and then use these values in the FIND_PATH() and -# FIND_LIBRARY() calls -find_package(PkgConfig QUIET) -pkg_check_modules(PKG_cuFile QUIET cuFile) - -set(cuFile_COMPILE_OPTIONS ${PKG_cuFile_CFLAGS_OTHER}) -set(cuFile_VERSION ${PKG_cuFile_VERSION}) - -# Find the location of the CUDA Toolkit -find_package(CUDAToolkit QUIET) -find_path( - cuFile_INCLUDE_DIR - NAMES cufile.h - HINTS ${PKG_cuFile_INCLUDE_DIRS} ${CUDAToolkit_INCLUDE_DIRS} -) - -find_library( - cuFile_LIBRARY - NAMES cufile - HINTS ${PKG_cuFile_LIBRARY_DIRS} ${CUDAToolkit_LIBRARY_DIR} -) - -find_library( - cuFileRDMA_LIBRARY - NAMES cufile_rdma - HINTS ${PKG_cuFile_LIBRARY_DIRS} ${CUDAToolkit_LIBRARY_DIR} -) - -include(FindPackageHandleStandardArgs) -find_package_handle_standard_args( - cuFile - FOUND_VAR cuFile_FOUND - REQUIRED_VARS cuFile_LIBRARY cuFileRDMA_LIBRARY cuFile_INCLUDE_DIR - VERSION_VAR cuFile_VERSION -) - -if(cuFile_INCLUDE_DIR AND NOT TARGET cufile::cuFile_interface) - add_library(cufile::cuFile_interface INTERFACE IMPORTED GLOBAL) - target_include_directories( - cufile::cuFile_interface INTERFACE "$" - ) - target_compile_options(cufile::cuFile_interface INTERFACE "${cuFile_COMPILE_OPTIONS}") - target_compile_definitions(cufile::cuFile_interface INTERFACE CUFILE_FOUND) -endif() - -if(cuFile_FOUND AND NOT TARGET cufile::cuFile) - add_library(cufile::cuFile UNKNOWN IMPORTED GLOBAL) - set_target_properties( - cufile::cuFile - PROPERTIES IMPORTED_LOCATION "${cuFile_LIBRARY}" - INTERFACE_COMPILE_OPTIONS "${cuFile_COMPILE_OPTIONS}" - INTERFACE_INCLUDE_DIRECTORIES "${cuFile_INCLUDE_DIR}" - ) -endif() - -if(cuFile_FOUND AND NOT TARGET cufile::cuFileRDMA) - add_library(cufile::cuFileRDMA UNKNOWN IMPORTED GLOBAL) - set_target_properties( - cufile::cuFileRDMA - PROPERTIES IMPORTED_LOCATION "${cuFileRDMA_LIBRARY}" - INTERFACE_COMPILE_OPTIONS "${cuFile_COMPILE_OPTIONS}" - INTERFACE_INCLUDE_DIRECTORIES "${cuFile_INCLUDE_DIR}" - ) -endif() - -mark_as_advanced(cuFile_LIBRARY cuFileRDMA_LIBRARY cuFile_INCLUDE_DIR) - -if(cuFile_FOUND) - set(cuFile_LIBRARIES ${cuFile_LIBRARY}) - set(cuFileRDMA_LIBRARIES ${cuFileRDMA_LIBRARY}) - set(cuFile_INCLUDE_DIRS ${cuFile_INCLUDE_DIR}) -endif() diff --git a/cpp/cmake/thirdparty/get_flatbuffers.cmake b/cpp/cmake/thirdparty/get_flatbuffers.cmake index b0ece38b8ef..39521049c85 100644 --- a/cpp/cmake/thirdparty/get_flatbuffers.cmake +++ b/cpp/cmake/thirdparty/get_flatbuffers.cmake @@ -15,13 +15,19 @@ # Use CPM to find or clone flatbuffers function(find_and_configure_flatbuffers VERSION) + if(NOT BUILD_SHARED_LIBS) + set(_exclude_from_all EXCLUDE_FROM_ALL FALSE) + else() + set(_exclude_from_all EXCLUDE_FROM_ALL TRUE) + endif() + rapids_cpm_find( flatbuffers ${VERSION} GLOBAL_TARGETS flatbuffers CPM_ARGS GIT_REPOSITORY https://github.com/google/flatbuffers.git GIT_TAG v${VERSION} - GIT_SHALLOW TRUE + GIT_SHALLOW TRUE ${_exclude_from_all} ) rapids_export_find_package_root( diff --git a/cpp/cmake/thirdparty/get_kvikio.cmake b/cpp/cmake/thirdparty/get_kvikio.cmake index 20712beec41..73f875b46c2 100644 --- a/cpp/cmake/thirdparty/get_kvikio.cmake +++ b/cpp/cmake/thirdparty/get_kvikio.cmake @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2022-2023, NVIDIA CORPORATION. 
+# Copyright (c) 2022-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -16,13 +16,13 @@ function(find_and_configure_kvikio VERSION) rapids_cpm_find( - KvikIO ${VERSION} + kvikio ${VERSION} GLOBAL_TARGETS kvikio::kvikio CPM_ARGS GIT_REPOSITORY https://github.com/rapidsai/kvikio.git GIT_TAG branch-${VERSION} GIT_SHALLOW TRUE SOURCE_SUBDIR cpp - OPTIONS "KvikIO_BUILD_EXAMPLES OFF" + OPTIONS "KvikIO_BUILD_EXAMPLES OFF" "KvikIO_REMOTE_SUPPORT ${CUDF_KVIKIO_REMOTE_IO}" ) include("${rapids-cmake-dir}/export/find_package_root.cmake") diff --git a/cpp/cmake/thirdparty/get_nanoarrow.cmake b/cpp/cmake/thirdparty/get_nanoarrow.cmake index 8df1b431095..c440643037b 100644 --- a/cpp/cmake/thirdparty/get_nanoarrow.cmake +++ b/cpp/cmake/thirdparty/get_nanoarrow.cmake @@ -14,16 +14,24 @@ # This function finds nanoarrow and sets any additional necessary environment variables. function(find_and_configure_nanoarrow) + include(${rapids-cmake-dir}/cpm/package_override.cmake) + + set(cudf_patch_dir "${CMAKE_CURRENT_FUNCTION_LIST_DIR}/patches") + rapids_cpm_package_override("${cudf_patch_dir}/nanoarrow_override.json") + + if(NOT BUILD_SHARED_LIBS) + set(_exclude_from_all EXCLUDE_FROM_ALL FALSE) + else() + set(_exclude_from_all EXCLUDE_FROM_ALL TRUE) + endif() + # Currently we need to always build nanoarrow so we don't pickup a previous installed version set(CPM_DOWNLOAD_nanoarrow ON) rapids_cpm_find( nanoarrow 0.6.0.dev GLOBAL_TARGETS nanoarrow CPM_ARGS - GIT_REPOSITORY https://github.com/apache/arrow-nanoarrow.git - GIT_TAG 1e2664a70ec14907409cadcceb14d79b9670bcdb - GIT_SHALLOW FALSE - OPTIONS "BUILD_SHARED_LIBS OFF" "NANOARROW_NAMESPACE cudf" + OPTIONS "BUILD_SHARED_LIBS OFF" "NANOARROW_NAMESPACE cudf" ${_exclude_from_all} ) set_target_properties(nanoarrow PROPERTIES POSITION_INDEPENDENT_CODE ON) rapids_export_find_package_root(BUILD nanoarrow "${nanoarrow_BINARY_DIR}" EXPORT_SET cudf-exports) diff --git a/cpp/cmake/thirdparty/get_nvcomp.cmake b/cpp/cmake/thirdparty/get_nvcomp.cmake index 41bbf44abc8..33b1b45fb44 100644 --- a/cpp/cmake/thirdparty/get_nvcomp.cmake +++ b/cpp/cmake/thirdparty/get_nvcomp.cmake @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2021-2022, NVIDIA CORPORATION. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. 
You may obtain a copy of the License at @@ -16,11 +16,11 @@ function(find_and_configure_nvcomp) include(${rapids-cmake-dir}/cpm/nvcomp.cmake) - rapids_cpm_nvcomp( - BUILD_EXPORT_SET cudf-exports - INSTALL_EXPORT_SET cudf-exports - USE_PROPRIETARY_BINARY ${CUDF_USE_PROPRIETARY_NVCOMP} - ) + set(export_args) + if(CUDF_EXPORT_NVCOMP) + set(export_args BUILD_EXPORT_SET cudf-exports INSTALL_EXPORT_SET cudf-exports) + endif() + rapids_cpm_nvcomp(${export_args} USE_PROPRIETARY_BINARY ${CUDF_USE_PROPRIETARY_NVCOMP}) # Per-thread default stream if(TARGET nvcomp AND CUDF_USE_PER_THREAD_DEFAULT_STREAM) diff --git a/cpp/cmake/thirdparty/patches/nanoarrow_clang_tidy_compliance.diff b/cpp/cmake/thirdparty/patches/nanoarrow_clang_tidy_compliance.diff new file mode 100644 index 00000000000..e9a36fcb567 --- /dev/null +++ b/cpp/cmake/thirdparty/patches/nanoarrow_clang_tidy_compliance.diff @@ -0,0 +1,38 @@ +diff --git a/src/nanoarrow/common/inline_buffer.h b/src/nanoarrow/common/inline_buffer.h +index caa6be4..70ec8a2 100644 +--- a/src/nanoarrow/common/inline_buffer.h ++++ b/src/nanoarrow/common/inline_buffer.h +@@ -347,7 +347,7 @@ static inline void _ArrowBitsUnpackInt32(const uint8_t word, int32_t* out) { + } + + static inline void _ArrowBitmapPackInt8(const int8_t* values, uint8_t* out) { +- *out = (uint8_t)(values[0] | ((values[1] + 0x1) & 0x2) | ((values[2] + 0x3) & 0x4) | ++ *out = (uint8_t)(values[0] | ((values[1] + 0x1) & 0x2) | ((values[2] + 0x3) & 0x4) | // NOLINT + ((values[3] + 0x7) & 0x8) | ((values[4] + 0xf) & 0x10) | + ((values[5] + 0x1f) & 0x20) | ((values[6] + 0x3f) & 0x40) | + ((values[7] + 0x7f) & 0x80)); +@@ -471,13 +471,13 @@ static inline void ArrowBitsSetTo(uint8_t* bits, int64_t start_offset, int64_t l + // set bits within a single byte + const uint8_t only_byte_mask = + i_end % 8 == 0 ? 
first_byte_mask : (uint8_t)(first_byte_mask | last_byte_mask); +- bits[bytes_begin] &= only_byte_mask; ++ bits[bytes_begin] &= only_byte_mask; // NOLINT + bits[bytes_begin] |= (uint8_t)(fill_byte & ~only_byte_mask); + return; + } + + // set/clear trailing bits of first byte +- bits[bytes_begin] &= first_byte_mask; ++ bits[bytes_begin] &= first_byte_mask; // NOLINT + bits[bytes_begin] |= (uint8_t)(fill_byte & ~first_byte_mask); + + if (bytes_end - bytes_begin > 2) { +@@ -637,7 +637,7 @@ static inline void ArrowBitmapAppendInt8Unsafe(struct ArrowBitmap* bitmap, + n_remaining -= n_full_bytes * 8; + if (n_remaining > 0) { + // Zero out the last byte +- *out_cursor = 0x00; ++ *out_cursor = 0x00; // NOLINT + for (int i = 0; i < n_remaining; i++) { + ArrowBitSetTo(bitmap->buffer.data, out_i_cursor++, values_cursor[i]); + } diff --git a/cpp/cmake/thirdparty/patches/nanoarrow_override.json b/cpp/cmake/thirdparty/patches/nanoarrow_override.json new file mode 100644 index 00000000000..d529787e7c8 --- /dev/null +++ b/cpp/cmake/thirdparty/patches/nanoarrow_override.json @@ -0,0 +1,18 @@ + +{ + "packages" : { + "nanoarrow" : { + "version" : "0.6.0.dev", + "git_url" : "https://github.com/apache/arrow-nanoarrow.git", + "git_tag" : "1e2664a70ec14907409cadcceb14d79b9670bcdb", + "git_shallow" : false, + "patches" : [ + { + "file" : "${current_json_dir}/nanoarrow_clang_tidy_compliance.diff", + "issue" : "https://github.com/apache/arrow-nanoarrow/issues/537", + "fixed_in" : "" + } + ] + } + } +} diff --git a/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md b/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md index fce8adb4c06..1c1052487f2 100644 --- a/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md +++ b/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md @@ -370,7 +370,7 @@ any type that cudf supports. For example, a `list_scalar` representing a list of |Value type|Scalar class|Notes| |-|-|-| |fixed-width|`fixed_width_scalar`| `T` can be any fixed-width type| -|numeric|`numeric_scalar` | `T` can be `int8_t`, `int16_t`, `int32_t`, `int_64_t`, `float` or `double`| +|numeric|`numeric_scalar` | `T` can be `int8_t`, `int16_t`, `int32_t`, `int64_t`, `float` or `double`| |fixed-point|`fixed_point_scalar` | `T` can be `numeric::decimal32` or `numeric::decimal64`| |timestamp|`timestamp_scalar` | `T` can be `timestamp_D`, `timestamp_s`, etc.| |duration|`duration_scalar` | `T` can be `duration_D`, `duration_s`, etc.| @@ -1483,6 +1483,17 @@ struct, and therefore `cudf::struct_view` is the data type of a `cudf::column` o `cudf::type_dispatcher` dispatches to the `struct_view` data type when invoked on a `STRUCT` column. +# Empty Columns + +libcudf columns support empty, typed content. These columns have no data and no validity mask. +Empty strings or lists columns may or may not contain a child offsets column. +It is undefined behavior (UB) to access the offsets child of an empty strings or lists column. +Nested columns like lists and structs may require other child columns to provide the +nested structure of the empty types. + +Use `cudf::make_empty_column()` to create fixed-width and strings columns. +Use `cudf::empty_like()` to create an empty column from an existing `cudf::column_view`.
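For illustration, here is a minimal sketch of the two factories named above (the header locations are taken from the libcudf sources and are not part of this hunk; error handling omitted):

```c++
#include <cudf/column/column_factories.hpp>  // cudf::make_empty_column
#include <cudf/copying.hpp>                  // cudf::empty_like

// An empty INT32 column: zero rows, no data buffer, no null mask.
std::unique_ptr<cudf::column> empty_int32()
{
  return cudf::make_empty_column(cudf::data_type{cudf::type_id::INT32});
}

// An empty column with the same type (and child structure) as an existing view.
std::unique_ptr<cudf::column> empty_copy(cudf::column_view const& input)
{
  return cudf::empty_like(input);
}
```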
+ # cuIO: file reading and writing cuIO is a component of libcudf that provides GPU-accelerated reading and writing of data file diff --git a/cpp/doxygen/regex.md b/cpp/doxygen/regex.md index 6d1c91a5752..6902b1948bd 100644 --- a/cpp/doxygen/regex.md +++ b/cpp/doxygen/regex.md @@ -8,6 +8,7 @@ This page specifies which regular expression (regex) features are currently supp - cudf::strings::extract() - cudf::strings::extract_all_record() - cudf::strings::findall() +- cudf::strings::find_re() - cudf::strings::replace_re() - cudf::strings::replace_with_backrefs() - cudf::strings::split_re() diff --git a/cpp/examples/parquet_io/CMakeLists.txt b/cpp/examples/parquet_io/CMakeLists.txt index d8e9205ffd4..a7d0146b170 100644 --- a/cpp/examples/parquet_io/CMakeLists.txt +++ b/cpp/examples/parquet_io/CMakeLists.txt @@ -16,10 +16,23 @@ project( include(../fetch_dependencies.cmake) -# Configure your project here +add_library(parquet_io_utils OBJECT common_utils.cpp io_source.cpp) +target_compile_features(parquet_io_utils PRIVATE cxx_std_17) +target_link_libraries(parquet_io_utils PRIVATE cudf::cudf) + +# Build and install parquet_io add_executable(parquet_io parquet_io.cpp) -target_link_libraries(parquet_io PRIVATE cudf::cudf) +target_link_libraries(parquet_io PRIVATE cudf::cudf nvToolsExt $<TARGET_OBJECTS:parquet_io_utils>) target_compile_features(parquet_io PRIVATE cxx_std_17) - install(TARGETS parquet_io DESTINATION bin/examples/libcudf) + +# Build and install parquet_io_multithreaded +add_executable(parquet_io_multithreaded parquet_io_multithreaded.cpp) +target_link_libraries( + parquet_io_multithreaded PRIVATE cudf::cudf nvToolsExt $<TARGET_OBJECTS:parquet_io_utils> +) +target_compile_features(parquet_io_multithreaded PRIVATE cxx_std_17) +install(TARGETS parquet_io_multithreaded DESTINATION bin/examples/libcudf) + +# Install the example.parquet file install(FILES ${CMAKE_CURRENT_LIST_DIR}/example.parquet DESTINATION bin/examples/libcudf) diff --git a/cpp/examples/parquet_io/parquet_io.hpp b/cpp/examples/parquet_io/common_utils.cpp similarity index 50% rename from cpp/examples/parquet_io/parquet_io.hpp rename to cpp/examples/parquet_io/common_utils.cpp index e27cbec4fce..a79ca48af86 100644 --- a/cpp/examples/parquet_io/parquet_io.hpp +++ b/cpp/examples/parquet_io/common_utils.cpp @@ -14,30 +14,27 @@ * limitations under the License. */ -#pragma once +#include "common_utils.hpp" -#include +#include #include #include #include -#include -#include #include #include #include #include -#include -#include +#include #include /** - * @brief Create memory resource for libcudf functions + * @file common_utils.cpp + * @brief Definitions for common utilities for `parquet_io` examples * - * @param pool Whether to use a pool memory resource.
- * @return Memory resource instance */ + std::shared_ptr create_memory_resource(bool is_pool_used) { auto cuda_mr = std::make_shared(); @@ -48,17 +45,11 @@ std::shared_ptr create_memory_resource(bool is_ return cuda_mr; } -/** - * @brief Get encoding type from the keyword - * - * @param name encoding keyword name - * @return corresponding column encoding type - */ -[[nodiscard]] cudf::io::column_encoding get_encoding_type(std::string name) +cudf::io::column_encoding get_encoding_type(std::string name) { using encoding_type = cudf::io::column_encoding; - static const std::unordered_map map = { + static std::unordered_map const map = { {"DEFAULT", encoding_type::USE_DEFAULT}, {"DICTIONARY", encoding_type::DICTIONARY}, {"PLAIN", encoding_type::PLAIN}, @@ -69,26 +60,18 @@ std::shared_ptr create_memory_resource(bool is_ std::transform(name.begin(), name.end(), name.begin(), ::toupper); if (map.find(name) != map.end()) { return map.at(name); } - throw std::invalid_argument("FATAL: " + std::string(name) + + throw std::invalid_argument(name + " is not a valid encoding type.\n\n" "Available encoding types: DEFAULT, DICTIONARY, PLAIN,\n" "DELTA_BINARY_PACKED, DELTA_LENGTH_BYTE_ARRAY,\n" - "DELTA_BYTE_ARRAY\n" - "\n" - "Exiting...\n"); + "DELTA_BYTE_ARRAY\n\n"); } -/** - * @brief Get compression type from the keyword - * - * @param name compression keyword name - * @return corresponding compression type - */ -[[nodiscard]] cudf::io::compression_type get_compression_type(std::string name) +cudf::io::compression_type get_compression_type(std::string name) { using compression_type = cudf::io::compression_type; - static const std::unordered_map map = { + static std::unordered_map const map = { {"NONE", compression_type::NONE}, {"AUTO", compression_type::AUTO}, {"SNAPPY", compression_type::SNAPPY}, @@ -97,30 +80,58 @@ std::shared_ptr create_memory_resource(bool is_ std::transform(name.begin(), name.end(), name.begin(), ::toupper); if (map.find(name) != map.end()) { return map.at(name); } - throw std::invalid_argument("FATAL: " + std::string(name) + + throw std::invalid_argument(name + " is not a valid compression type.\n\n" - "Available compression_type types: NONE, AUTO, SNAPPY,\n" - "LZ4, ZSTD\n" - "\n" - "Exiting...\n"); + "Available compression types: NONE, AUTO, SNAPPY,\n" + "LZ4, ZSTD\n\n"); } -/** - * @brief Get the optional page size stat frequency from they keyword - * - * @param use_stats keyword affirmation string such as: Y, T, YES, TRUE, ON - * @return optional page statistics frequency set to full (STATISTICS_COLUMN) - */ -[[nodiscard]] std::optional get_page_size_stats(std::string use_stats) +bool get_boolean(std::string input) { - std::transform(use_stats.begin(), use_stats.end(), use_stats.begin(), ::toupper); + std::transform(input.begin(), input.end(), input.begin(), ::toupper); // Check if the input string matches to any of the following - if (not use_stats.compare("ON") or not use_stats.compare("TRUE") or - not use_stats.compare("YES") or not use_stats.compare("Y") or not use_stats.compare("T")) { - // Full column and offset indices - STATISTICS_COLUMN - return std::make_optional(cudf::io::statistics_freq::STATISTICS_COLUMN); + return input == "ON" or input == "TRUE" or input == "YES" or input == "Y" or input == "T"; +} + +void check_tables_equal(cudf::table_view const& lhs_table, cudf::table_view const& rhs_table) +{ + try { + // Left anti-join the original and transcoded tables + // identical tables should not throw an exception and + // return an empty indices vector + auto const 
indices = cudf::left_anti_join(lhs_table, rhs_table, cudf::null_equality::EQUAL); + + // No exception thrown, check indices + auto const valid = indices->size() == 0; + std::cout << "Tables identical: " << valid << "\n\n"; + } catch (std::exception& e) { + std::cerr << e.what() << std::endl << std::endl; + throw std::runtime_error("Tables identical: false\n\n"); } +} - return std::nullopt; +std::unique_ptr<cudf::table> concatenate_tables(std::vector<std::unique_ptr<cudf::table>> tables, + rmm::cuda_stream_view stream) +{ + if (tables.size() == 1) { return std::move(tables[0]); } + + std::vector<cudf::table_view> table_views; + table_views.reserve(tables.size()); + std::transform( + tables.begin(), tables.end(), std::back_inserter(table_views), [&](auto const& tbl) { + return tbl->view(); + }); + // Construct the final table + return cudf::concatenate(table_views, stream); +} + +std::string current_date_and_time() +{ + auto const time = std::chrono::system_clock::to_time_t(std::chrono::system_clock::now()); + auto const local_time = *std::localtime(&time); + // Stringstream to format the date and time + std::stringstream ss; + ss << std::put_time(&local_time, "%Y-%m-%d-%H-%M-%S"); + return ss.str(); } diff --git a/cpp/examples/parquet_io/common_utils.hpp b/cpp/examples/parquet_io/common_utils.hpp new file mode 100644 index 00000000000..12896e61a0d --- /dev/null +++ b/cpp/examples/parquet_io/common_utils.hpp @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +#include +#include + +#include +#include + +/** + * @file common_utils.hpp + * @brief Common utilities for `parquet_io` examples + * + */ + +/** + * @brief Create memory resource for libcudf functions + * + * @param is_pool_used Whether to use a pool memory resource.
+ * @return Memory resource instance + */ +std::shared_ptr<rmm::mr::device_memory_resource> create_memory_resource(bool is_pool_used); + +/** + * @brief Get encoding type from the keyword + * + * @param name encoding keyword name + * @return corresponding column encoding type + */ +[[nodiscard]] cudf::io::column_encoding get_encoding_type(std::string name); + +/** + * @brief Get compression type from the keyword + * + * @param name compression keyword name + * @return corresponding compression type + */ +[[nodiscard]] cudf::io::compression_type get_compression_type(std::string name); + +/** + * @brief Get a boolean from the keyword + * + * @param input keyword affirmation string such as: Y, T, YES, TRUE, ON + * @return true or false + */ +[[nodiscard]] bool get_boolean(std::string input); + +/** + * @brief Check if two tables are identical, throw an error otherwise + * + * @param lhs_table View of lhs table + * @param rhs_table View of rhs table + */ +void check_tables_equal(cudf::table_view const& lhs_table, cudf::table_view const& rhs_table); + +/** + * @brief Concatenate a vector of tables and return the resultant table + * + * @param tables Vector of tables to concatenate + * @param stream CUDA stream to use + * + * @return Unique pointer to the resultant concatenated table. + */ +std::unique_ptr<cudf::table> concatenate_tables(std::vector<std::unique_ptr<cudf::table>> tables, + rmm::cuda_stream_view stream); + +/** + * @brief Returns a string containing current date and time + * + */ +std::string current_date_and_time(); diff --git a/cpp/examples/parquet_io/io_source.cpp b/cpp/examples/parquet_io/io_source.cpp new file mode 100644 index 00000000000..019b3f96474 --- /dev/null +++ b/cpp/examples/parquet_io/io_source.cpp @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "io_source.hpp" + +#include +#include + +#include +#include + +#include + +#include +#include +#include + +rmm::host_async_resource_ref pinned_memory_resource() +{ + static auto mr = rmm::mr::pinned_host_memory_resource{}; + return mr; +} + +io_source_type get_io_source_type(std::string name) +{ + static std::unordered_map<std::string, io_source_type> const map = { + {"FILEPATH", io_source_type::FILEPATH}, + {"HOST_BUFFER", io_source_type::HOST_BUFFER}, + {"PINNED_BUFFER", io_source_type::PINNED_BUFFER}, + {"DEVICE_BUFFER", io_source_type::DEVICE_BUFFER}}; + + std::transform(name.begin(), name.end(), name.begin(), ::toupper); + if (map.find(name) != map.end()) { + return map.at(name); + } else { + throw std::invalid_argument(name + + " is not a valid io source type.
Available: FILEPATH,\n" + "HOST_BUFFER, PINNED_BUFFER, DEVICE_BUFFER.\n\n"); + } +} + +io_source::io_source(std::string_view file_path, io_source_type type, rmm::cuda_stream_view stream) + : pinned_buffer({pinned_memory_resource(), stream}), d_buffer{0, stream} +{ + std::string const file_name{file_path}; + auto const file_size = std::filesystem::file_size(file_name); + + // For filepath make a quick source_info and return early + if (type == io_source_type::FILEPATH) { + source_info = cudf::io::source_info(file_name); + return; + } + + std::ifstream file{file_name, std::ifstream::binary}; + + // Copy file contents to the specified io source buffer + switch (type) { + case io_source_type::HOST_BUFFER: { + h_buffer.resize(file_size); + file.read(h_buffer.data(), file_size); + source_info = cudf::io::source_info(h_buffer.data(), file_size); + break; + } + case io_source_type::PINNED_BUFFER: { + pinned_buffer.resize(file_size); + file.read(pinned_buffer.data(), file_size); + source_info = cudf::io::source_info(pinned_buffer.data(), file_size); + break; + } + case io_source_type::DEVICE_BUFFER: { + h_buffer.resize(file_size); + file.read(h_buffer.data(), file_size); + d_buffer.resize(file_size, stream); + CUDF_CUDA_TRY(cudaMemcpyAsync( + d_buffer.data(), h_buffer.data(), file_size, cudaMemcpyDefault, stream.value())); + + source_info = cudf::io::source_info(d_buffer); + break; + } + default: { + throw std::runtime_error("Encountered unexpected source type\n\n"); + } + } +} diff --git a/cpp/examples/parquet_io/io_source.hpp b/cpp/examples/parquet_io/io_source.hpp new file mode 100644 index 00000000000..a614d348fae --- /dev/null +++ b/cpp/examples/parquet_io/io_source.hpp @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include +#include +#include + +#include + +#include + +/** + * @file io_source.hpp + * @brief Utilities for constructing the specified IO sources from the input parquet files. + * + */ + +/** + * @brief Available IO source types + */ +enum class io_source_type { FILEPATH, HOST_BUFFER, PINNED_BUFFER, DEVICE_BUFFER }; + +/** + * @brief Get io source type from the string keyword argument + * + * @param name io source type keyword name + * @return io source type + */ +[[nodiscard]] io_source_type get_io_source_type(std::string name); + +/** + * @brief Create and return a reference to a static pinned memory pool + * + * @return Reference to a static pinned memory pool + */ +rmm::host_async_resource_ref pinned_memory_resource(); + +/** + * @brief Custom allocator for pinned_buffer via RMM. 
+ */ +template <typename T> +struct pinned_allocator : public std::allocator<T> { + pinned_allocator(rmm::host_async_resource_ref _mr, rmm::cuda_stream_view _stream) + : mr{_mr}, stream{_stream} + { + } + + T* allocate(std::size_t n) + { + auto ptr = mr.allocate_async(n * sizeof(T), rmm::RMM_DEFAULT_HOST_ALIGNMENT, stream); + stream.synchronize(); + return static_cast<T*>(ptr); + } + + void deallocate(T* ptr, std::size_t n) + { + mr.deallocate_async(ptr, n * sizeof(T), rmm::RMM_DEFAULT_HOST_ALIGNMENT, stream); + } + + private: + rmm::host_async_resource_ref mr; + rmm::cuda_stream_view stream; +}; + +/** + * @brief Class to create a cudf::io::source_info of given type from the input parquet file + * + */ +class io_source { + public: + io_source(std::string_view file_path, io_source_type io_type, rmm::cuda_stream_view stream); + + // Get the internal source info + [[nodiscard]] cudf::io::source_info get_source_info() const { return source_info; } + + private: + // alias for pinned vector + template <typename T> + using pinned_vector = thrust::host_vector<T, pinned_allocator<T>>; + cudf::io::source_info source_info; + std::vector<char> h_buffer; + pinned_vector<char> pinned_buffer; + rmm::device_uvector<std::byte> d_buffer; +}; diff --git a/cpp/examples/parquet_io/parquet_io.cpp b/cpp/examples/parquet_io/parquet_io.cpp index 9cda22d0695..c11b8de82b5 100644 --- a/cpp/examples/parquet_io/parquet_io.cpp +++ b/cpp/examples/parquet_io/parquet_io.cpp @@ -14,11 +14,15 @@ * limitations under the License. */ -#include "parquet_io.hpp" - #include "../utilities/timer.hpp" +#include "common_utils.hpp" +#include "io_source.hpp" + +#include +#include +#include -#include +#include /** * @file parquet_io.cpp @@ -81,6 +85,18 @@ void write_parquet(cudf::table_view input, cudf::io::write_parquet(options); } +/** + * @brief Function to print example usage and argument information. + */ +void print_usage() +{ + std::cout << "\nUsage: parquet_io \n" + " \n\n" + "Available encoding types: DEFAULT, DICTIONARY, PLAIN, DELTA_BINARY_PACKED,\n" + " DELTA_LENGTH_BYTE_ARRAY, DELTA_BYTE_ARRAY\n\n" + "Available compression types: NONE, AUTO, SNAPPY, LZ4, ZSTD\n\n"; +} + /** * @brief Main for nested_types examples * @@ -97,29 +113,28 @@ void write_parquet(cudf::table_view input, */ int main(int argc, char const** argv) { - std::string input_filepath; - std::string output_filepath; - cudf::io::column_encoding encoding; - cudf::io::compression_type compression; - std::optional<cudf::io::statistics_freq> page_stats; + std::string input_filepath = "example.parquet"; + std::string output_filepath = "output.parquet"; + cudf::io::column_encoding encoding = get_encoding_type("DELTA_BINARY_PACKED"); + cudf::io::compression_type compression = get_compression_type("ZSTD"); + std::optional<cudf::io::statistics_freq> page_stats = std::nullopt; switch (argc) { - case 1: - input_filepath = "example.parquet"; - output_filepath = "output.parquet"; - encoding = get_encoding_type("DELTA_BINARY_PACKED"); - compression = get_compression_type("ZSTD"); - break; - case 6: page_stats = get_page_size_stats(argv[5]); [[fallthrough]]; - case 5: - input_filepath = argv[1]; - output_filepath = argv[2]; - encoding = get_encoding_type(argv[3]); - compression = get_compression_type(argv[4]); - break; - default: - throw std::runtime_error( - "Either provide all command-line arguments, or none to use defaults\n"); + case 6: + page_stats = get_boolean(argv[5]) + ?
std::make_optional(cudf::io::statistics_freq::STATISTICS_COLUMN) + : std::nullopt; + [[fallthrough]]; + case 5: compression = get_compression_type(argv[4]); [[fallthrough]]; + case 4: encoding = get_encoding_type(argv[3]); [[fallthrough]]; + case 3: output_filepath = argv[2]; [[fallthrough]]; + case 2: // Check if instead of input_paths, the first argument is `-h` or `--help` + if (auto arg = std::string{argv[1]}; arg != "-h" and arg != "--help") { + input_filepath = std::move(arg); + break; + } + [[fallthrough]]; + default: print_usage(); throw std::runtime_error(""); } // Create and use a memory pool @@ -130,18 +145,16 @@ int main(int argc, char const** argv) // Read input parquet file // We do not want to time the initial read time as it may include // time for nvcomp, cufile loading and RMM growth - std::cout << std::endl << "Reading " << input_filepath << "..." << std::endl; + std::cout << "\nReading " << input_filepath << "...\n"; std::cout << "Note: Not timing the initial parquet read as it may include\n" - "times for nvcomp, cufile loading and RMM growth." - << std::endl - << std::endl; + "times for nvcomp, cufile loading and RMM growth.\n\n"; auto [input, metadata] = read_parquet(input_filepath); // Status string to indicate if page stats are set to be written or not auto page_stat_string = (page_stats.has_value()) ? "page stats" : "no page stats"; // Write parquet file with the specified encoding and compression std::cout << "Writing " << output_filepath << " with encoding, compression and " - << page_stat_string << ".." << std::endl; + << page_stat_string << "..\n"; // `timer` is automatically started here cudf::examples::timer timer; @@ -149,7 +162,7 @@ int main(int argc, char const** argv) timer.print_elapsed_millis(); // Read the parquet file written with encoding and compression - std::cout << "Reading " << output_filepath << "..." << std::endl; + std::cout << "Reading " << output_filepath << "...\n"; // Reset the timer timer.reset(); @@ -157,23 +170,7 @@ int main(int argc, char const** argv) timer.print_elapsed_millis(); // Check for validity - try { - // Left anti-join the original and transcoded tables - // identical tables should not throw an exception and - // return an empty indices vector - auto const indices = cudf::left_anti_join(input->view(), - transcoded_input->view(), - cudf::null_equality::EQUAL, - cudf::get_default_stream(), - resource.get()); - - // No exception thrown, check indices - auto const valid = indices->size() == 0; - std::cout << "Transcoding valid: " << std::boolalpha << valid << std::endl; - } catch (std::exception& e) { - std::cerr << e.what() << std::endl << std::endl; - std::cout << "Transcoding valid: false" << std::endl; - } + check_tables_equal(input->view(), transcoded_input->view()); return 0; } diff --git a/cpp/examples/parquet_io/parquet_io_multithreaded.cpp b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp new file mode 100644 index 00000000000..6ad4b862240 --- /dev/null +++ b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp @@ -0,0 +1,466 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../utilities/timer.hpp" +#include "common_utils.hpp" +#include "io_source.hpp" + +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include + +/** + * @file parquet_io_multithreaded.cpp + * @brief Demonstrates reading parquet data from the specified io source using multiple threads. + * + * The input parquet data is provided via files which are converted to the specified io source type + * to be read using multiple threads. Optionally, the parquet data read by each thread can be + * written to corresponding files and the output files checked for validity against the input + * data. + * + * Run: ``parquet_io_multithreaded -h`` to see help with input args and more information. + * + * The following io source types are supported: + * IO source types: FILEPATH, HOST_BUFFER, PINNED_BUFFER, DEVICE_BUFFER + * + */ + +// Type alias for unique ptr to cudf table +using table_t = std::unique_ptr<cudf::table>; + +/** + * @brief Behavior when handling the read tables by multiple threads + */ +enum class read_mode { + NO_CONCATENATE, ///< Only read and discard tables + CONCATENATE_THREAD, ///< Read and concatenate tables from each thread + CONCATENATE_ALL, ///< Read and concatenate everything to a single table +}; + +/** + * @brief Functor for multithreaded parquet reading based on the provided read_mode + */ +template <read_mode read_mode> +struct read_fn { + std::vector<io_source> const& input_sources; + std::vector<table_t>& tables; + int const thread_id; + int const thread_count; + rmm::cuda_stream_view stream; + + void operator()() + { + // Tables read by this thread + std::vector<table_t> tables_this_thread; + + // Sweep the available input files + for (auto curr_file_idx = thread_id; curr_file_idx < input_sources.size(); + curr_file_idx += thread_count) { + auto builder = + cudf::io::parquet_reader_options::builder(input_sources[curr_file_idx].get_source_info()); + auto const options = builder.build(); + if constexpr (read_mode != read_mode::NO_CONCATENATE) { + tables_this_thread.push_back(cudf::io::read_parquet(options, stream).tbl); + } else { + cudf::io::read_parquet(options, stream); + } + } + + // Concatenate the tables read by this thread if not NO_CONCATENATE read_mode. + if constexpr (read_mode != read_mode::NO_CONCATENATE) { + auto table = concatenate_tables(std::move(tables_this_thread), stream); + stream.synchronize_no_throw(); + tables[thread_id] = std::move(table); + } else { + // Just synchronize this stream and exit + stream.synchronize_no_throw(); + } + } +}; + +/** + * @brief Function to setup and launch multithreaded parquet reading. + * + * @tparam read_mode Specifies whether to concatenate and return the actual + * tables or discard them and return an empty vector + * + * @param input_sources List of input sources to read + * @param thread_count Number of threads + * @param stream_pool CUDA stream pool to use for threads + * + * @return Vector of read tables.
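+ * + * @note Input sources are assigned to threads round-robin: thread `t` reads sources `t`, `t + thread_count`, `t + 2 * thread_count`, and so on (see the file-sweep loop in `read_fn::operator()`), so each source is read by exactly one thread per pass.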
+ */ +template <read_mode read_mode> +std::vector<table_t> read_parquet_multithreaded(std::vector<io_source> const& input_sources, + int32_t thread_count, + rmm::cuda_stream_pool& stream_pool) +{ + // Tables read by each thread + std::vector<table_t> tables(thread_count); + + // Table reading tasks + std::vector<read_fn<read_mode>> read_tasks; + read_tasks.reserve(thread_count); + + // Create the read tasks + std::for_each( + thrust::make_counting_iterator(0), thrust::make_counting_iterator(thread_count), [&](auto tid) { + read_tasks.emplace_back( + read_fn<read_mode>{input_sources, tables, tid, thread_count, stream_pool.get_stream()}); + }); + + // Create threads with tasks + std::vector<std::thread> threads; + threads.reserve(thread_count); + for (auto& c : read_tasks) { + threads.emplace_back(c); + } + for (auto& t : threads) { + t.join(); + } + + // If CONCATENATE_ALL mode, then concatenate to a vector of one final table. + if (read_mode == read_mode::CONCATENATE_ALL) { + auto stream = stream_pool.get_stream(); + auto final_tbl = concatenate_tables(std::move(tables), stream); + stream.synchronize(); + tables.clear(); + tables.emplace_back(std::move(final_tbl)); + } + + return tables; +} + +/** + * @brief Functor for multithreaded parquet writing + */ +struct write_fn { + std::string const& output_path; + std::vector<cudf::table_view> const& table_views; + int const thread_id; + rmm::cuda_stream_view stream; + + void operator()() + { + // Create a sink + cudf::io::sink_info const sink_info{output_path + "/table_" + std::to_string(thread_id) + + ".parquet"}; + // Writer options builder + auto builder = cudf::io::parquet_writer_options::builder(sink_info, table_views[thread_id]); + // Create a new metadata for the table + auto table_metadata = cudf::io::table_input_metadata{table_views[thread_id]}; + + builder.metadata(table_metadata); + auto options = builder.build(); + + // Write parquet data + cudf::io::write_parquet(options, stream); + + // Done with this stream + stream.synchronize_no_throw(); + } +}; + +/** + * @brief Function to setup and launch multithreaded writing of parquet files. + * + * @param output_path Path to output directory + * @param tables List of at least `thread_count` table views to be written + * @param thread_count Number of threads to use for writing tables. + * @param stream_pool CUDA stream pool to use for threads + * + */ +void write_parquet_multithreaded(std::string const& output_path, + std::vector<cudf::table_view> const& tables, + int32_t thread_count, + rmm::cuda_stream_pool& stream_pool) +{ + // Table writing tasks + std::vector<write_fn> write_tasks; + write_tasks.reserve(thread_count); + std::for_each( + thrust::make_counting_iterator(0), thrust::make_counting_iterator(thread_count), [&](auto tid) { + write_tasks.emplace_back(write_fn{output_path, tables, tid, stream_pool.get_stream()}); + }); + + // Writer threads + std::vector<std::thread> threads; + threads.reserve(thread_count); + for (auto& c : write_tasks) { + threads.emplace_back(c); + } + for (auto& t : threads) { + t.join(); + } +} + +/** + * @brief Function to print example usage and argument information. + */ +void print_usage() +{ + std::cout + << "\nUsage: parquet_io_multithreaded \n" + " \n" + " \n\n" + "Available IO source types: FILEPATH, HOST_BUFFER, PINNED_BUFFER (Default), " + "DEVICE_BUFFER\n\n" + "Note: Provide as many arguments as you like in the above order. Default values\n" + " for the unprovided arguments will be used.
All input parquet files will\n" + " be converted to the specified IO source type before reading\n\n"; +} + +/** + * @brief Function to process a comma-delimited input paths string into parquet files and/or dirs + * and convert them to the specified io sources. + * + * Process the input path string containing directories (of parquet files) and/or individual + * parquet files into a list of input parquet files, multiply the list by `input_multiplier`, + * make sure to have at least `thread_count` files to provide at least one file per parallel + * thread, and convert the final list of files to a list of `io_source` and return. + * + * @param paths Comma delimited input paths string + * @param input_multiplier Multiplier for the input files list + * @param thread_count Number of threads being used in the example + * @param io_source_type Specified IO source type to convert input files to + * @param stream CUDA stream to use + * + * @return Vector of input sources for the given paths + */ +std::vector<io_source> extract_input_sources(std::string const& paths, + int32_t input_multiplier, + int32_t thread_count, + io_source_type io_source_type, + rmm::cuda_stream_view stream) +{ + // Get the delimited paths to directory and/or files. + std::vector<std::string> const delimited_paths = [&]() { + std::vector<std::string> paths_list; + std::stringstream strstream{paths}; + std::string path; + // Extract the delimited paths. + while (std::getline(strstream, path, char{','})) { + paths_list.push_back(path); + } + return paths_list; + }(); + + // List of parquet files + std::vector<std::string> parquet_files; + std::for_each(delimited_paths.cbegin(), delimited_paths.cend(), [&](auto const& path_string) { + std::filesystem::path path{path_string}; + // If this is a parquet file, add it. + if (std::filesystem::is_regular_file(path)) { + parquet_files.push_back(path_string); + } + // If this is a directory, add all files in the directory. + else if (std::filesystem::is_directory(path)) { + for (auto const& file : std::filesystem::directory_iterator(path)) { + if (std::filesystem::is_regular_file(file.path())) { + parquet_files.push_back(file.path().string()); + } else { + std::cout << "Skipping sub-directory: " << file.path().string() << "\n"; + } + } + } else { + print_usage(); + throw std::runtime_error("Encountered an invalid input path\n"); + } + }); + + // Current size of list of parquet files + auto const initial_size = parquet_files.size(); + if (initial_size == 0) { return {}; } + + // Reserve space + parquet_files.reserve(std::max<size_t>(thread_count, input_multiplier * parquet_files.size())); + + // Append the input files by input_multiplier times + std::for_each(thrust::make_counting_iterator(1), + thrust::make_counting_iterator(input_multiplier), + [&](auto i) { + parquet_files.insert(parquet_files.end(), + parquet_files.begin(), + parquet_files.begin() + initial_size); + }); + + // Cycle append parquet files from the existing ones if less than the thread_count + if (thread_count > static_cast<int32_t>(parquet_files.size())) { + std::cout << "Warning: Number of input sources < thread count.
Cycling from\n" + "and appending to current input sources such that the number of\n" + "input sources == thread count\n"; + for (size_t idx = 0; thread_count > static_cast<int32_t>(parquet_files.size()); idx++) { + parquet_files.emplace_back(parquet_files[idx % initial_size]); + } + } + + // Vector of io sources + std::vector<io_source> input_sources; + input_sources.reserve(parquet_files.size()); + // Transform input files to the specified io sources + std::transform(parquet_files.begin(), + parquet_files.end(), + std::back_inserter(input_sources), + [&](auto const& file_name) { + return io_source{file_name, io_source_type, stream}; + }); + stream.synchronize(); + return input_sources; +} + +/** + * @brief The main function + */ +int32_t main(int argc, char const** argv) +{ + // Set arguments to defaults + std::string input_paths = "example.parquet"; + int32_t input_multiplier = 1; + int32_t num_reads = 1; + int32_t thread_count = 1; + io_source_type io_source_type = io_source_type::PINNED_BUFFER; + bool write_and_validate = false; + + // Set to the provided args + switch (argc) { + case 7: write_and_validate = get_boolean(argv[6]); [[fallthrough]]; + case 6: thread_count = std::max(thread_count, std::stoi(std::string{argv[5]})); [[fallthrough]]; + case 5: num_reads = std::max(1, std::stoi(argv[4])); [[fallthrough]]; + case 4: io_source_type = get_io_source_type(argv[3]); [[fallthrough]]; + case 3: + input_multiplier = std::max(input_multiplier, std::stoi(std::string{argv[2]})); + [[fallthrough]]; + case 2: + // Check if instead of input_paths, the first argument is `-h` or `--help` + if (auto arg = std::string{argv[1]}; arg != "-h" and arg != "--help") { + input_paths = std::move(arg); + break; + } + [[fallthrough]]; + default: print_usage(); throw std::runtime_error(""); + } + + // Initialize mr, default stream and stream pool + auto const is_pool_used = true; + auto resource = create_memory_resource(is_pool_used); + auto default_stream = cudf::get_default_stream(); + auto stream_pool = rmm::cuda_stream_pool(thread_count); + auto stats_mr = + rmm::mr::statistics_resource_adaptor<rmm::mr::device_memory_resource>(resource.get()); + rmm::mr::set_current_device_resource(&stats_mr); + + // List of input sources from the input_paths string. + auto const input_sources = extract_input_sources( + input_paths, input_multiplier, thread_count, io_source_type, default_stream); + + // Check if there is nothing to do + if (input_sources.empty()) { + print_usage(); + throw std::runtime_error("No input files to read.
Exiting early.\n"); + } + + // Read the same parquet files the specified number of times with multiple threads and discard the read tables + { + // Print status + std::cout << "\nReading " << input_sources.size() << " input sources " << num_reads + << " time(s) using " << thread_count + << " threads and discarding output " + "tables..\n"; + + if (io_source_type == io_source_type::FILEPATH) { + std::cout << "Note that the first read may include times for nvcomp, cufile loading and RMM " + "growth.\n\n"; + } + + cudf::examples::timer timer; + std::for_each(thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_reads), + [&](auto i) { // Read parquet files and discard the tables + std::ignore = read_parquet_multithreaded<read_mode::NO_CONCATENATE>( + input_sources, thread_count, stream_pool); + }); + default_stream.synchronize(); + timer.print_elapsed_millis(); + } + + // Write parquet files and validate if needed + if (write_and_validate) { + // read_mode::CONCATENATE_THREAD returns a vector of `thread_count` tables + auto const tables = read_parquet_multithreaded<read_mode::CONCATENATE_THREAD>( + input_sources, thread_count, stream_pool); + default_stream.synchronize(); + + // Construct a vector of table views for write_parquet_multithreaded + auto const table_views = [&tables]() { + std::vector<cudf::table_view> table_views; + table_views.reserve(tables.size()); + std::transform( + tables.cbegin(), tables.cend(), std::back_inserter(table_views), [](auto const& tbl) { + return tbl->view(); + }); + return table_views; + }(); + + // Write tables to parquet + std::cout << "Writing parquet output files..\n"; + + // Create a directory at the tmpdir path. + std::string output_path = + std::filesystem::temp_directory_path().string() + "/output_" + current_date_and_time(); + std::filesystem::create_directory({output_path}); + cudf::examples::timer timer; + write_parquet_multithreaded(output_path, table_views, thread_count, stream_pool); + default_stream.synchronize(); + timer.print_elapsed_millis(); + + // Verify the output + std::cout << "Verifying output..\n"; + + // Simply concatenate the previously read tables from input sources + auto const input_table = cudf::concatenate(table_views, default_stream); + + // Sources from written parquet files + auto const written_pq_sources = extract_input_sources( + output_path, input_multiplier, thread_count, io_source_type, default_stream); + + // read_mode::CONCATENATE_ALL returns a concatenated vector of 1 table only + auto const transcoded_table = std::move(read_parquet_multithreaded<read_mode::CONCATENATE_ALL>( + written_pq_sources, thread_count, stream_pool) + .back()); + default_stream.synchronize(); + + // Check if the tables are identical + check_tables_equal(input_table->view(), transcoded_table->view()); + + // Remove the created temp directory and parquet data + std::filesystem::remove_all(output_path); + } + + // Print peak memory + std::cout << "Peak memory: " << (stats_mr.get_bytes_counter().peak / 1048576.0) << " MB\n\n"; + + return 0; +} diff --git a/cpp/examples/versions.cmake b/cpp/examples/versions.cmake index 44493011673..51613090534 100644 --- a/cpp/examples/versions.cmake +++ b/cpp/examples/versions.cmake @@ -12,4 +12,4 @@ # the License.
# ============================================================================= -set(CUDF_TAG branch-24.10) +set(CUDF_TAG branch-24.12) diff --git a/cpp/include/cudf/ast/detail/expression_parser.hpp b/cpp/include/cudf/ast/detail/expression_parser.hpp index a254171ef11..b5973d0ace9 100644 --- a/cpp/include/cudf/ast/detail/expression_parser.hpp +++ b/cpp/include/cudf/ast/detail/expression_parser.hpp @@ -17,10 +17,10 @@ #include #include -#include #include #include #include +#include #include @@ -300,7 +300,7 @@ class expression_parser { * @return The indices of the operands stored in the data references. */ std::vector<cudf::size_type> visit_operands( - std::vector<std::reference_wrapper<expression const>> operands); + cudf::host_span<std::reference_wrapper<expression const> const> operands); /** * @brief Add a data reference to the internal list. diff --git a/cpp/include/cudf/ast/expressions.hpp b/cpp/include/cudf/ast/expressions.hpp index 4299ee5f20f..bcc9ad1b391 100644 --- a/cpp/include/cudf/ast/expressions.hpp +++ b/cpp/include/cudf/ast/expressions.hpp @@ -22,6 +22,8 @@ #include #include +#include +#include namespace CUDF_EXPORT cudf { namespace ast { @@ -478,7 +480,7 @@ class operation : public expression { * * @return Vector of operands */ - [[nodiscard]] std::vector<std::reference_wrapper<expression const>> get_operands() const + [[nodiscard]] std::vector<std::reference_wrapper<expression const>> const& get_operands() const { return operands; } @@ -506,8 +508,8 @@ class operation : public expression { }; private: - ast_operator const op; - std::vector<std::reference_wrapper<expression const>> const operands; + ast_operator op; + std::vector<std::reference_wrapper<expression const>> operands; }; /** @@ -552,6 +554,98 @@ class column_name_reference : public expression { std::string column_name; }; +/** + * @brief An AST expression tree. It owns and contains multiple dependent expressions. All the + * expressions are destroyed when the tree is destructed. + */ +class tree { + public: + /** + * @brief construct an empty ast tree + */ + tree() = default; + + /** + * @brief Moves the ast tree + */ + tree(tree&&) = default; + + /** + * @brief move-assigns the AST tree + * @returns a reference to the move-assigned tree + */ + tree& operator=(tree&&) = default; + + ~tree() = default; + + // the tree is not copyable + tree(tree const&) = delete; + tree& operator=(tree const&) = delete; + + /** + * @brief Add an expression to the AST tree + * @param args Arguments to use to construct the ast expression + * @returns a reference to the added expression + */ + template <typename Expr, typename... Args> + Expr const& emplace(Args&&... args) + { + static_assert(std::is_base_of_v<expression, Expr>); + auto expr = std::make_shared<Expr>(std::forward<Args>(args)...); + Expr const& expr_ref = *expr; + expressions.emplace_back(std::static_pointer_cast<expression>(std::move(expr))); + return expr_ref; + } + + /** + * @brief Add an expression to the AST tree + * @param expr AST expression to be added + * @returns a reference to the added expression + */ + template <typename Expr> + Expr const& push(Expr expr) + { + return emplace<Expr>(std::move(expr)); + } + + /** + * @brief get the first expression in the tree + * @returns the first inserted expression into the tree + */ + expression const& front() const { return *expressions.front(); } + + /** + * @brief get the last expression in the tree + * @returns the last inserted expression into the tree + */ + expression const& back() const { return *expressions.back(); } + + /** + * @brief get the number of expressions added to the tree + * @returns the number of expressions added to the tree + */ + size_t size() const { return expressions.size(); } + + /** + * @brief get the expression at an index in the tree. Index is checked.
+ * @param index index of expression in the ast tree + * @returns the expression at the specified index + */ + expression const& at(size_t index) { return *expressions.at(index); } + + /** + * @brief get the expression at an index in the tree. Index is unchecked. + * @param index index of expression in the ast tree + * @returns the expression at the specified index + */ + expression const& operator[](size_t index) const { return *expressions[index]; } + + private: + // TODO: use better ownership semantics, the shared_ptr here is redundant. Consider using a bump + // allocator with type-erased deleters. + std::vector<std::shared_ptr<expression>> expressions; +}; + /** @} */ // end of group } // namespace ast diff --git a/cpp/include/cudf/column/column_device_view.cuh b/cpp/include/cudf/column/column_device_view.cuh index c3238cb94fd..35a39ef9758 100644 --- a/cpp/include/cudf/column/column_device_view.cuh +++ b/cpp/include/cudf/column/column_device_view.cuh @@ -1425,13 +1425,13 @@ struct pair_rep_accessor { private: template , void>* = nullptr> - __device__ inline auto get_rep(cudf::size_type i) const + __device__ [[nodiscard]] inline auto get_rep(cudf::size_type i) const { return col.element(i); } template , void>* = nullptr> - __device__ inline auto get_rep(cudf::size_type i) const + __device__ [[nodiscard]] inline auto get_rep(cudf::size_type i) const { return col.element(i).value(); } diff --git a/cpp/include/cudf/column/column_factories.hpp b/cpp/include/cudf/column/column_factories.hpp index c3b68b52c36..e72661ce49a 100644 --- a/cpp/include/cudf/column/column_factories.hpp +++ b/cpp/include/cudf/column/column_factories.hpp @@ -24,8 +24,6 @@ #include -#include - namespace CUDF_EXPORT cudf { /** * @addtogroup column_factories @@ -378,6 +376,26 @@ std::unique_ptr<column> make_strings_column( rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); +/** + * @brief Construct a batch of STRING type columns given an array of device spans of pointer/size + * pairs. + * + * This function has input/output expectations similar to the `make_strings_column()` API that + * accepts only one device span of pointer/size pairs. The difference is that this one is designed + * to create many strings columns at once, minimizing the overhead of multiple kernel launches and + * stream synchronizations. + * + * @param input Array of device spans of pointer/size pairs, where each pointer is a device memory + * address or `nullptr` (indicating a null string), and size is string length (in bytes) + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used for memory allocation of the output columns + * @return Array of constructed strings columns + */ +std::vector<std::unique_ptr<column>> make_strings_column_batch( + std::vector<cudf::device_span<thrust::pair<char const*, size_type> const>> const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); + /** + * @brief Construct a STRING type column given a device span of string_view.
* diff --git a/cpp/include/cudf/column/column_view.hpp b/cpp/include/cudf/column/column_view.hpp index 3ef7bafe727..6db5c8b3c7b 100644 --- a/cpp/include/cudf/column/column_view.hpp +++ b/cpp/include/cudf/column/column_view.hpp @@ -16,7 +16,6 @@ #pragma once #include -#include #include #include #include @@ -235,7 +234,7 @@ class column_view_base { * * @return Typed pointer to underlying data */ - virtual void const* get_data() const noexcept { return _data; } + [[nodiscard]] virtual void const* get_data() const noexcept { return _data; } data_type _type{type_id::EMPTY}; ///< Element type size_type _size{}; ///< Number of elements @@ -695,7 +694,7 @@ class mutable_column_view : public detail::column_view_base { * * @return Typed pointer to underlying data */ - void const* get_data() const noexcept override; + [[nodiscard]] void const* get_data() const noexcept override; private: friend mutable_column_view bit_cast(mutable_column_view const& input, data_type type); diff --git a/cpp/include/cudf/datetime.hpp b/cpp/include/cudf/datetime.hpp index 7359a0d5fde..1f6e86d0389 100644 --- a/cpp/include/cudf/datetime.hpp +++ b/cpp/include/cudf/datetime.hpp @@ -38,10 +38,28 @@ namespace datetime { * @file */ +/** + * @brief Types of datetime components that may be extracted. + */ +enum class datetime_component : uint8_t { + YEAR, + MONTH, + DAY, + WEEKDAY, + HOUR, + MINUTE, + SECOND, + MILLISECOND, + MICROSECOND, + NANOSECOND +}; + /** * @brief Extracts year from any datetime type and returns an int16_t * cudf::column. * + * @deprecated Deprecated in 24.12, to be removed in 25.02 + * * @param column cudf::column_view of the input datetime values * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column @@ -49,7 +67,7 @@ namespace datetime { * @returns cudf::column of the extracted int16_t years * @throw cudf::logic_error if input column datatype is not TIMESTAMP */ -std::unique_ptr extract_year( +[[deprecated]] std::unique_ptr extract_year( cudf::column_view const& column, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); @@ -58,6 +76,8 @@ std::unique_ptr extract_year( * @brief Extracts month from any datetime type and returns an int16_t * cudf::column. * + * @deprecated Deprecated in 24.12, to be removed in 25.02 + * * @param column cudf::column_view of the input datetime values * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column @@ -65,7 +85,7 @@ std::unique_ptr extract_year( * @returns cudf::column of the extracted int16_t months * @throw cudf::logic_error if input column datatype is not TIMESTAMP */ -std::unique_ptr extract_month( +[[deprecated]] std::unique_ptr extract_month( cudf::column_view const& column, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); @@ -74,6 +94,8 @@ std::unique_ptr extract_month( * @brief Extracts day from any datetime type and returns an int16_t * cudf::column. 
* + * @deprecated Deprecated in 24.12, to be removed in 25.02 + * * @param column cudf::column_view of the input datetime values * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column @@ -81,7 +103,7 @@ std::unique_ptr extract_month( * @returns cudf::column of the extracted int16_t days * @throw cudf::logic_error if input column datatype is not TIMESTAMP */ -std::unique_ptr extract_day( +[[deprecated]] std::unique_ptr extract_day( cudf::column_view const& column, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); @@ -90,6 +112,8 @@ std::unique_ptr extract_day( * @brief Extracts a weekday from any datetime type and returns an int16_t * cudf::column. * + * @deprecated Deprecated in 24.12, to be removed in 25.02 + * * @param column cudf::column_view of the input datetime values * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column @@ -97,7 +121,7 @@ std::unique_ptr extract_day( * @returns cudf::column of the extracted int16_t days * @throw cudf::logic_error if input column datatype is not TIMESTAMP */ -std::unique_ptr extract_weekday( +[[deprecated]] std::unique_ptr extract_weekday( cudf::column_view const& column, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); @@ -106,6 +130,8 @@ std::unique_ptr extract_weekday( * @brief Extracts hour from any datetime type and returns an int16_t * cudf::column. * + * @deprecated Deprecated in 24.12, to be removed in 25.02 + * * @param column cudf::column_view of the input datetime values * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column @@ -113,7 +139,7 @@ std::unique_ptr extract_weekday( * @returns cudf::column of the extracted int16_t hours * @throw cudf::logic_error if input column datatype is not TIMESTAMP */ -std::unique_ptr extract_hour( +[[deprecated]] std::unique_ptr extract_hour( cudf::column_view const& column, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); @@ -122,6 +148,8 @@ std::unique_ptr extract_hour( * @brief Extracts minute from any datetime type and returns an int16_t * cudf::column. * + * @deprecated Deprecated in 24.12, to be removed in 25.02 + * * @param column cudf::column_view of the input datetime values * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column @@ -129,7 +157,7 @@ std::unique_ptr extract_hour( * @returns cudf::column of the extracted int16_t minutes * @throw cudf::logic_error if input column datatype is not TIMESTAMP */ -std::unique_ptr extract_minute( +[[deprecated]] std::unique_ptr extract_minute( cudf::column_view const& column, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); @@ -138,6 +166,8 @@ std::unique_ptr extract_minute( * @brief Extracts second from any datetime type and returns an int16_t * cudf::column. 
* + * @deprecated Deprecated in 24.12, to be removed in 25.02 + * * @param column cudf::column_view of the input datetime values * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column @@ -145,7 +175,7 @@ std::unique_ptr extract_minute( * @returns cudf::column of the extracted int16_t seconds * @throw cudf::logic_error if input column datatype is not TIMESTAMP */ -std::unique_ptr extract_second( +[[deprecated]] std::unique_ptr extract_second( cudf::column_view const& column, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); @@ -157,6 +187,8 @@ std::unique_ptr extract_second( * A millisecond fraction is only the 3 digits that make up the millisecond portion of a duration. * For example, the millisecond fraction of 1.234567890 seconds is 234. * + * @deprecated Deprecated in 24.12, to be removed in 25.02 + * * @param column cudf::column_view of the input datetime values * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column @@ -164,7 +196,7 @@ std::unique_ptr extract_second( * @returns cudf::column of the extracted int16_t milliseconds * @throw cudf::logic_error if input column datatype is not TIMESTAMP */ -std::unique_ptr extract_millisecond_fraction( +[[deprecated]] std::unique_ptr extract_millisecond_fraction( cudf::column_view const& column, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); @@ -176,6 +208,8 @@ std::unique_ptr extract_millisecond_fraction( * A microsecond fraction is only the 3 digits that make up the microsecond portion of a duration. * For example, the microsecond fraction of 1.234567890 seconds is 567. * + * @deprecated Deprecated in 24.12, to be removed in 25.02 + * * @param column cudf::column_view of the input datetime values * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column @@ -183,7 +217,7 @@ std::unique_ptr extract_millisecond_fraction( * @returns cudf::column of the extracted int16_t microseconds * @throw cudf::logic_error if input column datatype is not TIMESTAMP */ -std::unique_ptr extract_microsecond_fraction( +[[deprecated]] std::unique_ptr extract_microsecond_fraction( cudf::column_view const& column, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); @@ -195,6 +229,8 @@ std::unique_ptr extract_microsecond_fraction( * A nanosecond fraction is only the 3 digits that make up the nanosecond portion of a duration. * For example, the nanosecond fraction of 1.234567890 seconds is 890. 
* + * @deprecated Deprecated in 24.12, to be removed in 25.02 + * * @param column cudf::column_view of the input datetime values * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column @@ -202,8 +238,26 @@ std::unique_ptr extract_microsecond_fraction( * @returns cudf::column of the extracted int16_t nanoseconds * @throw cudf::logic_error if input column datatype is not TIMESTAMP */ -std::unique_ptr extract_nanosecond_fraction( +[[deprecated]] std::unique_ptr extract_nanosecond_fraction( + cudf::column_view const& column, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); + +/** + * @brief Extracts the specified datetime component from any datetime type and + * returns an int16_t cudf::column. + * + * @param column cudf::column_view of the input datetime values + * @param component The datetime component to extract + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate device memory of the returned column + * + * @returns cudf::column of the extracted int16_t datetime component + * @throw cudf::logic_error if input column datatype is not TIMESTAMP + */ +std::unique_ptr extract_datetime_component( cudf::column_view const& column, + datetime_component component, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); diff --git a/cpp/include/cudf/detail/aggregation/aggregation.cuh b/cpp/include/cudf/detail/aggregation/aggregation.cuh index ecf2f610697..de53e7586cd 100644 --- a/cpp/include/cudf/detail/aggregation/aggregation.cuh +++ b/cpp/include/cudf/detail/aggregation/aggregation.cuh @@ -18,11 +18,11 @@ #include #include +#include #include #include #include -#include -#include +#include #include #include @@ -30,8 +30,17 @@ #include +#include +#include + namespace cudf { namespace detail { +template +constexpr bool is_product_supported() +{ + return is_numeric(); +} + /** * @brief Maps an `aggregation::Kind` value to it's corresponding binary * operator. 
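Call sites of the deprecated per-component extractors migrate to the single `extract_datetime_component` entry point declared in this hunk; a minimal sketch:

```c++
#include <cudf/datetime.hpp>

#include <memory>

std::unique_ptr<cudf::column> get_hours(cudf::column_view const& timestamps)
{
  // Before (deprecated in 24.12, removal planned for 25.02):
  //   return cudf::datetime::extract_hour(timestamps);

  // After: one API, with the component selected via the new enum.
  return cudf::datetime::extract_datetime_component(timestamps,
                                                    cudf::datetime::datetime_component::HOUR);
}
```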
@@ -113,465 +122,6 @@ constexpr bool has_corresponding_operator() return !std::is_same_v::type, void>; } -template -struct update_target_element { - __device__ void operator()(mutable_column_device_view target, - size_type target_index, - column_device_view source, - size_type source_index) const noexcept - { - CUDF_UNREACHABLE("Invalid source type and aggregation combination."); - } -}; - -template -struct update_target_element< - Source, - aggregation::MIN, - target_has_nulls, - source_has_nulls, - std::enable_if_t() && cudf::has_atomic_support() && - !is_fixed_point()>> { - __device__ void operator()(mutable_column_device_view target, - size_type target_index, - column_device_view source, - size_type source_index) const noexcept - { - if (source_has_nulls and source.is_null(source_index)) { return; } - - using Target = target_type_t; - cudf::detail::atomic_min(&target.element(target_index), - static_cast(source.element(source_index))); - - if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } - } -}; - -template -struct update_target_element< - Source, - aggregation::MIN, - target_has_nulls, - source_has_nulls, - std::enable_if_t() && - cudf::has_atomic_support>()>> { - __device__ void operator()(mutable_column_device_view target, - size_type target_index, - column_device_view source, - size_type source_index) const noexcept - { - if (source_has_nulls and source.is_null(source_index)) { return; } - - using Target = target_type_t; - using DeviceTarget = device_storage_type_t; - using DeviceSource = device_storage_type_t; - - cudf::detail::atomic_min(&target.element(target_index), - static_cast(source.element(source_index))); - - if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } - } -}; - -template -struct update_target_element< - Source, - aggregation::MAX, - target_has_nulls, - source_has_nulls, - std::enable_if_t() && cudf::has_atomic_support() && - !is_fixed_point()>> { - __device__ void operator()(mutable_column_device_view target, - size_type target_index, - column_device_view source, - size_type source_index) const noexcept - { - if (source_has_nulls and source.is_null(source_index)) { return; } - - using Target = target_type_t; - cudf::detail::atomic_max(&target.element(target_index), - static_cast(source.element(source_index))); - - if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } - } -}; - -template -struct update_target_element< - Source, - aggregation::MAX, - target_has_nulls, - source_has_nulls, - std::enable_if_t() && - cudf::has_atomic_support>()>> { - __device__ void operator()(mutable_column_device_view target, - size_type target_index, - column_device_view source, - size_type source_index) const noexcept - { - if (source_has_nulls and source.is_null(source_index)) { return; } - - using Target = target_type_t; - using DeviceTarget = device_storage_type_t; - using DeviceSource = device_storage_type_t; - - cudf::detail::atomic_max(&target.element(target_index), - static_cast(source.element(source_index))); - - if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } - } -}; - -template -struct update_target_element< - Source, - aggregation::SUM, - target_has_nulls, - source_has_nulls, - std::enable_if_t() && cudf::has_atomic_support() && - !cudf::is_fixed_point() && !cudf::is_timestamp()>> { - __device__ void operator()(mutable_column_device_view target, - size_type target_index, - column_device_view source, - size_type 
source_index) const noexcept - { - if (source_has_nulls and source.is_null(source_index)) { return; } - - using Target = target_type_t; - cudf::detail::atomic_add(&target.element(target_index), - static_cast(source.element(source_index))); - - if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } - } -}; - -template -struct update_target_element< - Source, - aggregation::SUM, - target_has_nulls, - source_has_nulls, - std::enable_if_t() && - cudf::has_atomic_support>()>> { - __device__ void operator()(mutable_column_device_view target, - size_type target_index, - column_device_view source, - size_type source_index) const noexcept - { - if (source_has_nulls and source.is_null(source_index)) { return; } - - using Target = target_type_t; - using DeviceTarget = device_storage_type_t; - using DeviceSource = device_storage_type_t; - - cudf::detail::atomic_add(&target.element(target_index), - static_cast(source.element(source_index))); - - if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } - } -}; - -/** - * @brief Function object to update a single element in a target column using - * the dictionary key addressed by the specific index. - * - * SFINAE is used to prevent recursion for dictionary type. Dictionary keys cannot be a - * dictionary. - * - */ -template -struct update_target_from_dictionary { - template ()>* = nullptr> - __device__ void operator()(mutable_column_device_view target, - size_type target_index, - column_device_view source, - size_type source_index) const noexcept - { - update_target_element{}( - target, target_index, source, source_index); - } - template ()>* = nullptr> - __device__ void operator()(mutable_column_device_view target, - size_type target_index, - column_device_view source, - size_type source_index) const noexcept - { - } -}; - -/** - * @brief Specialization function for dictionary type and aggregations. - * - * The `source` column is a dictionary type. This functor de-references the - * dictionary's keys child column and maps the input source index through - * the dictionary's indices child column to pass to the `update_target_element` - * in the above `update_target_from_dictionary` using the type-dispatcher to - * resolve the keys column type. - * - * `update_target_element( target, target_index, source.keys(), source.indices()[source_index] )` - * - * @tparam target_has_nulls Indicates presence of null elements in `target` - * @tparam source_has_nulls Indicates presence of null elements in `source`. 
- */ -template -struct update_target_element< - dictionary32, - k, - target_has_nulls, - source_has_nulls, - std::enable_if_t> { - __device__ void operator()(mutable_column_device_view target, - size_type target_index, - column_device_view source, - size_type source_index) const noexcept - { - if (source_has_nulls and source.is_null(source_index)) { return; } - - dispatch_type_and_aggregation( - source.child(cudf::dictionary_column_view::keys_column_index).type(), - k, - update_target_from_dictionary{}, - target, - target_index, - source.child(cudf::dictionary_column_view::keys_column_index), - static_cast(source.element(source_index))); - } -}; - -template -constexpr bool is_product_supported() -{ - return is_numeric(); -} - -template -struct update_target_element()>> { - __device__ void operator()(mutable_column_device_view target, - size_type target_index, - column_device_view source, - size_type source_index) const noexcept - { - if (source_has_nulls and source.is_null(source_index)) { return; } - - using Target = target_type_t; - auto value = static_cast(source.element(source_index)); - cudf::detail::atomic_add(&target.element(target_index), value * value); - if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } - } -}; - -template -struct update_target_element()>> { - __device__ void operator()(mutable_column_device_view target, - size_type target_index, - column_device_view source, - size_type source_index) const noexcept - { - if (source_has_nulls and source.is_null(source_index)) { return; } - - using Target = target_type_t; - cudf::detail::atomic_mul(&target.element(target_index), - static_cast(source.element(source_index))); - if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } - } -}; - -template -struct update_target_element< - Source, - aggregation::COUNT_VALID, - target_has_nulls, - source_has_nulls, - std::enable_if_t()>> { - __device__ void operator()(mutable_column_device_view target, - size_type target_index, - column_device_view source, - size_type source_index) const noexcept - { - if (source_has_nulls and source.is_null(source_index)) { return; } - - using Target = target_type_t; - cudf::detail::atomic_add(&target.element(target_index), Target{1}); - - // It is assumed the output for COUNT_VALID is initialized to be all valid - } -}; - -template -struct update_target_element< - Source, - aggregation::COUNT_ALL, - target_has_nulls, - source_has_nulls, - std::enable_if_t()>> { - __device__ void operator()(mutable_column_device_view target, - size_type target_index, - column_device_view source, - size_type source_index) const noexcept - { - using Target = target_type_t; - cudf::detail::atomic_add(&target.element(target_index), Target{1}); - - // It is assumed the output for COUNT_ALL is initialized to be all valid - } -}; - -template -struct update_target_element< - Source, - aggregation::ARGMAX, - target_has_nulls, - source_has_nulls, - std::enable_if_t() and - cudf::is_relationally_comparable()>> { - __device__ void operator()(mutable_column_device_view target, - size_type target_index, - column_device_view source, - size_type source_index) const noexcept - { - if (source_has_nulls and source.is_null(source_index)) { return; } - - using Target = target_type_t; - auto old = cudf::detail::atomic_cas( - &target.element(target_index), ARGMAX_SENTINEL, source_index); - if (old != ARGMAX_SENTINEL) { - while (source.element(source_index) > source.element(old)) { - old = 
cudf::detail::atomic_cas(&target.element(target_index), old, source_index); - } - } - - if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } - } -}; - -template -struct update_target_element< - Source, - aggregation::ARGMIN, - target_has_nulls, - source_has_nulls, - std::enable_if_t() and - cudf::is_relationally_comparable()>> { - __device__ void operator()(mutable_column_device_view target, - size_type target_index, - column_device_view source, - size_type source_index) const noexcept - { - if (source_has_nulls and source.is_null(source_index)) { return; } - - using Target = target_type_t; - auto old = cudf::detail::atomic_cas( - &target.element(target_index), ARGMIN_SENTINEL, source_index); - if (old != ARGMIN_SENTINEL) { - while (source.element(source_index) < source.element(old)) { - old = cudf::detail::atomic_cas(&target.element(target_index), old, source_index); - } - } - - if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } - } -}; - -/** - * @brief Function object to update a single element in a target column by - * performing an aggregation operation with a single element from a source - * column. - * - * @tparam target_has_nulls Indicates presence of null elements in `target` - * @tparam source_has_nulls Indicates presence of null elements in `source`. - */ -template -struct elementwise_aggregator { - template - __device__ void operator()(mutable_column_device_view target, - size_type target_index, - column_device_view source, - size_type source_index) const noexcept - { - update_target_element{}( - target, target_index, source, source_index); - } -}; - -/** - * @brief Updates a row in `target` by performing elementwise aggregation - * operations with a row in `source`. - * - * For the row in `target` specified by `target_index`, each element at `i` is - * updated by: - * ```c++ - * target_row[i] = aggs[i](target_row[i], source_row[i]) - * ``` - * - * This function only supports aggregations that can be done in a "single pass", - * i.e., given an initial value `R`, the aggregation `op` can be computed on a series - * of elements `e[i] for i in [0,n)` by computing `R = op(e[i],R)` for any order - * of the values of `i`. - * - * The initial value and validity of `R` depends on the aggregation: - * SUM: 0 and NULL - * MIN: Max value of type and NULL - * MAX: Min value of type and NULL - * COUNT_VALID: 0 and VALID - * COUNT_ALL: 0 and VALID - * ARGMAX: `ARGMAX_SENTINEL` and NULL - * ARGMIN: `ARGMIN_SENTINEL` and NULL - * - * It is required that the elements of `target` be initialized with the corresponding - * initial values and validity specified above. - * - * Handling of null elements in both `source` and `target` depends on the aggregation: - * SUM, MIN, MAX, ARGMIN, ARGMAX: - * - `source`: Skipped - * - `target`: Updated from null to valid upon first successful aggregation - * COUNT_VALID, COUNT_ALL: - * - `source`: Skipped - * - `target`: Cannot be null - * - * @param target Table containing the row to update - * @param target_index Index of the row to update in `target` - * @param source Table containing the row used to update the row in `target`. - * The invariant `source.num_columns() >= target.num_columns()` must hold. - * @param source_index Index of the row to use in `source` - * @param aggs Array of aggregations to perform between elements of the `target` - * and `source` rows. Must contain at least `target.num_columns()` valid - * `aggregation::Kind` values. 
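The "single pass" property described above — start each output slot at the aggregation's identity, then fold elements in any order — is what makes the atomics-based updates valid. A plain host-side analogue (illustrative only, not the device code):

```c++
#include <algorithm>
#include <limits>
#include <vector>

// MIN's identity is the maximum value of the type, matching the table above.
// Because min is commutative and associative, the fold order does not matter.
int single_pass_min(std::vector<int> const& values)
{
  int target = std::numeric_limits<int>::max();  // identity for MIN
  for (int v : values) { target = std::min(target, v); }
  return target;
}
```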
- */
-template <bool target_has_nulls, bool source_has_nulls>
-__device__ inline void aggregate_row(mutable_table_device_view target,
-                                     size_type target_index,
-                                     table_device_view source,
-                                     size_type source_index,
-                                     aggregation::Kind const* aggs)
-{
-  for (auto i = 0; i < target.num_columns(); ++i) {
-    dispatch_type_and_aggregation(source.column(i).type(),
-                                  aggs[i],
-                                  elementwise_aggregator<target_has_nulls, source_has_nulls>{},
-                                  target.column(i),
-                                  target_index,
-                                  source.column(i),
-                                  source_index);
-  }
-}
-
 /**
  * @brief Dispatched functor to initialize a column with the identity of an
  * aggregation operation.
diff --git a/cpp/include/cudf/detail/aggregation/aggregation.hpp b/cpp/include/cudf/detail/aggregation/aggregation.hpp
index 4255faea702..6661a461b8b 100644
--- a/cpp/include/cudf/detail/aggregation/aggregation.hpp
+++ b/cpp/include/cudf/detail/aggregation/aggregation.hpp
@@ -683,7 +683,7 @@ class ewma_aggregation final : public scan_aggregation {
   {
   }

-  std::unique_ptr<aggregation> clone() const override
+  [[nodiscard]] std::unique_ptr<aggregation> clone() const override
   {
     return std::make_unique<ewma_aggregation>(*this);
   }
@@ -694,7 +694,7 @@ class ewma_aggregation final : public scan_aggregation {
     return collector.visit(col_type, *this);
   }

-  bool is_equal(aggregation const& _other) const override
+  [[nodiscard]] bool is_equal(aggregation const& _other) const override
   {
     if (!this->aggregation::is_equal(_other)) { return false; }
     auto const& other = dynamic_cast<ewma_aggregation const&>(_other);
diff --git a/cpp/include/cudf/detail/aggregation/device_aggregators.cuh b/cpp/include/cudf/detail/aggregation/device_aggregators.cuh
new file mode 100644
index 00000000000..204eee49a2a
--- /dev/null
+++ b/cpp/include/cudf/detail/aggregation/device_aggregators.cuh
@@ -0,0 +1,440 @@
+/*
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace cudf::detail { +/// Checks if an aggregation kind needs to operate on the underlying storage type +template +__device__ constexpr bool uses_underlying_type() +{ + return k == aggregation::MIN or k == aggregation::MAX or k == aggregation::SUM; +} + +/// Gets the underlying target type for the given source type and aggregation kind +template +using underlying_target_t = + cuda::std::conditional_t(), + cudf::device_storage_type_t>, + cudf::detail::target_type_t>; + +/// Gets the underlying source type for the given source type and aggregation kind +template +using underlying_source_t = + cuda::std::conditional_t(), cudf::device_storage_type_t, Source>; + +template +struct update_target_element { + __device__ void operator()(mutable_column_device_view, + size_type, + column_device_view, + size_type) const noexcept + { + CUDF_UNREACHABLE("Invalid source type and aggregation combination."); + } +}; + +template +struct update_target_element< + Source, + aggregation::MIN, + cuda::std::enable_if_t() && cudf::has_atomic_support() && + !is_fixed_point()>> { + __device__ void operator()(mutable_column_device_view target, + size_type target_index, + column_device_view source, + size_type source_index) const noexcept + { + using Target = target_type_t; + cudf::detail::atomic_min(&target.element(target_index), + static_cast(source.element(source_index))); + + if (target.is_null(target_index)) { target.set_valid(target_index); } + } +}; + +template +struct update_target_element< + Source, + aggregation::MIN, + cuda::std::enable_if_t() && + cudf::has_atomic_support>()>> { + __device__ void operator()(mutable_column_device_view target, + size_type target_index, + column_device_view source, + size_type source_index) const noexcept + { + using Target = target_type_t; + using DeviceTarget = device_storage_type_t; + using DeviceSource = device_storage_type_t; + + cudf::detail::atomic_min(&target.element(target_index), + static_cast(source.element(source_index))); + + if (target.is_null(target_index)) { target.set_valid(target_index); } + } +}; + +template +struct update_target_element< + Source, + aggregation::MAX, + cuda::std::enable_if_t() && cudf::has_atomic_support() && + !is_fixed_point()>> { + __device__ void operator()(mutable_column_device_view target, + size_type target_index, + column_device_view source, + size_type source_index) const noexcept + { + using Target = target_type_t; + cudf::detail::atomic_max(&target.element(target_index), + static_cast(source.element(source_index))); + + if (target.is_null(target_index)) { target.set_valid(target_index); } + } +}; + +template +struct update_target_element< + Source, + aggregation::MAX, + cuda::std::enable_if_t() && + cudf::has_atomic_support>()>> { + __device__ void operator()(mutable_column_device_view target, + size_type target_index, + column_device_view source, + size_type source_index) const noexcept + { + using Target = target_type_t; + using DeviceTarget = device_storage_type_t; + using DeviceSource = device_storage_type_t; + + cudf::detail::atomic_max(&target.element(target_index), + static_cast(source.element(source_index))); + + if (target.is_null(target_index)) { target.set_valid(target_index); } + } +}; + +template +struct update_target_element< + Source, + aggregation::SUM, + cuda::std::enable_if_t() && cudf::has_atomic_support() && + !cudf::is_fixed_point() && !cudf::is_timestamp()>> { + __device__ void 
operator()(mutable_column_device_view target, + size_type target_index, + column_device_view source, + size_type source_index) const noexcept + { + using Target = target_type_t; + cudf::detail::atomic_add(&target.element(target_index), + static_cast(source.element(source_index))); + + if (target.is_null(target_index)) { target.set_valid(target_index); } + } +}; + +template +struct update_target_element< + Source, + aggregation::SUM, + cuda::std::enable_if_t() && + cudf::has_atomic_support>()>> { + __device__ void operator()(mutable_column_device_view target, + size_type target_index, + column_device_view source, + size_type source_index) const noexcept + { + using Target = target_type_t; + using DeviceTarget = device_storage_type_t; + using DeviceSource = device_storage_type_t; + + cudf::detail::atomic_add(&target.element(target_index), + static_cast(source.element(source_index))); + + if (target.is_null(target_index)) { target.set_valid(target_index); } + } +}; + +/** + * @brief Function object to update a single element in a target column using + * the dictionary key addressed by the specific index. + * + * SFINAE is used to prevent recursion for dictionary type. Dictionary keys cannot be a + * dictionary. + * + */ +struct update_target_from_dictionary { + template ()>* = nullptr> + __device__ void operator()(mutable_column_device_view target, + size_type target_index, + column_device_view source, + size_type source_index) const noexcept + { + update_target_element{}(target, target_index, source, source_index); + } + template ()>* = nullptr> + __device__ void operator()(mutable_column_device_view, + size_type, + column_device_view, + size_type) const noexcept + { + } +}; + +/** + * @brief Specialization function for dictionary type and aggregations. + * + * The `source` column is a dictionary type. This functor de-references the + * dictionary's keys child column and maps the input source index through + * the dictionary's indices child column to pass to the `update_target_element` + * in the above `update_target_from_dictionary` using the type-dispatcher to + * resolve the keys column type. 
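What the storage-type helpers at the top of this new header resolve to can be checked at compile time. A sketch, assuming cudf's usual mappings (decimal32 stored as int32_t) and CUDA compilation with relaxed constexpr, as cudf itself uses:

```c++
#include <cudf/detail/aggregation/device_aggregators.cuh>
#include <cudf/fixed_point/fixed_point.hpp>

#include <cuda/std/type_traits>

#include <cstdint>

// MIN satisfies uses_underlying_type(), so fixed_point is unwrapped to its rep type
// on both the target and source sides.
static_assert(cuda::std::is_same_v<
              cudf::detail::underlying_target_t<numeric::decimal32, cudf::aggregation::MIN>,
              int32_t>);
static_assert(cuda::std::is_same_v<
              cudf::detail::underlying_source_t<numeric::decimal32, cudf::aggregation::MIN>,
              int32_t>);
```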
+ * + * `update_target_element( target, target_index, source.keys(), source.indices()[source_index] )` + */ +template +struct update_target_element< + dictionary32, + k, + cuda::std::enable_if_t> { + __device__ void operator()(mutable_column_device_view target, + size_type target_index, + column_device_view source, + size_type source_index) const noexcept + { + dispatch_type_and_aggregation( + source.child(cudf::dictionary_column_view::keys_column_index).type(), + k, + update_target_from_dictionary{}, + target, + target_index, + source.child(cudf::dictionary_column_view::keys_column_index), + static_cast(source.element(source_index))); + } +}; + +template +struct update_target_element()>> { + __device__ void operator()(mutable_column_device_view target, + size_type target_index, + column_device_view source, + size_type source_index) const noexcept + { + using Target = target_type_t; + auto value = static_cast(source.element(source_index)); + cudf::detail::atomic_add(&target.element(target_index), value * value); + if (target.is_null(target_index)) { target.set_valid(target_index); } + } +}; + +template +struct update_target_element()>> { + __device__ void operator()(mutable_column_device_view target, + size_type target_index, + column_device_view source, + size_type source_index) const noexcept + { + using Target = target_type_t; + cudf::detail::atomic_mul(&target.element(target_index), + static_cast(source.element(source_index))); + if (target.is_null(target_index)) { target.set_valid(target_index); } + } +}; + +template +struct update_target_element< + Source, + aggregation::COUNT_VALID, + cuda::std::enable_if_t()>> { + __device__ void operator()(mutable_column_device_view target, + size_type target_index, + column_device_view source, + size_type source_index) const noexcept + { + using Target = target_type_t; + cudf::detail::atomic_add(&target.element(target_index), Target{1}); + + // It is assumed the output for COUNT_VALID is initialized to be all valid + } +}; + +template +struct update_target_element< + Source, + aggregation::COUNT_ALL, + cuda::std::enable_if_t()>> { + __device__ void operator()(mutable_column_device_view target, + size_type target_index, + column_device_view source, + size_type source_index) const noexcept + { + using Target = target_type_t; + cudf::detail::atomic_add(&target.element(target_index), Target{1}); + + // It is assumed the output for COUNT_ALL is initialized to be all valid + } +}; + +template +struct update_target_element< + Source, + aggregation::ARGMAX, + cuda::std::enable_if_t() and + cudf::is_relationally_comparable()>> { + __device__ void operator()(mutable_column_device_view target, + size_type target_index, + column_device_view source, + size_type source_index) const noexcept + { + using Target = target_type_t; + auto old = cudf::detail::atomic_cas( + &target.element(target_index), ARGMAX_SENTINEL, source_index); + if (old != ARGMAX_SENTINEL) { + while (source.element(source_index) > source.element(old)) { + old = cudf::detail::atomic_cas(&target.element(target_index), old, source_index); + } + } + + if (target.is_null(target_index)) { target.set_valid(target_index); } + } +}; + +template +struct update_target_element< + Source, + aggregation::ARGMIN, + cuda::std::enable_if_t() and + cudf::is_relationally_comparable()>> { + __device__ void operator()(mutable_column_device_view target, + size_type target_index, + column_device_view source, + size_type source_index) const noexcept + { + using Target = target_type_t; + auto old = 
cudf::detail::atomic_cas( + &target.element(target_index), ARGMIN_SENTINEL, source_index); + if (old != ARGMIN_SENTINEL) { + while (source.element(source_index) < source.element(old)) { + old = cudf::detail::atomic_cas(&target.element(target_index), old, source_index); + } + } + + if (target.is_null(target_index)) { target.set_valid(target_index); } + } +}; + +/** + * @brief Function object to update a single element in a target column by + * performing an aggregation operation with a single element from a source + * column. + */ +struct elementwise_aggregator { + template + __device__ void operator()(mutable_column_device_view target, + size_type target_index, + column_device_view source, + size_type source_index) const noexcept + { + if constexpr (k != cudf::aggregation::COUNT_ALL) { + if (source.is_null(source_index)) { return; } + } + update_target_element{}(target, target_index, source, source_index); + } +}; + +/** + * @brief Updates a row in `target` by performing elementwise aggregation + * operations with a row in `source`. + * + * For the row in `target` specified by `target_index`, each element at `i` is + * updated by: + * ```c++ + * target_row[i] = aggs[i](target_row[i], source_row[i]) + * ``` + * + * This function only supports aggregations that can be done in a "single pass", + * i.e., given an initial value `R`, the aggregation `op` can be computed on a series + * of elements `e[i] for i in [0,n)` by computing `R = op(e[i],R)` for any order + * of the values of `i`. + * + * The initial value and validity of `R` depends on the aggregation: + * SUM: 0 and NULL + * MIN: Max value of type and NULL + * MAX: Min value of type and NULL + * COUNT_VALID: 0 and VALID + * COUNT_ALL: 0 and VALID + * ARGMAX: `ARGMAX_SENTINEL` and NULL + * ARGMIN: `ARGMIN_SENTINEL` and NULL + * + * It is required that the elements of `target` be initialized with the corresponding + * initial values and validity specified above. + * + * Handling of null elements in both `source` and `target` depends on the aggregation: + * SUM, MIN, MAX, ARGMIN, ARGMAX: + * - `source`: Skipped + * - `target`: Updated from null to valid upon first successful aggregation + * COUNT_VALID, COUNT_ALL: + * - `source`: Skipped + * - `target`: Cannot be null + * + * @param target Table containing the row to update + * @param target_index Index of the row to update in `target` + * @param source Table containing the row used to update the row in `target`. + * The invariant `source.num_columns() >= target.num_columns()` must hold. + * @param source_index Index of the row to use in `source` + * @param aggs Array of aggregations to perform between elements of the `target` + * and `source` rows. Must contain at least `target.num_columns()` valid + * `aggregation::Kind` values. 
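The ARGMIN/ARGMAX specializations above publish the winning row index through a lock-free compare-and-swap loop. A standalone CUDA sketch of the same pattern over a plain array (names and the sentinel are illustrative):

```c++
// First writer claims the slot by swapping its index in over the sentinel; later
// writers retry only while their value still beats the current winner's value.
__device__ void argmax_update(int* target_idx, float const* values, int my_idx, int sentinel)
{
  int old = atomicCAS(target_idx, sentinel, my_idx);
  if (old != sentinel) {
    while (values[my_idx] > values[old]) {
      old = atomicCAS(target_idx, old, my_idx);
    }
  }
}
```

The loop terminates because a successful swap makes the next comparison read the caller's own value, while a failed swap refreshes `old` so the condition is re-evaluated against the new winner.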
+ */ +__device__ inline void aggregate_row(mutable_table_device_view target, + size_type target_index, + table_device_view source, + size_type source_index, + aggregation::Kind const* aggs) +{ + for (auto i = 0; i < target.num_columns(); ++i) { + dispatch_type_and_aggregation(source.column(i).type(), + aggs[i], + elementwise_aggregator{}, + target.column(i), + target_index, + source.column(i), + source_index); + } +} +} // namespace cudf::detail diff --git a/cpp/include/cudf/detail/aggregation/result_cache.hpp b/cpp/include/cudf/detail/aggregation/result_cache.hpp index ec5a511bb7c..486808ebe18 100644 --- a/cpp/include/cudf/detail/aggregation/result_cache.hpp +++ b/cpp/include/cudf/detail/aggregation/result_cache.hpp @@ -19,7 +19,6 @@ #include #include #include -#include #include diff --git a/cpp/include/cudf/detail/copy_if.cuh b/cpp/include/cudf/detail/copy_if.cuh index dfb646c66c4..4159e324472 100644 --- a/cpp/include/cudf/detail/copy_if.cuh +++ b/cpp/include/cudf/detail/copy_if.cuh @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -36,7 +37,6 @@ #include #include -#include #include #include @@ -256,7 +256,7 @@ struct scatter_gather_functor { cudf::detail::grid_1d grid{input.size(), block_size, per_thread}; - rmm::device_scalar null_count{0, stream}; + cudf::detail::device_scalar null_count{0, stream}; if (output.nullable()) { // Have to initialize the output mask to all zeros because we may update // it with atomicOr(). diff --git a/cpp/include/cudf/detail/copy_if_else.cuh b/cpp/include/cudf/detail/copy_if_else.cuh index a70cd5a0661..5dc75b1a3fb 100644 --- a/cpp/include/cudf/detail/copy_if_else.cuh +++ b/cpp/include/cudf/detail/copy_if_else.cuh @@ -19,12 +19,11 @@ #include #include #include +#include #include #include #include -#include - #include #include @@ -171,7 +170,7 @@ std::unique_ptr copy_if_else(bool nullable, // if we have validity in the output if (nullable) { - rmm::device_scalar valid_count{0, stream}; + cudf::detail::device_scalar valid_count{0, stream}; // call the kernel copy_if_else_kernel diff --git a/cpp/include/cudf/detail/copy_range.cuh b/cpp/include/cudf/detail/copy_range.cuh index 3aa136d630b..fcb80fe45f7 100644 --- a/cpp/include/cudf/detail/copy_range.cuh +++ b/cpp/include/cudf/detail/copy_range.cuh @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -27,7 +28,6 @@ #include #include -#include #include #include @@ -154,7 +154,7 @@ void copy_range(SourceValueIterator source_value_begin, auto grid = cudf::detail::grid_1d{num_items, block_size, 1}; if (target.nullable()) { - rmm::device_scalar null_count(target.null_count(), stream); + cudf::detail::device_scalar null_count(target.null_count(), stream); auto kernel = copy_range_kernel; diff --git a/cpp/include/cudf/detail/datetime.hpp b/cpp/include/cudf/detail/datetime.hpp index 9db7e48498f..df3050d6494 100644 --- a/cpp/include/cudf/detail/datetime.hpp +++ b/cpp/include/cudf/detail/datetime.hpp @@ -115,6 +115,16 @@ std::unique_ptr extract_nanosecond_fraction(cudf::column_view cons rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); +/** + * @copydoc cudf::extract_datetime_component(cudf::column_view const&, datetime_component, + * rmm::cuda_stream_view, rmm::device_async_resource_ref) + * + */ +std::unique_ptr extract_datetime_component(cudf::column_view const& column, + datetime_component component, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); + /** * @copydoc cudf::last_day_of_month(cudf::column_view const&, 
rmm::cuda_stream_view, * rmm::device_async_resource_ref) diff --git a/cpp/include/cudf/detail/device_scalar.hpp b/cpp/include/cudf/detail/device_scalar.hpp new file mode 100644 index 00000000000..16ca06c6561 --- /dev/null +++ b/cpp/include/cudf/detail/device_scalar.hpp @@ -0,0 +1,103 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +#include +#include +#include + +namespace CUDF_EXPORT cudf { +namespace detail { + +template +class device_scalar : public rmm::device_scalar { + public: +#ifdef __CUDACC__ +#pragma nv_exec_check_disable +#endif + ~device_scalar() = default; + +// Implementation is the same as what compiler should generate +// Could not use default move constructor as 11.8 compiler fails to generate it +#ifdef __CUDACC__ +#pragma nv_exec_check_disable +#endif + device_scalar(device_scalar&& other) noexcept + : rmm::device_scalar{std::move(other)}, bounce_buffer{std::move(other.bounce_buffer)} + { + } + device_scalar& operator=(device_scalar&&) noexcept = default; + + device_scalar(device_scalar const&) = delete; + device_scalar& operator=(device_scalar const&) = delete; + + device_scalar() = delete; + + explicit device_scalar( + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()) + : rmm::device_scalar(stream, mr), bounce_buffer{make_host_vector(1, stream)} + { + } + + explicit device_scalar( + T const& initial_value, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()) + : rmm::device_scalar(stream, mr), bounce_buffer{make_host_vector(1, stream)} + { + bounce_buffer[0] = initial_value; + cuda_memcpy_async(device_span{this->data(), 1}, bounce_buffer, stream); + } + + device_scalar(device_scalar const& other, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()) + : rmm::device_scalar(other, stream, mr), bounce_buffer{make_host_vector(1, stream)} + { + } + + [[nodiscard]] T value(rmm::cuda_stream_view stream) const + { + cuda_memcpy(bounce_buffer, device_span{this->data(), 1}, stream); + return bounce_buffer[0]; + } + + void set_value_async(T const& value, rmm::cuda_stream_view stream) + { + bounce_buffer[0] = value; + cuda_memcpy_async(device_span{this->data(), 1}, bounce_buffer, stream); + } + + void set_value_async(T&& value, rmm::cuda_stream_view stream) + { + bounce_buffer[0] = std::move(value); + cuda_memcpy_async(device_span{this->data(), 1}, bounce_buffer, stream); + } + + void set_value_to_zero_async(rmm::cuda_stream_view stream) { set_value_async(T{}, stream); } + + private: + mutable cudf::detail::host_vector bounce_buffer; +}; + +} // namespace detail +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/detail/groupby/sort_helper.hpp b/cpp/include/cudf/detail/groupby/sort_helper.hpp index ce8783d8b79..d7a42d0eca5 100644 --- a/cpp/include/cudf/detail/groupby/sort_helper.hpp +++ 
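The class just defined keeps a pinned bounce buffer alongside the `rmm::device_scalar` so that `value()` and `set_value_async()` never touch pageable host memory. A short usage sketch (the wrapper function is illustrative):

```c++
#include <cudf/detail/device_scalar.hpp>
#include <cudf/types.hpp>

#include <rmm/cuda_stream_view.hpp>

cudf::size_type read_counter(rmm::cuda_stream_view stream)
{
  // The initial value is staged in the pinned buffer and copied H2D asynchronously.
  cudf::detail::device_scalar<cudf::size_type> counter{0, stream};

  // ... launch kernels on `stream` that atomically update *counter.data() ...

  // D2H read lands in the pinned buffer; value() synchronizes before returning.
  return counter.value(stream);
}
```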
b/cpp/include/cudf/detail/groupby/sort_helper.hpp @@ -211,7 +211,6 @@ struct sort_groupby_helper { */ column_view keys_bitmask_column(rmm::cuda_stream_view stream); - private: column_ptr _key_sorted_order; ///< Indices to produce _keys in sorted order column_ptr _unsorted_keys_labels; ///< Group labels for unsorted _keys column_ptr _keys_bitmask_column; ///< Column representing rows with one or more nulls values diff --git a/cpp/include/cudf/detail/hash_reduce_by_row.cuh b/cpp/include/cudf/detail/hash_reduce_by_row.cuh deleted file mode 100644 index 7de79b31bc7..00000000000 --- a/cpp/include/cudf/detail/hash_reduce_by_row.cuh +++ /dev/null @@ -1,169 +0,0 @@ -/* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include -#include - -#include -#include -#include - -#include -#include -#include -#include - -namespace cudf::detail { - -using hash_map_type = cuco::legacy:: - static_map>; - -/** - * @brief The base struct for customized reduction functor to perform reduce-by-key with keys are - * rows that compared equal. - * - * TODO: We need to switch to use `static_reduction_map` when it is ready - * (https://github.com/NVIDIA/cuCollections/pull/98). - */ -template -struct reduce_by_row_fn_base { - protected: - MapView const d_map; - KeyHasher const d_hasher; - KeyEqual const d_equal; - OutputType* const d_output; - - reduce_by_row_fn_base(MapView const& d_map, - KeyHasher const& d_hasher, - KeyEqual const& d_equal, - OutputType* const d_output) - : d_map{d_map}, d_hasher{d_hasher}, d_equal{d_equal}, d_output{d_output} - { - } - - /** - * @brief Return a pointer to the output array at the given index. - * - * @param idx The access index - * @return A pointer to the given index in the output array - */ - __device__ OutputType* get_output_ptr(size_type const idx) const - { - auto const iter = d_map.find(idx, d_hasher, d_equal); - - if (iter != d_map.end()) { - // Only one (undetermined) index value of the duplicate rows could be inserted into the map. - // As such, looking up for all indices of duplicate rows always returns the same value. - auto const inserted_idx = iter->second.load(cuda::std::memory_order_relaxed); - - // All duplicate rows will have concurrent access to this same output slot. - return &d_output[inserted_idx]; - } else { - // All input `idx` values have been inserted into the map before. - // Thus, searching for an `idx` key resulting in the `end()` iterator only happens if - // `d_equal(idx, idx) == false`. - // Such situations are due to comparing nulls or NaNs which are considered as always unequal. - // In those cases, all rows containing nulls or NaNs are distinct. Just return their direct - // output slot. - return &d_output[idx]; - } - } -}; - -/** - * @brief Perform a reduction on groups of rows that are compared equal. - * - * This is essentially a reduce-by-key operation with keys are non-contiguous rows and are compared - * equal. 
A hash table is used to find groups of equal rows. - * - * At the beginning of the operation, the entire output array is filled with a value given by - * the `init` parameter. Then, the reduction result for each row group is written into the output - * array at the index of an unspecified row in the group. - * - * @tparam ReduceFuncBuilder The builder class that must have a `build()` method returning a - * reduction functor derived from `reduce_by_row_fn_base` - * @tparam OutputType Type of the reduction results - * @param map The auxiliary map to perform reduction - * @param preprocessed_input The preprocessed of the input rows for computing row hashing and row - * comparisons - * @param num_rows The number of all input rows - * @param has_nulls Indicate whether the input rows has any nulls at any nested levels - * @param has_nested_columns Indicates whether the input table has any nested columns - * @param nulls_equal Flag to specify whether null elements should be considered as equal - * @param nans_equal Flag to specify whether NaN values in floating point column should be - * considered equal. - * @param init The initial value for reduction of each row group - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource used to allocate the returned vector - * @return A device_uvector containing the reduction results - */ -template -rmm::device_uvector hash_reduce_by_row( - hash_map_type const& map, - std::shared_ptr const preprocessed_input, - size_type num_rows, - cudf::nullate::DYNAMIC has_nulls, - bool has_nested_columns, - null_equality nulls_equal, - nan_equality nans_equal, - ReduceFuncBuilder func_builder, - OutputType init, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - auto const map_dview = map.get_device_view(); - auto const row_hasher = cudf::experimental::row::hash::row_hasher(preprocessed_input); - auto const key_hasher = row_hasher.device_hasher(has_nulls); - auto const row_comp = cudf::experimental::row::equality::self_comparator(preprocessed_input); - - auto reduction_results = rmm::device_uvector(num_rows, stream, mr); - thrust::uninitialized_fill( - rmm::exec_policy(stream), reduction_results.begin(), reduction_results.end(), init); - - auto const reduce_by_row = [&](auto const value_comp) { - if (has_nested_columns) { - auto const key_equal = row_comp.equal_to(has_nulls, nulls_equal, value_comp); - thrust::for_each( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_rows), - func_builder.build(map_dview, key_hasher, key_equal, reduction_results.begin())); - } else { - auto const key_equal = row_comp.equal_to(has_nulls, nulls_equal, value_comp); - thrust::for_each( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_rows), - func_builder.build(map_dview, key_hasher, key_equal, reduction_results.begin())); - } - }; - - if (nans_equal == nan_equality::ALL_EQUAL) { - using nan_equal_comparator = - cudf::experimental::row::equality::nan_equal_physical_equality_comparator; - reduce_by_row(nan_equal_comparator{}); - } else { - using nan_unequal_comparator = cudf::experimental::row::equality::physical_equality_comparator; - reduce_by_row(nan_unequal_comparator{}); - } - - return reduction_results; -} - -} // namespace cudf::detail diff --git a/cpp/include/cudf/detail/is_element_valid.hpp b/cpp/include/cudf/detail/is_element_valid.hpp index 4b74d12f306..26b1bec2ced 100644 --- 
a/cpp/include/cudf/detail/is_element_valid.hpp +++ b/cpp/include/cudf/detail/is_element_valid.hpp @@ -17,7 +17,6 @@ #pragma once #include -#include #include #include diff --git a/cpp/include/cudf/detail/iterator.cuh b/cpp/include/cudf/detail/iterator.cuh index 4349e1b70fd..30f36d6a5da 100644 --- a/cpp/include/cudf/detail/iterator.cuh +++ b/cpp/include/cudf/detail/iterator.cuh @@ -38,18 +38,19 @@ #include #include +#include +#include #include #include #include #include -#include - namespace cudf { namespace detail { /** * @brief Convenience wrapper for creating a `thrust::transform_iterator` over a - * `thrust::counting_iterator`. + * `thrust::counting_iterator` within the range [0, INT_MAX]. + * * * Example: * @code{.cpp} @@ -62,14 +63,21 @@ namespace detail { * iter[n] == n * n * @endcode * - * @param start The starting value of the counting iterator + * @param start The starting value of the counting iterator (must be size_type or smaller type). * @param f The unary function to apply to the counting iterator. * @return A transform iterator that applies `f` to a counting iterator */ -template -CUDF_HOST_DEVICE inline auto make_counting_transform_iterator(cudf::size_type start, +template +CUDF_HOST_DEVICE inline auto make_counting_transform_iterator(CountingIterType start, UnaryFunction f) { + // Check if the `start` for counting_iterator is of size_type or a smaller integral type + static_assert( + cuda::std::is_integral_v and + cuda::std::numeric_limits::digits <= + cuda::std::numeric_limits::digits, + "The `start` for the counting_transform_iterator must be size_type or smaller type"); + return thrust::make_transform_iterator(thrust::make_counting_iterator(start), f); } diff --git a/cpp/include/cudf/detail/null_mask.cuh b/cpp/include/cudf/detail/null_mask.cuh index 327c732716c..025e2ccc3ec 100644 --- a/cpp/include/cudf/detail/null_mask.cuh +++ b/cpp/include/cudf/detail/null_mask.cuh @@ -16,6 +16,7 @@ #pragma once #include +#include #include #include #include @@ -25,7 +26,6 @@ #include #include -#include #include #include @@ -165,17 +165,10 @@ size_type inplace_bitmask_binop(Binop op, "Mask pointer cannot be null"); rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref(); - rmm::device_scalar d_counter{0, stream, mr}; - rmm::device_uvector d_masks(masks.size(), stream, mr); - rmm::device_uvector d_begin_bits(masks_begin_bits.size(), stream, mr); - - CUDF_CUDA_TRY(cudaMemcpyAsync( - d_masks.data(), masks.data(), masks.size_bytes(), cudaMemcpyDefault, stream.value())); - CUDF_CUDA_TRY(cudaMemcpyAsync(d_begin_bits.data(), - masks_begin_bits.data(), - masks_begin_bits.size_bytes(), - cudaMemcpyDefault, - stream.value())); + cudf::detail::device_scalar d_counter{0, stream, mr}; + + auto d_masks = cudf::detail::make_device_uvector_async(masks, stream, mr); + auto d_begin_bits = cudf::detail::make_device_uvector_async(masks_begin_bits, stream, mr); auto constexpr block_size = 256; cudf::detail::grid_1d config(dest_mask.size(), block_size); diff --git a/cpp/include/cudf/detail/tdigest/tdigest.hpp b/cpp/include/cudf/detail/tdigest/tdigest.hpp index 80a4460023f..4295f5e6ddd 100644 --- a/cpp/include/cudf/detail/tdigest/tdigest.hpp +++ b/cpp/include/cudf/detail/tdigest/tdigest.hpp @@ -143,28 +143,30 @@ std::unique_ptr make_tdigest_column(size_type num_rows, rmm::device_async_resource_ref mr); /** - * @brief Create an empty tdigest column. + * @brief Create a tdigest column of empty tdigests. 
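The tightened `make_counting_transform_iterator` above now rejects counter types wider than `size_type` at compile time; a small sketch of what compiles and what no longer does:

```c++
#include <cudf/detail/iterator.cuh>
#include <cudf/types.hpp>

#include <cstdint>

struct square_fn {
  __host__ __device__ cudf::size_type operator()(cudf::size_type i) const { return i * i; }
};

void demo()
{
  // OK: the start value is size_type.
  auto it = cudf::detail::make_counting_transform_iterator(cudf::size_type{0}, square_fn{});
  // it[4] == 16

  // Now a compile error: int64_t has more digits than size_type, so the
  // static_assert added above fires.
  // auto bad = cudf::detail::make_counting_transform_iterator(std::int64_t{0}, square_fn{});
}
```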
* - * An empty tdigest column contains a single row of length 0 + * The column created contains the specified number of rows of empty tdigests. * + * @param num_rows The number of rows in the output column. * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned column's device memory. * - * @returns An empty tdigest column. + * @returns A tdigest column of empty clusters. */ CUDF_EXPORT -std::unique_ptr make_empty_tdigest_column(rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr); +std::unique_ptr make_empty_tdigests_column(size_type num_rows, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); /** - * @brief Create an empty tdigest scalar. + * @brief Create a scalar of an empty tdigest cluster. * - * An empty tdigest scalar is a struct_scalar that contains a single row of length 0 + * The returned scalar is a struct_scalar that contains a single row of an empty cluster. * * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned column's device memory. * - * @returns An empty tdigest scalar. + * @returns A scalar of an empty tdigest cluster. */ std::unique_ptr make_empty_tdigest_scalar(rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); diff --git a/cpp/include/cudf/detail/unary.hpp b/cpp/include/cudf/detail/unary.hpp index 18b1e9b2d2e..0f852db0c54 100644 --- a/cpp/include/cudf/detail/unary.hpp +++ b/cpp/include/cudf/detail/unary.hpp @@ -59,7 +59,7 @@ std::unique_ptr true_if(InputIterator begin, auto output_mutable_view = output->mutable_view(); auto output_data = output_mutable_view.data(); - thrust::transform(rmm::exec_policy(stream), begin, end, output_data, p); + thrust::transform(rmm::exec_policy_nosync(stream), begin, end, output_data, p); return output; } diff --git a/cpp/include/cudf/detail/utilities/batched_memcpy.hpp b/cpp/include/cudf/detail/utilities/batched_memcpy.hpp new file mode 100644 index 00000000000..ed0ab9e6e5b --- /dev/null +++ b/cpp/include/cudf/detail/utilities/batched_memcpy.hpp @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include + +#include +#include +#include + +namespace CUDF_EXPORT cudf { +namespace detail { + +/** + * @brief A helper function that copies a vector of vectors from source to destination addresses in + * a batched manner. 
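With the rename above, callers supply the number of empty tdigests explicitly; a minimal sketch, assuming the `cudf::tdigest::detail` namespace these helpers live in:

```c++
#include <cudf/detail/tdigest/tdigest.hpp>
#include <cudf/utilities/memory_resource.hpp>

#include <rmm/cuda_stream_view.hpp>

#include <memory>

std::unique_ptr<cudf::column> four_empty_tdigests(rmm::cuda_stream_view stream)
{
  // Four rows, each an empty tdigest, instead of the former single empty row.
  return cudf::tdigest::detail::make_empty_tdigests_column(
    4, stream, cudf::get_current_device_resource_ref());
}
```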
+ * + * @tparam SrcIterator **[inferred]** The type of device-accessible source addresses iterator + * @tparam DstIterator **[inferred]** The type of device-accessible destination address iterator + * @tparam SizeIterator **[inferred]** The type of device-accessible buffer size iterator + * + * @param src_iter Device-accessible iterator to source addresses + * @param dst_iter Device-accessible iterator to destination addresses + * @param size_iter Device-accessible iterator to the buffer sizes (in bytes) + * @param num_buffs Number of buffers to be copied + * @param stream CUDA stream to use + */ +template +void batched_memcpy_async(SrcIterator src_iter, + DstIterator dst_iter, + SizeIterator size_iter, + size_t num_buffs, + rmm::cuda_stream_view stream) +{ + size_t temp_storage_bytes = 0; + cub::DeviceMemcpy::Batched( + nullptr, temp_storage_bytes, src_iter, dst_iter, size_iter, num_buffs, stream.value()); + + rmm::device_buffer d_temp_storage{temp_storage_bytes, stream.value()}; + + cub::DeviceMemcpy::Batched(d_temp_storage.data(), + temp_storage_bytes, + src_iter, + dst_iter, + size_iter, + num_buffs, + stream.value()); +} + +} // namespace detail +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/io/detail/batched_memset.hpp b/cpp/include/cudf/detail/utilities/batched_memset.hpp similarity index 95% rename from cpp/include/cudf/io/detail/batched_memset.hpp rename to cpp/include/cudf/detail/utilities/batched_memset.hpp index 1c74be4a9fe..78be5b91248 100644 --- a/cpp/include/cudf/io/detail/batched_memset.hpp +++ b/cpp/include/cudf/detail/utilities/batched_memset.hpp @@ -28,7 +28,7 @@ #include namespace CUDF_EXPORT cudf { -namespace io::detail { +namespace detail { /** * @brief A helper function that takes in a vector of device spans and memsets them to the @@ -53,8 +53,8 @@ void batched_memset(std::vector> const& bufs, cudf::detail::make_device_uvector_async(bufs, stream, cudf::get_current_device_resource_ref()); // get a vector with the sizes of all buffers - auto sizes = cudf::detail::make_counting_transform_iterator( - static_cast(0), + auto sizes = thrust::make_transform_iterator( + thrust::counting_iterator(0), cuda::proclaim_return_type( [gpu_bufs = gpu_bufs.data()] __device__(std::size_t i) { return gpu_bufs[i].size(); })); @@ -78,5 +78,5 @@ void batched_memset(std::vector> const& bufs, d_temp_storage.data(), temp_storage_bytes, iter_in, iter_out, sizes, num_bufs, stream); } -} // namespace io::detail +} // namespace detail } // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp b/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp index 632d5a732ec..4f0c52c5954 100644 --- a/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp +++ b/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp @@ -17,6 +17,7 @@ #pragma once #include +#include #include @@ -25,33 +26,82 @@ namespace detail { enum class host_memory_kind : uint8_t { PINNED, PAGEABLE }; +void cuda_memcpy_async_impl( + void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream); + /** - * @brief Asynchronously copies data between the host and device. + * @brief Asynchronously copies data from host to device memory. * * Implementation may use different strategies depending on the size and type of host data. 
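The cub-backed helper defined above consumes device-accessible iterators of source pointers, destination pointers, and byte counts; wiring it up looks roughly like this (the wrapper function is illustrative):

```c++
#include <cudf/detail/utilities/batched_memcpy.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>

#include <cstddef>

void copy_all(rmm::device_uvector<void*> const& srcs,
              rmm::device_uvector<void*> const& dsts,
              rmm::device_uvector<std::size_t> const& sizes,
              rmm::cuda_stream_view stream)
{
  // One cub::DeviceMemcpy::Batched launch replaces sizes.size() cudaMemcpyAsync calls.
  cudf::detail::batched_memcpy_async(
    srcs.begin(), dsts.begin(), sizes.begin(), sizes.size(), stream);
}
```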
 *
- * @param dst Destination memory address
- * @param src Source memory address
- * @param size Number of bytes to copy
- * @param kind Type of host memory
+ * @param dst Destination device memory
+ * @param src Source host memory
 * @param stream CUDA stream used for the copy
 */
-void cuda_memcpy_async(
-  void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream);
+template <typename T>
+void cuda_memcpy_async(device_span<T> dst, host_span<T const> src, rmm::cuda_stream_view stream)
+{
+  CUDF_EXPECTS(dst.size() == src.size(), "Mismatched sizes in cuda_memcpy_async");
+  auto const is_pinned = src.is_device_accessible();
+  cuda_memcpy_async_impl(dst.data(),
+                         src.data(),
+                         src.size_bytes(),
+                         is_pinned ? host_memory_kind::PINNED : host_memory_kind::PAGEABLE,
+                         stream);
+}

 /**
- * @brief Synchronously copies data between the host and device.
+ * @brief Asynchronously copies data from device to host memory.
  *
  * Implementation may use different strategies depending on the size and type of host data.
  *
- * @param dst Destination memory address
- * @param src Source memory address
- * @param size Number of bytes to copy
- * @param kind Type of host memory
+ * @param dst Destination host memory
+ * @param src Source device memory
  * @param stream CUDA stream used for the copy
  */
-void cuda_memcpy(
-  void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream);
+template <typename T>
+void cuda_memcpy_async(host_span<T> dst, device_span<T const> src, rmm::cuda_stream_view stream)
+{
+  CUDF_EXPECTS(dst.size() == src.size(), "Mismatched sizes in cuda_memcpy_async");
+  auto const is_pinned = dst.is_device_accessible();
+  cuda_memcpy_async_impl(dst.data(),
+                         src.data(),
+                         src.size_bytes(),
+                         is_pinned ? host_memory_kind::PINNED : host_memory_kind::PAGEABLE,
+                         stream);
+}
+
+/**
+ * @brief Synchronously copies data from host to device memory.
+ *
+ * Implementation may use different strategies depending on the size and type of host data.
+ *
+ * @param dst Destination device memory
+ * @param src Source host memory
+ * @param stream CUDA stream used for the copy
+ */
+template <typename T>
+void cuda_memcpy(device_span<T> dst, host_span<T const> src, rmm::cuda_stream_view stream)
+{
+  cuda_memcpy_async(dst, src, stream);
+  stream.synchronize();
+}
+
+/**
+ * @brief Synchronously copies data from device to host memory.
+ *
+ * Implementation may use different strategies depending on the size and type of host data.
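A sketch of calling the new span-based overloads. The point of the redesign is visible at the call site: the `host_memory_kind` argument is gone because pinned vs. pageable is derived from `is_device_accessible()` inside the helper. The buffers below are illustrative:

```cpp
#include <cudf/detail/utilities/cuda_memcpy.hpp>
#include <cudf/utilities/span.hpp>

#include <rmm/device_uvector.hpp>

#include <vector>

std::vector<int> h_in(1024, 7);
rmm::device_uvector<int> d_buf(h_in.size(), stream);

// Host -> device, asynchronous; pageable host memory is detected automatically.
cudf::detail::cuda_memcpy_async(
  cudf::device_span<int>{d_buf}, cudf::host_span<int const>{h_in}, stream);

// Device -> host, synchronous variant (synchronizes the stream internally).
std::vector<int> h_out(h_in.size());
cudf::detail::cuda_memcpy(
  cudf::host_span<int>{h_out}, cudf::device_span<int const>{d_buf}, stream);
```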
+ * + * @param dst Destination host memory + * @param src Source device memory + * @param stream CUDA stream used for the copy + */ +template +void cuda_memcpy(host_span dst, device_span src, rmm::cuda_stream_view stream) +{ + cuda_memcpy_async(dst, src, stream); + stream.synchronize(); +} } // namespace detail } // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/detail/utilities/host_vector.hpp b/cpp/include/cudf/detail/utilities/host_vector.hpp index ecb8f910463..3f6ad7b7b1d 100644 --- a/cpp/include/cudf/detail/utilities/host_vector.hpp +++ b/cpp/include/cudf/detail/utilities/host_vector.hpp @@ -183,7 +183,7 @@ class rmm_host_allocator { */ inline bool operator!=(rmm_host_allocator const& x) const { return !operator==(x); } - bool is_device_accessible() const { return _is_device_accessible; } + [[nodiscard]] bool is_device_accessible() const { return _is_device_accessible; } private: rmm::host_async_resource_ref mr; diff --git a/cpp/include/cudf/detail/utilities/logger.hpp b/cpp/include/cudf/detail/utilities/logger.hpp index 8c1c3c28df8..e7643eb44bd 100644 --- a/cpp/include/cudf/detail/utilities/logger.hpp +++ b/cpp/include/cudf/detail/utilities/logger.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,9 +19,9 @@ #include // Log messages that require computation should only be used at level TRACE and DEBUG -#define CUDF_LOG_TRACE(...) SPDLOG_LOGGER_TRACE(&cudf::logger(), __VA_ARGS__) -#define CUDF_LOG_DEBUG(...) SPDLOG_LOGGER_DEBUG(&cudf::logger(), __VA_ARGS__) -#define CUDF_LOG_INFO(...) SPDLOG_LOGGER_INFO(&cudf::logger(), __VA_ARGS__) -#define CUDF_LOG_WARN(...) SPDLOG_LOGGER_WARN(&cudf::logger(), __VA_ARGS__) -#define CUDF_LOG_ERROR(...) SPDLOG_LOGGER_ERROR(&cudf::logger(), __VA_ARGS__) -#define CUDF_LOG_CRITICAL(...) SPDLOG_LOGGER_CRITICAL(&cudf::logger(), __VA_ARGS__) +#define CUDF_LOG_TRACE(...) SPDLOG_LOGGER_TRACE(&cudf::detail::logger(), __VA_ARGS__) +#define CUDF_LOG_DEBUG(...) SPDLOG_LOGGER_DEBUG(&cudf::detail::logger(), __VA_ARGS__) +#define CUDF_LOG_INFO(...) SPDLOG_LOGGER_INFO(&cudf::detail::logger(), __VA_ARGS__) +#define CUDF_LOG_WARN(...) SPDLOG_LOGGER_WARN(&cudf::detail::logger(), __VA_ARGS__) +#define CUDF_LOG_ERROR(...) SPDLOG_LOGGER_ERROR(&cudf::detail::logger(), __VA_ARGS__) +#define CUDF_LOG_CRITICAL(...) SPDLOG_LOGGER_CRITICAL(&cudf::detail::logger(), __VA_ARGS__) diff --git a/cpp/include/cudf/detail/utilities/vector_factories.hpp b/cpp/include/cudf/detail/utilities/vector_factories.hpp index 953ae5b9308..1f1e7a2db77 100644 --- a/cpp/include/cudf/detail/utilities/vector_factories.hpp +++ b/cpp/include/cudf/detail/utilities/vector_factories.hpp @@ -101,12 +101,7 @@ rmm::device_uvector make_device_uvector_async(host_span source_data, rmm::device_async_resource_ref mr) { rmm::device_uvector ret(source_data.size(), stream, mr); - auto const is_pinned = source_data.is_device_accessible(); - cuda_memcpy_async(ret.data(), - source_data.data(), - source_data.size() * sizeof(T), - is_pinned ? 
host_memory_kind::PINNED : host_memory_kind::PAGEABLE, - stream); + cuda_memcpy_async(ret, source_data, stream); return ret; } @@ -405,13 +400,8 @@ host_vector make_empty_host_vector(size_t capacity, rmm::cuda_stream_view str template host_vector make_host_vector_async(device_span v, rmm::cuda_stream_view stream) { - auto result = make_host_vector(v.size(), stream); - auto const is_pinned = result.get_allocator().is_device_accessible(); - cuda_memcpy_async(result.data(), - v.data(), - v.size() * sizeof(T), - is_pinned ? host_memory_kind::PINNED : host_memory_kind::PAGEABLE, - stream); + auto result = make_host_vector(v.size(), stream); + cuda_memcpy_async(result, v, stream); return result; } diff --git a/cpp/include/cudf/detail/valid_if.cuh b/cpp/include/cudf/detail/valid_if.cuh index cfb2e70bfed..af182b69c3a 100644 --- a/cpp/include/cudf/detail/valid_if.cuh +++ b/cpp/include/cudf/detail/valid_if.cuh @@ -16,6 +16,7 @@ #pragma once +#include #include #include #include @@ -25,7 +26,6 @@ #include #include -#include #include @@ -101,7 +101,7 @@ std::pair valid_if(InputIterator begin, size_type null_count{0}; if (size > 0) { - rmm::device_scalar valid_count{0, stream}; + cudf::detail::device_scalar valid_count{0, stream}; constexpr size_type block_size{256}; grid_1d grid{size, block_size}; diff --git a/cpp/include/cudf/dictionary/dictionary_column_view.hpp b/cpp/include/cudf/dictionary/dictionary_column_view.hpp index dc822fee38b..0a799f27d00 100644 --- a/cpp/include/cudf/dictionary/dictionary_column_view.hpp +++ b/cpp/include/cudf/dictionary/dictionary_column_view.hpp @@ -15,7 +15,6 @@ */ #pragma once -#include #include /** @@ -47,7 +46,7 @@ class dictionary_column_view : private column_view { dictionary_column_view(column_view const& dictionary_column); dictionary_column_view(dictionary_column_view&&) = default; ///< Move constructor dictionary_column_view(dictionary_column_view const&) = default; ///< Copy constructor - ~dictionary_column_view() = default; + ~dictionary_column_view() override = default; /** * @brief Move assignment operator diff --git a/cpp/include/cudf/fixed_point/floating_conversion.hpp b/cpp/include/cudf/fixed_point/detail/floating_conversion.hpp similarity index 99% rename from cpp/include/cudf/fixed_point/floating_conversion.hpp rename to cpp/include/cudf/fixed_point/detail/floating_conversion.hpp index f0d50edccd1..fce08b4a5c4 100644 --- a/cpp/include/cudf/fixed_point/floating_conversion.hpp +++ b/cpp/include/cudf/fixed_point/detail/floating_conversion.hpp @@ -26,14 +26,6 @@ #include namespace CUDF_EXPORT numeric { - -/** - * @addtogroup floating_conversion - * @{ - * @file - * @brief fixed_point <--> floating-point conversion functions. 
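The two factory rewrites above are behavior-preserving: the pinned/pageable branching they used to spell out now lives behind the span overloads of `cuda_memcpy_async`. A round-trip sketch under that assumption (`stream` and `mr` are illustrative):

```cpp
#include <cudf/detail/utilities/vector_factories.hpp>

#include <vector>

std::vector<float> host_data(1000, 1.5f);

// Host -> device: one call, no explicit host_memory_kind.
auto d_vec = cudf::detail::make_device_uvector_async(
  cudf::host_span<float const>{host_data}, stream, mr);

// Device -> host: returns a cudf::detail::host_vector<float>.
auto h_vec = cudf::detail::make_host_vector_async(
  cudf::device_span<float const>{d_vec}, stream);
stream.synchronize();  // wait before reading h_vec on the host
```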
- */ - namespace detail { /** @@ -1141,6 +1133,4 @@ CUDF_HOST_DEVICE inline FloatingType convert_integral_to_floating(Rep const& val } } // namespace detail - -/** @} */ // end of group } // namespace CUDF_EXPORT numeric diff --git a/cpp/include/cudf/groupby.hpp b/cpp/include/cudf/groupby.hpp index 11c778408fe..c9df02f167a 100644 --- a/cpp/include/cudf/groupby.hpp +++ b/cpp/include/cudf/groupby.hpp @@ -36,7 +36,7 @@ namespace CUDF_EXPORT cudf { namespace groupby { namespace detail { namespace sort { -class sort_groupby_helper; +struct sort_groupby_helper; } // namespace sort } // namespace detail diff --git a/cpp/include/cudf/hashing.hpp b/cpp/include/cudf/hashing.hpp index 0c5327edb91..307a52cd242 100644 --- a/cpp/include/cudf/hashing.hpp +++ b/cpp/include/cudf/hashing.hpp @@ -22,26 +22,27 @@ namespace CUDF_EXPORT cudf { -/** - * @addtogroup column_hash - * @{ - * @file - */ - /** * @brief Type of hash value - * + * @ingroup column_hash */ using hash_value_type = uint32_t; /** * @brief The default seed value for hash functions + * @ingroup column_hash */ static constexpr uint32_t DEFAULT_HASH_SEED = 0; //! Hash APIs namespace hashing { +/** + * @addtogroup column_hash + * @{ + * @file + */ + /** * @brief Computes the MurmurHash3 32-bit hash value of each row in the given table * @@ -183,7 +184,8 @@ std::unique_ptr xxhash_64( rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); +/** @} */ // end of group + } // namespace hashing -/** @} */ // end of group } // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/hashing/detail/helper_functions.cuh b/cpp/include/cudf/hashing/detail/helper_functions.cuh index 3489fdeccee..ea1accc62a4 100644 --- a/cpp/include/cudf/hashing/detail/helper_functions.cuh +++ b/cpp/include/cudf/hashing/detail/helper_functions.cuh @@ -47,197 +47,3 @@ inline size_t compute_hash_table_size(cudf::size_type num_keys_to_insert, return hash_table_size; } - -template -__forceinline__ __device__ pair_type load_pair_vectorized(pair_type const* __restrict__ const ptr) -{ - if (sizeof(uint4) == sizeof(pair_type)) { - union pair_type2vec_type { - uint4 vec_val; - pair_type pair_val; - }; - pair_type2vec_type converter = {0, 0, 0, 0}; - converter.vec_val = *reinterpret_cast(ptr); - return converter.pair_val; - } else if (sizeof(uint2) == sizeof(pair_type)) { - union pair_type2vec_type { - uint2 vec_val; - pair_type pair_val; - }; - pair_type2vec_type converter = {0, 0}; - converter.vec_val = *reinterpret_cast(ptr); - return converter.pair_val; - } else if (sizeof(int) == sizeof(pair_type)) { - union pair_type2vec_type { - int vec_val; - pair_type pair_val; - }; - pair_type2vec_type converter = {0}; - converter.vec_val = *reinterpret_cast(ptr); - return converter.pair_val; - } else if (sizeof(short) == sizeof(pair_type)) { - union pair_type2vec_type { - short vec_val; - pair_type pair_val; - }; - pair_type2vec_type converter = {0}; - converter.vec_val = *reinterpret_cast(ptr); - return converter.pair_val; - } else { - return *ptr; - } -} - -template -__forceinline__ __device__ void store_pair_vectorized(pair_type* __restrict__ const ptr, - pair_type const val) -{ - if (sizeof(uint4) == sizeof(pair_type)) { - union pair_type2vec_type { - uint4 vec_val; - pair_type pair_val; - }; - pair_type2vec_type converter = {0, 0, 0, 0}; - converter.pair_val = val; - *reinterpret_cast(ptr) = converter.vec_val; - } else if (sizeof(uint2) == sizeof(pair_type)) { - union pair_type2vec_type { - uint2 vec_val; - 
pair_type pair_val; - }; - pair_type2vec_type converter = {0, 0}; - converter.pair_val = val; - *reinterpret_cast(ptr) = converter.vec_val; - } else if (sizeof(int) == sizeof(pair_type)) { - union pair_type2vec_type { - int vec_val; - pair_type pair_val; - }; - pair_type2vec_type converter = {0}; - converter.pair_val = val; - *reinterpret_cast(ptr) = converter.vec_val; - } else if (sizeof(short) == sizeof(pair_type)) { - union pair_type2vec_type { - short vec_val; - pair_type pair_val; - }; - pair_type2vec_type converter = {0}; - converter.pair_val = val; - *reinterpret_cast(ptr) = converter.vec_val; - } else { - *ptr = val; - } -} - -template -CUDF_KERNEL void init_hashtbl(value_type* __restrict__ const hashtbl_values, - size_type const n, - key_type const key_val, - elem_type const elem_val) -{ - size_type const idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < n) { - store_pair_vectorized(hashtbl_values + idx, thrust::make_pair(key_val, elem_val)); - } -} - -template -struct equal_to { - using result_type = bool; - using first_argument_type = T; - using second_argument_type = T; - __forceinline__ __host__ __device__ constexpr bool operator()( - first_argument_type const& lhs, second_argument_type const& rhs) const - { - return lhs == rhs; - } -}; - -template -class cycle_iterator_adapter { - public: - using value_type = typename std::iterator_traits::value_type; - using difference_type = typename std::iterator_traits::difference_type; - using pointer = typename std::iterator_traits::pointer; - using reference = typename std::iterator_traits::reference; - using iterator_type = Iterator; - - cycle_iterator_adapter() = delete; - - __host__ __device__ explicit cycle_iterator_adapter(iterator_type const& begin, - iterator_type const& end, - iterator_type const& current) - : m_begin(begin), m_end(end), m_current(current) - { - } - - __host__ __device__ cycle_iterator_adapter& operator++() - { - if (m_end == (m_current + 1)) - m_current = m_begin; - else - ++m_current; - return *this; - } - - __host__ __device__ cycle_iterator_adapter const& operator++() const - { - if (m_end == (m_current + 1)) - m_current = m_begin; - else - ++m_current; - return *this; - } - - __host__ __device__ cycle_iterator_adapter& operator++(int) - { - cycle_iterator_adapter old(m_begin, m_end, m_current); - if (m_end == (m_current + 1)) - m_current = m_begin; - else - ++m_current; - return old; - } - - __host__ __device__ cycle_iterator_adapter const& operator++(int) const - { - cycle_iterator_adapter old(m_begin, m_end, m_current); - if (m_end == (m_current + 1)) - m_current = m_begin; - else - ++m_current; - return old; - } - - __host__ __device__ bool equal(cycle_iterator_adapter const& other) const - { - return m_current == other.m_current && m_begin == other.m_begin && m_end == other.m_end; - } - - __host__ __device__ reference& operator*() { return *m_current; } - - __host__ __device__ reference const& operator*() const { return *m_current; } - - __host__ __device__ const pointer operator->() const { return m_current.operator->(); } - - __host__ __device__ pointer operator->() { return m_current; } - - private: - iterator_type m_current; - iterator_type m_begin; - iterator_type m_end; -}; - -template -__host__ __device__ bool operator==(cycle_iterator_adapter const& lhs, - cycle_iterator_adapter const& rhs) -{ - return lhs.equal(rhs); -} - -template -__host__ __device__ bool operator!=(cycle_iterator_adapter const& lhs, - cycle_iterator_adapter const& rhs) -{ - return !lhs.equal(rhs); -} diff --git 
a/cpp/include/cudf/io/config_utils.hpp b/cpp/include/cudf/io/config_utils.hpp index 1827ba0e3e6..070b59a117c 100644 --- a/cpp/include/cudf/io/config_utils.hpp +++ b/cpp/include/cudf/io/config_utils.hpp @@ -18,7 +18,8 @@ #include namespace CUDF_EXPORT cudf { -namespace io::cufile_integration { +namespace io { +namespace cufile_integration { /** * @brief Returns true if cuFile and its compatibility mode are enabled. @@ -35,9 +36,18 @@ bool is_gds_enabled(); */ bool is_kvikio_enabled(); -} // namespace io::cufile_integration +/** + * @brief Set KvikIO parameters, including: + * - Compatibility mode, according to the environment variable KVIKIO_COMPAT_MODE. If + * KVIKIO_COMPAT_MODE is not set, enable it by default, which enforces the use of POSIX I/O. + * - Thread pool size, according to the environment variable KVIKIO_NTHREADS. If KVIKIO_NTHREADS is + * not set, use 4 threads by default. + */ +void set_up_kvikio(); + +} // namespace cufile_integration -namespace io::nvcomp_integration { +namespace nvcomp_integration { /** * @brief Returns true if all nvCOMP uses are enabled. @@ -49,5 +59,6 @@ bool is_all_enabled(); */ bool is_stable_enabled(); -} // namespace io::nvcomp_integration +} // namespace nvcomp_integration +} // namespace io } // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/io/csv.hpp b/cpp/include/cudf/io/csv.hpp index dae056ef157..9b2de7c72ec 100644 --- a/cpp/include/cudf/io/csv.hpp +++ b/cpp/include/cudf/io/csv.hpp @@ -1362,7 +1362,7 @@ table_with_metadata read_csv( */ /** - *@brief Builder to build options for `writer_csv()`. + *@brief Builder to build options for `write_csv()`. */ class csv_writer_options_builder; diff --git a/cpp/include/cudf/io/datasource.hpp b/cpp/include/cudf/io/datasource.hpp index b12fbe39a57..7bec40893fd 100644 --- a/cpp/include/cudf/io/datasource.hpp +++ b/cpp/include/cudf/io/datasource.hpp @@ -79,21 +79,28 @@ class datasource { template static std::unique_ptr create(Container&& data_owner) { - return std::make_unique>(std::move(data_owner)); + return std::make_unique>(std::forward(data_owner)); } }; /** * @brief Creates a source from a file path. * + * Parameters `offset` and `max_size_estimate` are hints to the `datasource` implementation about + * the expected range of the data that will be read. The implementation may use these hints to + * optimize the read operation. These parameters are usually based on the byte range option. In + * this case, `max_size_estimate` can include padding after the byte range, to include additional + * data that may be needed for processing. + * * @param[in] filepath Path to the file to use - * @param[in] offset Bytes from the start of the file (the default is zero) - * @param[in] size Bytes from the offset; use zero for entire file (the default is zero) + * @param[in] offset Starting byte offset from which data will be read (the default is zero) + * @param[in] max_size_estimate Upper estimate of the data range that will be read (the default is + * zero, which means the whole file after `offset`) * @return Constructed datasource object */ static std::unique_ptr create(std::string const& filepath, - size_t offset = 0, - size_t size = 0); + size_t offset = 0, + size_t max_size_estimate = 0); /** * @brief Creates a source from a host memory buffer. @@ -328,13 +335,19 @@ class datasource { template class owning_buffer : public buffer { public: + // Require that the argument passed to the constructor be an rvalue (Container&& being an rvalue + // reference). 
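For the `datasource::create` change above, a sketch of the intended byte-range pattern: `offset` and `max_size_estimate` describe the range a reader expects to touch, and the estimate may include padding past the nominal range so a record straddling the boundary can still be parsed. All values below are illustrative:

```cpp
#include <cudf/io/datasource.hpp>

#include <cstddef>

std::size_t const byte_range_offset = 1 << 20;   // start of this worker's range
std::size_t const byte_range_size   = 4 << 20;   // nominal range length
std::size_t const padding           = 64 << 10;  // allowance for a trailing record

auto source = cudf::io::datasource::create(
  "/path/to/rows.jsonl", byte_range_offset, byte_range_size + padding);
```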
+ static_assert(std::is_rvalue_reference_v, + "The container argument passed to the constructor must be an rvalue."); + /** * @brief Moves the input container into the newly created object. * - * @param data_owner The container to construct the buffer from (ownership is transferred) + * @param moved_data_owner The container to construct the buffer from. Callers should explicitly + * pass std::move(data_owner) to this function to transfer the ownership. */ - owning_buffer(Container&& data_owner) - : _data(std::move(data_owner)), _data_ptr(_data.data()), _size(_data.size()) + owning_buffer(Container&& moved_data_owner) + : _data(std::move(moved_data_owner)), _data_ptr(_data.data()), _size(_data.size()) { } @@ -342,12 +355,13 @@ class datasource { * @brief Moves the input container into the newly created object, and exposes a subspan of the * buffer. * - * @param data_owner The container to construct the buffer from (ownership is transferred) + * @param moved_data_owner The container to construct the buffer from. Callers should explicitly + * pass std::move(data_owner) to this function to transfer the ownership. * @param data_ptr Pointer to the start of the subspan * @param size The size of the subspan */ - owning_buffer(Container&& data_owner, uint8_t const* data_ptr, size_t size) - : _data(std::move(data_owner)), _data_ptr(data_ptr), _size(size) + owning_buffer(Container&& moved_data_owner, uint8_t const* data_ptr, size_t size) + : _data(std::move(moved_data_owner)), _data_ptr(data_ptr), _size(size) { } diff --git a/cpp/include/cudf/io/detail/json.hpp b/cpp/include/cudf/io/detail/json.hpp index 940d03cdb41..2e2ac43d6fe 100644 --- a/cpp/include/cudf/io/detail/json.hpp +++ b/cpp/include/cudf/io/detail/json.hpp @@ -57,11 +57,13 @@ void write_json(data_sink* sink, /** * @brief Normalize single quotes to double quotes using FST * - * @param indata Input device buffer - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource to use for device memory allocation + * @param indata Input device buffer + * @param delimiter Line-separating delimiter character in JSONL inputs + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource to use for device memory allocation */ void normalize_single_quotes(datasource::owning_buffer& indata, + char delimiter, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp index 6798557e14e..0c3244a1c75 100644 --- a/cpp/include/cudf/io/json.hpp +++ b/cpp/include/cudf/io/json.hpp @@ -18,6 +18,7 @@ #include "types.hpp" +#include #include #include #include @@ -53,6 +54,11 @@ struct schema_element { * @brief Allows specifying this column's child columns target type */ std::map child_types; + + /** + * @brief Allows specifying the order of the columns + */ + std::optional> column_order; }; /** @@ -87,13 +93,18 @@ enum class json_recovery_mode_t { * | `chunksize` | use `byte_range_xxx` for chunking instead | */ class json_reader_options { + public: + using dtype_variant = + std::variant, + std::map, + std::map, + schema_element>; ///< Variant type holding dtypes information for the columns + + private: source_info _source; // Data types of the column; empty to infer dtypes - std::variant, - std::map, - std::map> - _dtypes; + dtype_variant _dtypes; // Specify the compression format of the source or infer from file extension compression_type _compression = 
compression_type::AUTO; @@ -116,9 +127,6 @@ class json_reader_options { // Whether to parse dates as DD/MM versus MM/DD bool _dayfirst = false; - // Whether to use the legacy reader - bool _legacy = false; - // Whether to keep the quote characters of string values bool _keep_quotes = false; @@ -181,13 +189,7 @@ class json_reader_options { * * @returns Data types of the columns */ - [[nodiscard]] std::variant, - std::map, - std::map> const& - get_dtypes() const - { - return _dtypes; - } + [[nodiscard]] dtype_variant const& get_dtypes() const { return _dtypes; } /** * @brief Returns compression format of the source. @@ -231,7 +233,11 @@ class json_reader_options { */ [[nodiscard]] size_t get_byte_range_padding() const { - auto const num_columns = std::visit([](auto const& dtypes) { return dtypes.size(); }, _dtypes); + auto const num_columns = + std::visit(cudf::detail::visitor_overload{ + [](auto const& dtypes) { return dtypes.size(); }, + [](schema_element const& dtypes) { return dtypes.child_types.size(); }}, + _dtypes); auto const max_row_bytes = 16 * 1024; // 16KB auto const column_bytes = 64; @@ -393,6 +399,14 @@ class json_reader_options { */ void set_dtypes(std::map types) { _dtypes = std::move(types); } + /** + * @brief Set data types for a potentially nested column hierarchy. + * + * @param types schema element with column names and column order to support arbitrary nesting of + * data types + */ + void set_dtypes(schema_element types); + /** * @brief Set the compression type. * @@ -627,6 +641,18 @@ class json_reader_options_builder { return *this; } + /** + * @brief Set data types for columns to be read. + * + * @param types Struct schema_element with Column name -> schema_element with map and order + * @return this for chaining + */ + json_reader_options_builder& dtypes(schema_element types) + { + options.set_dtypes(std::move(types)); + return *this; + } + /** * @brief Set the compression type. * @@ -920,6 +946,8 @@ class json_writer_options_builder; class json_writer_options { // Specify the sink to use for writer output sink_info _sink; + // Specify the compression format of the sink + compression_type _compression = compression_type::NONE; // maximum number of rows to write in each chunk (limits memory use) size_type _rows_per_chunk = std::numeric_limits::max(); // Set of columns to output @@ -996,6 +1024,13 @@ class json_writer_options { */ [[nodiscard]] std::string const& get_na_rep() const { return _na_rep; } + /** + * @brief Returns compression type used for sink + * + * @return compression type for sink + */ + [[nodiscard]] compression_type get_compression() const { return _compression; } + /** * @brief Whether to output nulls as 'null'. * @@ -1040,6 +1075,13 @@ class json_writer_options { */ void set_table(table_view tbl) { _table = tbl; } + /** + * @brief Sets compression type to be used + * + * @param comptype Compression type for sink + */ + void set_compression(compression_type comptype) { _compression = comptype; } + /** * @brief Sets metadata. * @@ -1127,6 +1169,18 @@ class json_writer_options_builder { return *this; } + /** + * @brief Sets compression type of output sink + * + * @param comptype Compression type used + * @return this for chaining + */ + json_writer_options_builder& compression(compression_type comptype) + { + options._compression = comptype; + return *this; + } + /** * @brief Sets optional metadata (with column names). 
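A sketch of the new `schema_element`-based dtype path, under the assumption that the root `schema_element` is a STRUCT describing the row object; the `column_order` member added above pins both which columns appear and in what order:

```cpp
#include <cudf/io/json.hpp>

#include <string>
#include <vector>

using cudf::io::schema_element;

// Rows like {"a": {"x": 1.0}, "b": 2}; emit "b" before "a".
schema_element root{
  cudf::data_type{cudf::type_id::STRUCT},
  {{"b", schema_element{cudf::data_type{cudf::type_id::INT64}}},
   {"a",
    schema_element{cudf::data_type{cudf::type_id::STRUCT},
                   {{"x", schema_element{cudf::data_type{cudf::type_id::FLOAT64}}}},
                   std::vector<std::string>{"x"}}}},
  std::vector<std::string>{"b", "a"}};

auto opts = cudf::io::json_reader_options::builder(cudf::io::source_info{"rows.jsonl"})
              .lines(true)
              .dtypes(root)  // new overload taking a schema_element
              .build();
auto result = cudf::io::read_json(opts);
```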
* diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp index ee03a382bec..bfe76d5690c 100644 --- a/cpp/include/cudf/io/parquet.hpp +++ b/cpp/include/cudf/io/parquet.hpp @@ -1200,7 +1200,7 @@ class parquet_writer_options : public parquet_writer_options_base { * @param sink The sink used for writer output * @param table Table to be written to output */ - explicit parquet_writer_options(sink_info const& sink, table_view const& table); + explicit parquet_writer_options(sink_info const& sink, table_view table); public: /** diff --git a/cpp/include/cudf/io/text/detail/bgzip_utils.hpp b/cpp/include/cudf/io/text/detail/bgzip_utils.hpp index 11eb4518210..5659f86b0c4 100644 --- a/cpp/include/cudf/io/text/detail/bgzip_utils.hpp +++ b/cpp/include/cudf/io/text/detail/bgzip_utils.hpp @@ -16,16 +16,10 @@ #pragma once -#include #include #include -#include - -#include -#include #include -#include namespace CUDF_EXPORT cudf { namespace io::text::detail::bgzip { diff --git a/cpp/include/cudf/join.hpp b/cpp/include/cudf/join.hpp index a590eb27511..afefd04d4fa 100644 --- a/cpp/include/cudf/join.hpp +++ b/cpp/include/cudf/join.hpp @@ -573,7 +573,7 @@ class distinct_hash_join { * Result: {{1}, {0}} * @endcode * - * @throw cudf::logic_error if the binary predicate outputs a non-boolean result. + * @throw cudf::data_type_error if the binary predicate outputs a non-boolean result. * * @param left The left table * @param right The right table @@ -620,7 +620,7 @@ conditional_inner_join(table_view const& left, * Result: {{0, 1, 2}, {None, 0, None}} * @endcode * - * @throw cudf::logic_error if the binary predicate outputs a non-boolean result. + * @throw cudf::data_type_error if the binary predicate outputs a non-boolean result. * * @param left The left table * @param right The right table @@ -666,7 +666,7 @@ conditional_left_join(table_view const& left, * Result: {{0, 1, 2, None, None}, {None, 0, None, 1, 2}} * @endcode * - * @throw cudf::logic_error if the binary predicate outputs a non-boolean result. + * @throw cudf::data_type_error if the binary predicate outputs a non-boolean result. * * @param left The left table * @param right The right table @@ -705,7 +705,7 @@ conditional_full_join(table_view const& left, * Result: {1} * @endcode * - * @throw cudf::logic_error if the binary predicate outputs a non-boolean result. + * @throw cudf::data_type_error if the binary predicate outputs a non-boolean result. * * @param left The left table * @param right The right table @@ -746,7 +746,7 @@ std::unique_ptr> conditional_left_semi_join( * Result: {0, 2} * @endcode * - * @throw cudf::logic_error if the binary predicate outputs a non-boolean result. + * @throw cudf::data_type_error if the binary predicate outputs a non-boolean result. * * @param left The left table * @param right The right table @@ -793,7 +793,7 @@ std::unique_ptr> conditional_left_anti_join( * Result: {{1}, {0}} * @endcode * - * @throw cudf::logic_error If the binary predicate outputs a non-boolean result. + * @throw cudf::data_type_error If the binary predicate outputs a non-boolean result. * @throw cudf::logic_error If the number of rows in left_equality and left_conditional do not * match. * @throw cudf::logic_error If the number of rows in right_equality and right_conditional do not @@ -855,7 +855,7 @@ mixed_inner_join( * Result: {{0, 1, 2}, {None, 0, None}} * @endcode * - * @throw cudf::logic_error If the binary predicate outputs a non-boolean result. 
+ * @throw cudf::data_type_error If the binary predicate outputs a non-boolean result. * @throw cudf::logic_error If the number of rows in left_equality and left_conditional do not * match. * @throw cudf::logic_error If the number of rows in right_equality and right_conditional do not @@ -917,7 +917,7 @@ mixed_left_join( * Result: {{0, 1, 2, None, None}, {None, 0, None, 1, 2}} * @endcode * - * @throw cudf::logic_error If the binary predicate outputs a non-boolean result. + * @throw cudf::data_type_error If the binary predicate outputs a non-boolean result. * @throw cudf::logic_error If the number of rows in left_equality and left_conditional do not * match. * @throw cudf::logic_error If the number of rows in right_equality and right_conditional do not @@ -972,7 +972,7 @@ mixed_full_join( * Result: {1} * @endcode * - * @throw cudf::logic_error If the binary predicate outputs a non-boolean result. + * @throw cudf::data_type_error If the binary predicate outputs a non-boolean result. * @throw cudf::logic_error If the number of rows in left_equality and left_conditional do not * match. * @throw cudf::logic_error If the number of rows in right_equality and right_conditional do not @@ -1022,7 +1022,7 @@ std::unique_ptr> mixed_left_semi_join( * Result: {0, 2} * @endcode * - * @throw cudf::logic_error If the binary predicate outputs a non-boolean result. + * @throw cudf::data_type_error If the binary predicate outputs a non-boolean result. * @throw cudf::logic_error If the number of rows in left_equality and left_conditional do not * match. * @throw cudf::logic_error If the number of rows in right_equality and right_conditional do not @@ -1061,7 +1061,7 @@ std::unique_ptr> mixed_left_anti_join( * choose a suitable compare_nulls value AND use appropriate null-safe * operators in the expression. * - * @throw cudf::logic_error If the binary predicate outputs a non-boolean result. + * @throw cudf::data_type_error If the binary predicate outputs a non-boolean result. * @throw cudf::logic_error If the number of rows in left_equality and left_conditional do not * match. * @throw cudf::logic_error If the number of rows in right_equality and right_conditional do not @@ -1103,7 +1103,7 @@ std::pair>> mixed_in * choose a suitable compare_nulls value AND use appropriate null-safe * operators in the expression. * - * @throw cudf::logic_error If the binary predicate outputs a non-boolean result. + * @throw cudf::data_type_error If the binary predicate outputs a non-boolean result. * @throw cudf::logic_error If the number of rows in left_equality and left_conditional do not * match. * @throw cudf::logic_error If the number of rows in right_equality and right_conditional do not @@ -1142,7 +1142,7 @@ std::pair>> mixed_le * If the provided predicate returns NULL for a pair of rows * (left, right), that pair is not included in the output. * - * @throw cudf::logic_error if the binary predicate outputs a non-boolean result. + * @throw cudf::data_type_error if the binary predicate outputs a non-boolean result. * * @param left The left table * @param right The right table @@ -1167,7 +1167,7 @@ std::size_t conditional_inner_join_size( * If the provided predicate returns NULL for a pair of rows * (left, right), that pair is not included in the output. * - * @throw cudf::logic_error if the binary predicate outputs a non-boolean result. + * @throw cudf::data_type_error if the binary predicate outputs a non-boolean result. 
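The exception retagging in these joins is observable to callers: code that matched `cudf::logic_error` when an AST predicate evaluated to a non-boolean type should now match `cudf::data_type_error`. A sketch, where `left`, `right`, and `predicate` are illustrative:

```cpp
#include <cudf/join.hpp>
#include <cudf/utilities/error.hpp>

try {
  // `predicate` must produce BOOL8; any other output type is a type error now.
  auto joined = cudf::conditional_inner_join(left, right, predicate);
} catch (cudf::data_type_error const& e) {
  // previously surfaced as cudf::logic_error
}
```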
* * @param left The left table * @param right The right table @@ -1192,7 +1192,7 @@ std::size_t conditional_left_join_size( * If the provided predicate returns NULL for a pair of rows * (left, right), that pair is not included in the output. * - * @throw cudf::logic_error if the binary predicate outputs a non-boolean result. + * @throw cudf::data_type_error if the binary predicate outputs a non-boolean result. * * @param left The left table * @param right The right table @@ -1217,7 +1217,7 @@ std::size_t conditional_left_semi_join_size( * If the provided predicate returns NULL for a pair of rows * (left, right), that pair is not included in the output. * - * @throw cudf::logic_error if the binary predicate outputs a non-boolean result. + * @throw cudf::data_type_error if the binary predicate outputs a non-boolean result. * * @param left The left table * @param right The right table diff --git a/cpp/include/cudf/lists/detail/dremel.hpp b/cpp/include/cudf/lists/detail/dremel.hpp index 96ee30dd261..f45da8e8d8d 100644 --- a/cpp/include/cudf/lists/detail/dremel.hpp +++ b/cpp/include/cudf/lists/detail/dremel.hpp @@ -58,7 +58,7 @@ struct dremel_data { }; /** - * @brief Get the dremel offsets and repetition and definition levels for a LIST column + * @brief Get the dremel offsets, repetition levels, and definition levels for a LIST column * * Dremel is a query system created by Google for ad hoc data analysis. The Dremel engine is * described in depth in the paper "Dremel: Interactive Analysis of Web-Scale @@ -74,7 +74,7 @@ struct dremel_data { * * http://www.goldsborough.me/distributed-systems/2019/05/18/21-09-00-a_look_at_dremel/ * https://akshays-blog.medium.com/wrapping-head-around-repetition-and-definition-levels-in-dremel-powering-bigquery-c1a33c9695da - * https://blog.twitter.com/engineering/en_us/a/2013/dremel-made-simple-with-parquet + * https://blog.x.com/engineering/en_us/a/2013/dremel-made-simple-with-parquet * * The remainder of this documentation assumes familiarity with the Dremel concepts. 
* @@ -102,16 +102,17 @@ struct dremel_data { * ``` * We can represent it in cudf format with two level of offsets like this: * ``` - * Level 0 offsets = {0, 0, 3, 5, 6} + * Level 0 offsets = {0, 0, 3, 4} * Level 1 offsets = {0, 0, 3, 5, 5} * Values = {1, 2, 3, 4, 5} * ``` - * The desired result of this function is the repetition and definition level values that - * correspond to the data values: + * This function returns the dremel offsets, repetition levels, and definition level + * values that correspond to the data values: * ``` - * col = {[], [[], [1, 2, 3], [4, 5]], [[]]} - * def = { 0 1, 2, 2, 2, 2, 2, 1 } - * rep = { 0, 0, 0, 2, 2, 1, 2, 0 } + * col = {[], [[], [1, 2, 3], [4, 5]], [[]]} + * dremel_offsets = { 0, 1, 7, 8} + * def_levels = { 0, 1, 2, 2, 2, 2, 2, 1 } + * rep_levels = { 0, 0, 1, 2, 2, 1, 2, 0 } * ``` * * Since repetition and definition levels arrays contain a value for each empty list, the size of diff --git a/cpp/include/cudf/lists/lists_column_view.hpp b/cpp/include/cudf/lists/lists_column_view.hpp index b117a871b64..d7057cfea7e 100644 --- a/cpp/include/cudf/lists/lists_column_view.hpp +++ b/cpp/include/cudf/lists/lists_column_view.hpp @@ -48,7 +48,7 @@ class lists_column_view : private column_view { lists_column_view(column_view const& lists_column); lists_column_view(lists_column_view&&) = default; ///< Move constructor lists_column_view(lists_column_view const&) = default; ///< Copy constructor - ~lists_column_view() = default; + ~lists_column_view() override = default; /** * @brief Copy assignment operator * diff --git a/cpp/include/cudf/partitioning.hpp b/cpp/include/cudf/partitioning.hpp index 385da993262..f9a68e4fffc 100644 --- a/cpp/include/cudf/partitioning.hpp +++ b/cpp/include/cudf/partitioning.hpp @@ -70,6 +70,7 @@ enum class hash_id { * @param partition_map Non-nullable column of integer values that map each row * in `t` to it's partition. 
* @param num_partitions The total number of partitions + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table's device memory * @return Pair containing the reordered table and vector of `num_partitions + * 1` offsets to each partition such that the size of partition `i` is @@ -79,6 +80,7 @@ std::pair, std::vector> partition( table_view const& t, column_view const& partition_map, size_type num_partitions, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -242,6 +244,7 @@ std::pair, std::vector> hash_partition( * @param[in] input The input table to be round-robin partitioned * @param[in] num_partitions Number of partitions for the table * @param[in] start_partition Index of the 1st partition + * @param[in] stream CUDA stream used for device memory operations and kernel launches * @param[in] mr Device memory resource used to allocate the returned table's device memory * * @return A std::pair consisting of a unique_ptr to the partitioned table @@ -251,6 +254,7 @@ std::pair, std::vector> round_robi table_view const& input, cudf::size_type num_partitions, cudf::size_type start_partition = 0, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @} */ // end of group diff --git a/cpp/include/cudf/quantiles.hpp b/cpp/include/cudf/quantiles.hpp index f6bae170f03..f0039734519 100644 --- a/cpp/include/cudf/quantiles.hpp +++ b/cpp/include/cudf/quantiles.hpp @@ -48,6 +48,7 @@ namespace CUDF_EXPORT cudf { * ignored. * @param[in] exact If true, returns doubles. * If false, returns same type as input. + * @param[in] stream CUDA stream used for device memory operations and kernel launches * @param[in] mr Device memory resource used to allocate the returned column's device memory * @returns Column of specified quantiles, with nulls for indeterminable values @@ -59,6 +60,7 @@ std::unique_ptr quantile( interpolation interp = interpolation::LINEAR, column_view const& ordered_indices = {}, bool exact = true, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -85,6 +87,7 @@ std::unique_ptr quantile( * @param is_input_sorted Indicates if the input has been pre-sorted * @param column_order The desired sort order for each column * @param null_precedence The desired order of null compared to other elements + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table's device memory * * @returns Table of specified quantiles, with nulls for indeterminable values @@ -98,6 +101,7 @@ std::unique_ptr quantiles( cudf::sorted is_input_sorted = sorted::NO, std::vector const& column_order = {}, std::vector const& null_precedence = {}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -114,6 +118,7 @@ std::unique_ptr
quantiles( * * @param input tdigest input data. One tdigest per row * @param percentiles Desired percentiles in range [0, 1] + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device * memory * @@ -125,6 +130,7 @@ std::unique_ptr
quantiles(
 std::unique_ptr<column> percentile_approx(
   tdigest::tdigest_column_view const& input,
   column_view const& percentiles,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());

 /** @} */  // end of group
diff --git a/cpp/include/cudf/reduction/detail/reduction_operators.cuh b/cpp/include/cudf/reduction/detail/reduction_operators.cuh
index 4cf8564ab3a..5694362af8f 100644
--- a/cpp/include/cudf/reduction/detail/reduction_operators.cuh
+++ b/cpp/include/cudf/reduction/detail/reduction_operators.cuh
@@ -31,17 +31,41 @@ namespace detail {
 // intermediate data structure to compute `var`, `std`
 template <typename ResultType>
 struct var_std {
-  ResultType value;          /// the value
-  ResultType value_squared;  /// the value of squared
-
-  CUDF_HOST_DEVICE inline var_std(ResultType _value = 0, ResultType _value_squared = 0)
-    : value(_value), value_squared(_value_squared){};
+  // Uses the pairwise approach of Chan, Golub, and LeVeque,
+  // _Algorithms for computing the sample variance: analysis and
+  // recommendations_ (1983)
+  // https://doi.org/10.1080/00031305.1983.10483115
+  // Also http://www.cs.yale.edu/publications/techreports/tr222.pdf
+  // This is a modification of Youngs and Cramer's online approach.
+  ResultType running_sum;
+  ResultType running_square_deviations;
+  size_type count;
+
+  CUDF_HOST_DEVICE inline var_std(ResultType t = 0, ResultType s = 0, size_type n = 0)
+    : running_sum(t), running_square_deviations(s), count(n){};

   using this_t = var_std<ResultType>;

   CUDF_HOST_DEVICE inline this_t operator+(this_t const& rhs) const
   {
-    return this_t((this->value + rhs.value), (this->value_squared + rhs.value_squared));
+    // Updates as per equations 1.5a and 1.5b in the paper
+    // T_{1,m+n} = T_{1,m} + T_{m+1,m+n}
+    // S_{1,m+n} = S_{1,m} + S_{m+1,m+n} + m/(n(m+n)) * (n/m T_{1,m} - T_{m+1,m+n})**2
+    // Here the first m samples are in this, the remaining n samples are in rhs.
+    auto const m = this->count;
+    auto const n = rhs.count;
+    // Avoid division by zero.
+ if (m == 0) { return rhs; } + if (n == 0) { return *this; } + auto const tm = this->running_sum; + auto const tn = rhs.running_sum; + auto const sm = this->running_square_deviations; + auto const sn = rhs.running_square_deviations; + auto const tmn = tm + tn; + auto const diff = ((static_cast(n) / m) * tm) - tn; + // Computing m/n(m+n) as m/n/(m+n) to avoid integer overflow + auto const smn = sm + sn + ((static_cast(m) / n) / (m + n)) * diff * diff; + return {tmn, smn, m + n}; }; }; @@ -50,10 +74,7 @@ template struct transformer_var_std { using OutputType = var_std; - CUDF_HOST_DEVICE inline OutputType operator()(ResultType const& value) - { - return OutputType(value, value * value); - }; + CUDF_HOST_DEVICE inline OutputType operator()(ResultType const& value) { return {value, 0, 1}; }; }; // ------------------------------------------------------------------------ @@ -257,12 +278,7 @@ struct variance : public compound_op { cudf::size_type const& count, cudf::size_type const& ddof) { - ResultType mean = input.value / count; - ResultType asum = input.value_squared; - cudf::size_type div = count - ddof; - ResultType var = asum / div - ((mean * mean) * count) / div; - - return var; + return input.running_square_deviations / (count - ddof); }; }; }; diff --git a/cpp/include/cudf/round.hpp b/cpp/include/cudf/round.hpp index ba56ff34b97..158e6df7e5f 100644 --- a/cpp/include/cudf/round.hpp +++ b/cpp/include/cudf/round.hpp @@ -17,6 +17,7 @@ #pragma once #include +#include #include #include @@ -66,6 +67,7 @@ enum class rounding_method : int32_t { HALF_UP, HALF_EVEN }; * @param decimal_places Number of decimal places to round to (default 0). If negative, this * specifies the number of positions to the left of the decimal point. * @param method Rounding method + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * * @return Column with each of the values rounded @@ -74,6 +76,7 @@ std::unique_ptr round( column_view const& input, int32_t decimal_places = 0, rounding_method method = rounding_method::HALF_UP, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @} */ // end of group diff --git a/cpp/include/cudf/scalar/scalar.hpp b/cpp/include/cudf/scalar/scalar.hpp index e8a498afc09..360dde11fc0 100644 --- a/cpp/include/cudf/scalar/scalar.hpp +++ b/cpp/include/cudf/scalar/scalar.hpp @@ -16,6 +16,7 @@ #pragma once #include +#include #include #include #include @@ -47,6 +48,7 @@ namespace CUDF_EXPORT cudf { */ class scalar { public: + scalar() = delete; virtual ~scalar() = default; scalar& operator=(scalar const& other) = delete; scalar& operator=(scalar&& other) = delete; @@ -93,10 +95,8 @@ class scalar { [[nodiscard]] bool const* validity_data() const; protected: - data_type _type{type_id::EMPTY}; ///< Logical type of value in the scalar - rmm::device_scalar _is_valid; ///< Device bool signifying validity - - scalar() = delete; + data_type _type{type_id::EMPTY}; ///< Logical type of value in the scalar + cudf::detail::device_scalar _is_valid; ///< Device bool signifying validity /** * @brief Move constructor for scalar. @@ -145,6 +145,7 @@ class fixed_width_scalar : public scalar { public: using value_type = T; ///< Type of the value held by the scalar. 
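A host-side check of the merge rule implemented in `var_std::operator+`, worked on concrete numbers: merging the partitions {1, 2} and {3, 4, 5} must reproduce the sum of squared deviations of {1, ..., 5}, which is 10. This standalone sketch mirrors the device code; it is not the library API:

```cpp
#include <cassert>
#include <cmath>

struct acc { double t; double s; int n; };  // running sum, squared deviations, count

// Equations 1.5a/1.5b of Chan, Golub & LeVeque, as in var_std::operator+.
acc merge(acc a, acc b) {
  if (a.n == 0) { return b; }
  if (b.n == 0) { return a; }
  double const diff = (double(b.n) / a.n) * a.t - b.t;
  double const s = a.s + b.s + ((double(a.n) / b.n) / (a.n + b.n)) * diff * diff;
  return {a.t + b.t, s, a.n + b.n};
}

int main() {
  acc const left{3.0, 0.5, 2};    // {1,2}:   T = 3,  S = 0.5
  acc const right{12.0, 2.0, 3};  // {3,4,5}: T = 12, S = 2
  acc const all = merge(left, right);
  assert(all.n == 5 && std::abs(all.s - 10.0) < 1e-12);
  // variance with ddof = 1: S / (n - ddof) = 10 / 4 = 2.5
}
```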
+  fixed_width_scalar() = delete;
   ~fixed_width_scalar() override = default;

   /**
@@ -203,8 +204,6 @@ class fixed_width_scalar : public scalar {
  protected:
   rmm::device_scalar<T> _data;  ///< device memory containing the value

-  fixed_width_scalar() = delete;
-
   /**
    * @brief Construct a new fixed width scalar object.
    *
diff --git a/cpp/include/cudf/strings/char_types/char_types.hpp b/cpp/include/cudf/strings/char_types/char_types.hpp
index 3ebe5cb53e9..f229facca08 100644
--- a/cpp/include/cudf/strings/char_types/char_types.hpp
+++ b/cpp/include/cudf/strings/char_types/char_types.hpp
@@ -30,7 +30,7 @@ namespace strings {
  */

 /**
- * @brief Returns a boolean column identifying strings entries in which all
+ * @brief Returns a boolean column identifying string entries where all
  * characters are of the type specified.
  *
  * The output row entry will be set to false if the corresponding string element
@@ -105,7 +105,8 @@ std::unique_ptr<column> all_characters_of_type(
  * `types_to_remove` will be filtered.
  * @param mr Device memory resource used to allocate the returned column's device memory
  * @param stream CUDA stream used for device memory operations and kernel launches
- * @return New column of boolean results for each string
+ * @return New strings column with the characters of specified types filtered out and replaced by
+ *         the specified replacement string
  */
 std::unique_ptr<column> filter_characters_of_type(
   strings_column_view const& input,
diff --git a/cpp/include/cudf/strings/convert/convert_urls.hpp b/cpp/include/cudf/strings/convert/convert_urls.hpp
index d6e87f9d543..febc63d8779 100644
--- a/cpp/include/cudf/strings/convert/convert_urls.hpp
+++ b/cpp/include/cudf/strings/convert/convert_urls.hpp
@@ -28,7 +28,7 @@ namespace strings {
  */

 /**
- * @brief Decodes each string using URL encoding.
+ * @brief Encodes each string using URL encoding.
  *
  * Converts mostly non-ascii characters and control characters into UTF-8 hex code-points
 * prefixed with '%'. For example, the space character must be converted to characters '%20' where
@@ -49,7 +49,7 @@ std::unique_ptr<column> url_encode(
   rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());

 /**
- * @brief Encodes each string using URL encoding.
+ * @brief Decodes each string using URL encoding.
  *
  * Converts all character sequences starting with '%' into character code-points
 * interpreting the 2 following characters as hex values to create the code-point.
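With the two briefs now attached to the right functions, the pair composes as expected; a small round-trip sketch, where `input` is an existing `strings_column_view` (illustrative):

```cpp
#include <cudf/strings/convert/convert_urls.hpp>
#include <cudf/strings/strings_column_view.hpp>

auto encoded = cudf::strings::url_encode(input);  // "a b" -> "a%20b"
auto decoded = cudf::strings::url_decode(
  cudf::strings_column_view{encoded->view()});    // back to "a b"
```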
diff --git a/cpp/include/cudf/strings/detail/char_tables.hpp b/cpp/include/cudf/strings/detail/char_tables.hpp index 5d6aff28826..6460d4f43ff 100644 --- a/cpp/include/cudf/strings/detail/char_tables.hpp +++ b/cpp/include/cudf/strings/detail/char_tables.hpp @@ -74,9 +74,9 @@ character_cases_table_type const* get_character_cases_table(); */ struct special_case_mapping { uint16_t num_upper_chars; - uint16_t upper[3]; + uint16_t upper[3]; // NOLINT uint16_t num_lower_chars; - uint16_t lower[3]; + uint16_t lower[3]; // NOLINT }; /** diff --git a/cpp/include/cudf/strings/detail/strings_children.cuh b/cpp/include/cudf/strings/detail/strings_children.cuh index 1283226879b..de2f1770e28 100644 --- a/cpp/include/cudf/strings/detail/strings_children.cuh +++ b/cpp/include/cudf/strings/detail/strings_children.cuh @@ -17,6 +17,7 @@ #include #include +#include #include #include #include @@ -29,6 +30,8 @@ #include #include +#include +#include #include #include @@ -38,6 +41,62 @@ namespace cudf { namespace strings { namespace detail { +/** + * @brief Gather characters to create a strings column using the given string-index pair iterator + * + * @tparam IndexPairIterator iterator over type `pair` values + * + * @param offsets The offsets for the output strings column + * @param chars_size The size (in bytes) of the chars data + * @param begin Iterator to the first string-index pair + * @param strings_count The number of strings + * @param stream CUDA stream used for device memory operations + * @param mr Device memory resource used to allocate the returned column's device memory + * @return An array of chars gathered from the input string-index pair iterator + */ +template +rmm::device_uvector make_chars_buffer(column_view const& offsets, + int64_t chars_size, + IndexPairIterator begin, + size_type strings_count, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + auto chars_data = rmm::device_uvector(chars_size, stream, mr); + auto const d_offsets = cudf::detail::offsetalator_factory::make_input_iterator(offsets); + + auto const src_ptrs = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + cuda::proclaim_return_type([begin] __device__(uint32_t idx) { + // Due to a bug in cub (https://github.com/NVIDIA/cccl/issues/586), + // we have to use `const_cast` to remove `const` qualifier from the source pointer. + // This should be fine as long as we only read but not write anything to the source. 
+ return reinterpret_cast(const_cast(begin[idx].first)); + })); + auto const src_sizes = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + cuda::proclaim_return_type( + [begin] __device__(uint32_t idx) { return begin[idx].second; })); + auto const dst_ptrs = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + cuda::proclaim_return_type([offsets = d_offsets, output = chars_data.data()] __device__( + uint32_t idx) { return output + offsets[idx]; })); + + size_t temp_storage_bytes = 0; + CUDF_CUDA_TRY(cub::DeviceMemcpy::Batched( + nullptr, temp_storage_bytes, src_ptrs, dst_ptrs, src_sizes, strings_count, stream.value())); + rmm::device_buffer d_temp_storage(temp_storage_bytes, stream); + CUDF_CUDA_TRY(cub::DeviceMemcpy::Batched(d_temp_storage.data(), + temp_storage_bytes, + src_ptrs, + dst_ptrs, + src_sizes, + strings_count, + stream.value())); + + return chars_data; +} + /** * @brief Create an offsets column to be a child of a compound column * diff --git a/cpp/include/cudf/strings/detail/strings_column_factories.cuh b/cpp/include/cudf/strings/detail/strings_column_factories.cuh index 6b1b453a752..03240f418fe 100644 --- a/cpp/include/cudf/strings/detail/strings_column_factories.cuh +++ b/cpp/include/cudf/strings/detail/strings_column_factories.cuh @@ -49,16 +49,6 @@ namespace detail { */ using string_index_pair = thrust::pair; -/** - * @brief Average string byte-length threshold for deciding character-level - * vs. row-level parallel algorithm. - * - * This value was determined by running the factory_benchmark against different - * string lengths and observing the point where the performance is faster for - * long strings. - */ -constexpr size_type FACTORY_BYTES_PER_ROW_THRESHOLD = 64; - /** * @brief Create a strings-type column from iterators of pointer/size pairs * @@ -88,8 +78,6 @@ std::unique_ptr make_strings_column(IndexPairIterator begin, auto offsets_transformer_itr = thrust::make_transform_iterator(begin, offsets_transformer); auto [offsets_column, bytes] = cudf::strings::detail::make_offsets_child_column( offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr); - auto const d_offsets = - cudf::detail::offsetalator_factory::make_input_iterator(offsets_column->view()); // create null mask auto validator = [] __device__(string_index_pair const item) { return item.first != nullptr; }; @@ -99,38 +87,8 @@ std::unique_ptr make_strings_column(IndexPairIterator begin, (null_count > 0) ? 
std::move(new_nulls.first) : rmm::device_buffer{0, stream, mr}; // build chars column - auto chars_data = [d_offsets, bytes = bytes, begin, strings_count, null_count, stream, mr] { - auto const avg_bytes_per_row = bytes / std::max(strings_count - null_count, 1); - // use a character-parallel kernel for long string lengths - if (avg_bytes_per_row > FACTORY_BYTES_PER_ROW_THRESHOLD) { - auto const str_begin = thrust::make_transform_iterator( - begin, cuda::proclaim_return_type([] __device__(auto ip) { - return string_view{ip.first, ip.second}; - })); - - return gather_chars(str_begin, - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(strings_count), - d_offsets, - bytes, - stream, - mr); - } else { - // this approach is 2-3x faster for a large number of smaller string lengths - auto chars_data = rmm::device_uvector(bytes, stream, mr); - auto d_chars = chars_data.data(); - auto copy_chars = [d_chars] __device__(auto item) { - string_index_pair const str = thrust::get<0>(item); - int64_t const offset = thrust::get<1>(item); - if (str.first != nullptr) memcpy(d_chars + offset, str.first, str.second); - }; - thrust::for_each_n(rmm::exec_policy(stream), - thrust::make_zip_iterator(thrust::make_tuple(begin, d_offsets)), - strings_count, - copy_chars); - return chars_data; - } - }(); + auto chars_data = + make_chars_buffer(offsets_column->view(), bytes, begin, strings_count, stream, mr); return make_strings_column(strings_count, std::move(offsets_column), diff --git a/cpp/include/cudf/strings/find_multiple.hpp b/cpp/include/cudf/strings/find_multiple.hpp index 1fe446db8da..e090766dd07 100644 --- a/cpp/include/cudf/strings/find_multiple.hpp +++ b/cpp/include/cudf/strings/find_multiple.hpp @@ -28,8 +28,42 @@ namespace strings { */ /** - * @brief Returns a lists column with character position values where each - * of the target strings are found in each string. + * @brief Searches for the given target strings within each string in the provided column + * + * Each column in the result table corresponds to the result for the target string at the same + * ordinal. i.e. 0th column is the BOOL8 column result for the 0th target string, 1st for 1st, + * etc. + * + * If the target is not found for a string, false is returned for that entry in the output column. + * If the target is an empty string, true is returned for all non-null entries in the output column. + * + * Any null input strings return corresponding null entries in the output columns. + * + * @code{.pseudo} + * input = ["a", "b", "c"] + * targets = ["a", "c"] + * output is a table with two boolean columns: + * column 0: [true, false, false] + * column 1: [false, false, true] + * @endcode + * + * @throw std::invalid_argument if `targets` is empty or contains nulls + * + * @param input Strings instance for this operation + * @param targets UTF-8 encoded strings to search for in each string in `input` + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return Table of BOOL8 columns + */ +std::unique_ptr
contains_multiple( + strings_column_view const& input, + strings_column_view const& targets, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Searches for the given target strings within each string in the provided column + * and returns the position the targets were found * * The size of the output column is `input.size()`. * Each row of the output column is of size `targets.size()`. @@ -45,7 +79,7 @@ namespace strings { * [-1,-1, 1 ]} // for "def": "a" and "b" not found, "e" at pos 1 * @endcode * - * @throw cudf::logic_error if `targets` is empty or contains nulls + * @throw std::invalid_argument if `targets` is empty or contains nulls * * @param input Strings instance for this operation * @param targets Strings to search for in each string diff --git a/cpp/include/cudf/strings/findall.hpp b/cpp/include/cudf/strings/findall.hpp index c6b9bc7e58a..867764b6d9a 100644 --- a/cpp/include/cudf/strings/findall.hpp +++ b/cpp/include/cudf/strings/findall.hpp @@ -66,6 +66,35 @@ std::unique_ptr findall( rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); +/** + * @brief Returns the starting character index of the first match for the given pattern + * in each row of the input column + * + * @code{.pseudo} + * Example: + * s = ["bunny", "rabbit", "hare", "dog"] + * p = regex_program::create("[be]") + * r = find_re(s, p) + * r is now [0, 2, 3, -1] + * @endcode + * + * A null output row occurs if the corresponding input row is null. + * A -1 is returned for rows that do not contain a match. + * + * See the @ref md_regex "Regex Features" page for details on patterns supported by this API. + * + * @param input Strings instance for this operation + * @param prog Regex program instance + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New column of integers + */ +std::unique_ptr find_re( + strings_column_view const& input, + regex_program const& prog, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); + /** @} */ // end of doxygen group } // namespace strings } // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/strings/regex/regex_program.hpp b/cpp/include/cudf/strings/regex/regex_program.hpp index 9da859d9c87..1bf1c26f471 100644 --- a/cpp/include/cudf/strings/regex/regex_program.hpp +++ b/cpp/include/cudf/strings/regex/regex_program.hpp @@ -54,6 +54,8 @@ struct regex_program { regex_flags flags = regex_flags::DEFAULT, capture_groups capture = capture_groups::EXTRACT); + regex_program() = delete; + /** * @brief Move constructor * @@ -115,8 +117,6 @@ struct regex_program { ~regex_program(); private: - regex_program() = delete; - std::string _pattern; regex_flags _flags; capture_groups _capture; diff --git a/cpp/include/cudf/strings/string_view.cuh b/cpp/include/cudf/strings/string_view.cuh index 14695c3bb27..34ed3c5618e 100644 --- a/cpp/include/cudf/strings/string_view.cuh +++ b/cpp/include/cudf/strings/string_view.cuh @@ -99,7 +99,7 @@ __device__ inline std::pair bytes_to_character_position(st * values. Also, this char pointer serves as valid device pointer of identity * value for minimum operator on string values. 
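A usage sketch for the new `find_re` API, reusing the values from its doc comment; `input` is an existing `strings_column_view` (illustrative):

```cpp
#include <cudf/strings/findall.hpp>
#include <cudf/strings/regex/regex_program.hpp>

// input = ["bunny", "rabbit", "hare", "dog"]
auto prog    = cudf::strings::regex_program::create("[be]");
auto matches = cudf::strings::find_re(input, *prog);
// matches holds the first match position per row: [0, 2, 3, -1]
```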
*/ -static __constant__ char max_string_sentinel[5]{"\xF7\xBF\xBF\xBF"}; +static __constant__ char max_string_sentinel[5]{"\xF7\xBF\xBF\xBF"}; // NOLINT } // namespace detail } // namespace strings @@ -283,14 +283,11 @@ __device__ inline size_type string_view::const_iterator::position() const { retu __device__ inline size_type string_view::const_iterator::byte_offset() const { return byte_pos; } -__device__ inline string_view::const_iterator string_view::begin() const -{ - return const_iterator(*this, 0, 0); -} +__device__ inline string_view::const_iterator string_view::begin() const { return {*this, 0, 0}; } __device__ inline string_view::const_iterator string_view::end() const { - return const_iterator(*this, length(), size_bytes()); + return {*this, length(), size_bytes()}; } // @endcond @@ -411,7 +408,7 @@ __device__ inline size_type string_view::find(char const* str, __device__ inline size_type string_view::find(char_utf8 chr, size_type pos, size_type count) const { - char str[sizeof(char_utf8)]; + char str[sizeof(char_utf8)]; // NOLINT size_type chwidth = strings::detail::from_char_utf8(chr, str); return find(str, chwidth, pos, count); } @@ -433,7 +430,7 @@ __device__ inline size_type string_view::rfind(char const* str, __device__ inline size_type string_view::rfind(char_utf8 chr, size_type pos, size_type count) const { - char str[sizeof(char_utf8)]; + char str[sizeof(char_utf8)]; // NOLINT size_type chwidth = strings::detail::from_char_utf8(chr, str); return rfind(str, chwidth, pos, count); } diff --git a/cpp/include/cudf/strings/strings_column_view.hpp b/cpp/include/cudf/strings/strings_column_view.hpp index 4a2512eb7c5..6ec8d1238d6 100644 --- a/cpp/include/cudf/strings/strings_column_view.hpp +++ b/cpp/include/cudf/strings/strings_column_view.hpp @@ -45,7 +45,7 @@ class strings_column_view : private column_view { strings_column_view(column_view strings_column); strings_column_view(strings_column_view&&) = default; ///< Move constructor strings_column_view(strings_column_view const&) = default; ///< Copy constructor - ~strings_column_view() = default; + ~strings_column_view() override = default; /** * @brief Copy assignment operator * diff --git a/cpp/include/cudf/structs/structs_column_view.hpp b/cpp/include/cudf/structs/structs_column_view.hpp index 19798f51656..91d7ddce955 100644 --- a/cpp/include/cudf/structs/structs_column_view.hpp +++ b/cpp/include/cudf/structs/structs_column_view.hpp @@ -42,7 +42,7 @@ class structs_column_view : public column_view { // Foundation members: structs_column_view(structs_column_view const&) = default; ///< Copy constructor structs_column_view(structs_column_view&&) = default; ///< Move constructor - ~structs_column_view() = default; + ~structs_column_view() override = default; /** * @brief Copy assignment operator * diff --git a/cpp/include/cudf/table/table.hpp b/cpp/include/cudf/table/table.hpp index 762131a174f..15fdad21d9f 100644 --- a/cpp/include/cudf/table/table.hpp +++ b/cpp/include/cudf/table/table.hpp @@ -148,7 +148,7 @@ class table { std::vector columns(std::distance(begin, end)); std::transform( begin, end, columns.begin(), [this](auto index) { return _columns.at(index)->view(); }); - return table_view(columns); + return table_view{columns}; } /** diff --git a/cpp/include/cudf/table/table_view.hpp b/cpp/include/cudf/table/table_view.hpp index 4a990f67ce4..d41176590ea 100644 --- a/cpp/include/cudf/table/table_view.hpp +++ b/cpp/include/cudf/table/table_view.hpp @@ -241,7 +241,7 @@ class table_view : public detail::table_view_base { { 
std::vector columns(std::distance(begin, end)); std::transform(begin, end, columns.begin(), [this](auto index) { return this->column(index); }); - return table_view(columns); + return table_view{columns}; } /** diff --git a/cpp/include/cudf/tdigest/tdigest_column_view.hpp b/cpp/include/cudf/tdigest/tdigest_column_view.hpp index 2f19efa5630..da4954b859c 100644 --- a/cpp/include/cudf/tdigest/tdigest_column_view.hpp +++ b/cpp/include/cudf/tdigest/tdigest_column_view.hpp @@ -59,7 +59,7 @@ class tdigest_column_view : private column_view { tdigest_column_view(column_view const&); ///< Construct tdigest_column_view from a column_view tdigest_column_view(tdigest_column_view&&) = default; ///< Move constructor tdigest_column_view(tdigest_column_view const&) = default; ///< Copy constructor - ~tdigest_column_view() = default; + ~tdigest_column_view() override = default; /** * @brief Copy assignment operator * diff --git a/cpp/include/cudf/transpose.hpp b/cpp/include/cudf/transpose.hpp index 8b680071e71..a32491bc7bd 100644 --- a/cpp/include/cudf/transpose.hpp +++ b/cpp/include/cudf/transpose.hpp @@ -36,14 +36,16 @@ namespace CUDF_EXPORT cudf { * @throw cudf::logic_error if column types are non-homogeneous * @throw cudf::logic_error if column types are non-fixed-width * - * @param[in] input A table (M cols x N rows) to be transposed - * @param[in] mr Device memory resource used to allocate the device memory of returned value - * @return The transposed input (N cols x M rows) as a `column` and - * `table_view`, representing the owner and transposed table, - * respectively. + * @param[in] input A table (M cols x N rows) to be transposed + * @param[in] stream CUDA stream used for device memory operations and kernel launches + * @param[in] mr Device memory resource used to allocate the device memory of returned value + * @return The transposed input (N cols x M rows) as a `column` and + * `table_view`, representing the owner and transposed table, + * respectively. */ std::pair, table_view> transpose( table_view const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @} */ // end of group diff --git a/cpp/include/cudf/unary.hpp b/cpp/include/cudf/unary.hpp index 53e0f3a15d2..046e9745a71 100644 --- a/cpp/include/cudf/unary.hpp +++ b/cpp/include/cudf/unary.hpp @@ -16,8 +16,8 @@ #pragma once +#include #include -#include #include #include #include diff --git a/cpp/include/cudf/utilities/default_stream.hpp b/cpp/include/cudf/utilities/default_stream.hpp index 97a42243250..3e740b81cc9 100644 --- a/cpp/include/cudf/utilities/default_stream.hpp +++ b/cpp/include/cudf/utilities/default_stream.hpp @@ -16,10 +16,8 @@ #pragma once -#include #include -#include #include namespace CUDF_EXPORT cudf { diff --git a/cpp/include/cudf/utilities/logger.hpp b/cpp/include/cudf/utilities/logger.hpp index 45d5d1b12e1..982554a23f5 100644 --- a/cpp/include/cudf/utilities/logger.hpp +++ b/cpp/include/cudf/utilities/logger.hpp @@ -22,6 +22,10 @@ namespace CUDF_EXPORT cudf { +namespace detail { +spdlog::logger& logger(); +} + /** * @brief Returns the global logger. * @@ -43,6 +47,8 @@ namespace CUDF_EXPORT cudf { * * @return spdlog::logger& The logger. 
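Since `transpose` now accepts an explicit stream (hunk above), callers can schedule the copy off the default stream. A hedged sketch; `input` is assumed to be a homogeneous, fixed-width table_view:

```cpp
#include <cudf/transpose.hpp>

// Returns {owner, view}: the column owns the transposed (N x M) data and
// the table_view is a non-owning window over it.
auto transpose_on(cudf::table_view const& input, rmm::cuda_stream_view stream)
{
  return cudf::transpose(input, stream);
}
```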
*/ -spdlog::logger& logger(); +[[deprecated( + "Support for direct access to spdlog loggers in cudf is planned for removal")]] spdlog::logger& +logger(); } // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/utilities/span.hpp b/cpp/include/cudf/utilities/span.hpp index 0daebc0dd8d..21ee4fa9e9b 100644 --- a/cpp/include/cudf/utilities/span.hpp +++ b/cpp/include/cudf/utilities/span.hpp @@ -180,18 +180,6 @@ class span_base { return Derived(_data + _size - count, count); } - /** - * @brief Obtains a span that is a view over the `count` elements of this span starting at offset - * - * @param offset The offset of the first element in the subspan - * @param count The number of elements in the subspan - * @return A subspan of the sequence, of requested count and offset - */ - [[nodiscard]] constexpr Derived subspan(size_type offset, size_type count) const noexcept - { - return Derived(_data + offset, count); - } - private: pointer _data{nullptr}; size_type _size{0}; @@ -234,28 +222,37 @@ struct host_span : public cudf::detail::span_base::value && - std::is_convertible_v().data()))> (*)[], - T (*)[]>>* = nullptr> + template ::value && + std::is_convertible_v< + std::remove_pointer_t().data()))> (*)[], + T (*)[]>>* = nullptr> // NOLINT constexpr host_span(C& in) : base(thrust::raw_pointer_cast(in.data()), in.size()) { } /// Constructor from const container /// @param in The container to construct the span from - template < - typename C, - // Only supported containers of types convertible to T - std::enable_if_t::value && - std::is_convertible_v().data()))> (*)[], - T (*)[]>>* = nullptr> + template ::value && + std::is_convertible_v< + std::remove_pointer_t().data()))> (*)[], + T (*)[]>>* = nullptr> // NOLINT constexpr host_span(C const& in) : base(thrust::raw_pointer_cast(in.data()), in.size()) { } @@ -264,7 +261,7 @@ struct host_span : public cudf::detail::span_base>* = nullptr> + std::enable_if_t>* = nullptr> // NOLINT constexpr host_span(cudf::detail::host_vector& in) : base(in.data(), in.size()), _is_device_accessible{in.get_allocator().is_device_accessible()} { @@ -274,7 +271,7 @@ struct host_span : public cudf::detail::span_base>* = nullptr> + std::enable_if_t>* = nullptr> // NOLINT constexpr host_span(cudf::detail::host_vector const& in) : base(in.data(), in.size()), _is_device_accessible{in.get_allocator().is_device_accessible()} { @@ -285,10 +282,10 @@ struct host_span : public cudf::detail::span_base, + std::is_convertible_v, // NOLINT void>* = nullptr> constexpr host_span(host_span const& other) noexcept - : base(other.data(), other.size()) + : base(other.data(), other.size()), _is_device_accessible{other.is_device_accessible()} { } @@ -299,6 +296,19 @@ struct host_span : public cudf::detail::span_basedata() + offset, count, _is_device_accessible}; + } + private: bool _is_device_accessible{false}; }; @@ -333,26 +343,26 @@ struct device_span : public cudf::detail::span_base::value && - std::is_convertible_v().data()))> (*)[], - T (*)[]>>* = nullptr> + template ::value && + std::is_convertible_v< + std::remove_pointer_t().data()))> (*)[], + T (*)[]>>* = nullptr> // NOLINT constexpr device_span(C& in) : base(thrust::raw_pointer_cast(in.data()), in.size()) { } /// Constructor from const container /// @param in The container to construct the span from - template < - typename C, - // Only supported containers of types convertible to T - std::enable_if_t::value && - std::is_convertible_v().data()))> (*)[], - T (*)[]>>* = nullptr> + template ::value && + std::is_convertible_v< + 
std::remove_pointer_t().data()))> (*)[], + T (*)[]>>* = nullptr> // NOLINT constexpr device_span(C const& in) : base(thrust::raw_pointer_cast(in.data()), in.size()) { } @@ -362,12 +372,25 @@ struct device_span : public cudf::detail::span_base, + std::is_convertible_v, // NOLINT void>* = nullptr> constexpr device_span(device_span const& other) noexcept : base(other.data(), other.size()) { } + + /** + * @brief Obtains a span that is a view over the `count` elements of this span starting at offset + * + * @param offset The offset of the first element in the subspan + * @param count The number of elements in the subspan + * @return A subspan of the sequence, of requested count and offset + */ + [[nodiscard]] constexpr device_span subspan(typename base::size_type offset, + typename base::size_type count) const noexcept + { + return device_span{this->data() + offset, count}; + } }; /** @} */ // end of group @@ -386,42 +409,38 @@ class base_2dspan { constexpr base_2dspan() noexcept = default; /** - * @brief Constructor a 2D span + * @brief Constructor from a span and number of elements in each row. * - * @param data Pointer to the data - * @param rows Number of rows + * @param flat_view The flattened 2D span * @param columns Number of columns */ - constexpr base_2dspan(T* data, size_t rows, size_t columns) noexcept - : _data{data}, _size{rows, columns} + constexpr base_2dspan(RowType flat_view, size_t columns) + : _flat{flat_view}, _size{columns == 0 ? 0 : flat_view.size() / columns, columns} { + CUDF_EXPECTS(_size.first * _size.second == flat_view.size(), "Invalid 2D span size"); } - /** - * @brief Constructor a 2D span - * - * @param data Pointer to the data - * @param size Size of the 2D span as pair - */ - base_2dspan(T* data, size_type size) noexcept : _data{data}, _size{std::move(size)} {} /** * @brief Returns a pointer to the beginning of the sequence. * * @return A pointer to the first element of the span */ - constexpr auto data() const noexcept { return _data; } + [[nodiscard]] constexpr auto data() const noexcept { return _flat.data(); } + /** * @brief Returns the size in the span as pair. * * @return pair representing rows and columns size of the span */ - constexpr auto size() const noexcept { return _size; } + [[nodiscard]] constexpr auto size() const noexcept { return _size; } + /** * @brief Returns the number of elements in the span. * * @return Number of elements in the span */ - constexpr auto count() const noexcept { return size().first * size().second; } + [[nodiscard]] constexpr auto count() const noexcept { return _flat.size(); } + /** * @brief Checks if the span is empty. * @@ -429,19 +448,6 @@ class base_2dspan { */ [[nodiscard]] constexpr bool is_empty() const noexcept { return count() == 0; } - /** - * @brief Returns flattened index of the element at the specified 2D position. - * - * @param row The row index - * @param column The column index - * @param size The size of the 2D span as pair - * @return The flattened index of the element at the specified 2D position - */ - static constexpr size_t flatten_index(size_t row, size_t column, size_type size) noexcept - { - return row * size.second + column; - } - /** * @brief Returns a reference to the row-th element of the sequence. * @@ -453,41 +459,7 @@ class base_2dspan { */ constexpr RowType operator[](size_t row) const { - return {this->data() + flatten_index(row, 0, this->size()), this->size().second}; - } - - /** - * @brief Returns a reference to the first element in the span. 
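The point of moving `subspan` from `span_base` into `host_span`/`device_span` themselves (hunks above) is that slicing now preserves the derived type and its metadata rather than decaying to the base class. A small sketch of the resulting behavior:

```cpp
#include <cudf/utilities/span.hpp>

void slice_demo(cudf::host_span<int const> s)
{
  // `head` is still a cudf::host_span<int const>, so the
  // device-accessibility flag survives the slice.
  auto const head = s.subspan(0, s.size() / 2);
  bool const same = head.is_device_accessible() == s.is_device_accessible();  // true
  (void)same;
}
```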
- * - * Calling front() on an empty span results in undefined behavior. - * - * @return Reference to the first element in the span - */ - [[nodiscard]] constexpr RowType front() const { return (*this)[0]; } - /** - * @brief Returns a reference to the last element in the span. - * - * Calling back() on an empty span results in undefined behavior. - * - * @return Reference to the last element in the span - */ - [[nodiscard]] constexpr RowType back() const - { - return (*this)[size().first - 1]; - } - - /** - * @brief Obtains a 2D span that is a view over the `num_rows` rows of this span starting at - * `first_row` - * - * @param first_row The first row in the subspan - * @param num_rows The number of rows in the subspan - * @return A subspan of the sequence, of requested starting `first_row` and `num_rows` - */ - constexpr base_2dspan subspan(size_t first_row, size_t num_rows) const noexcept - { - return base_2dspan( - _data + flatten_index(first_row, 0, this->size()), num_rows, this->size().second); + return _flat.subspan(row * _size.second, _size.second); } /** @@ -495,10 +467,7 @@ class base_2dspan { * * @return A flattened span of the 2D span */ - constexpr RowType flat_view() - { - return {this->data(), this->size().first * this->size().second}; - } + [[nodiscard]] constexpr RowType flat_view() const { return _flat; } /** * @brief Construct a 2D span from another 2D span of convertible type @@ -514,13 +483,13 @@ class base_2dspan { RowType>, void>* = nullptr> constexpr base_2dspan(base_2dspan const& other) noexcept - : _data{other.data()}, _size{other.size()} + : _flat{other.flat_view()}, _size{other.size()} { } protected: - T* _data = nullptr; ///< pointer to the first element - size_type _size{0, 0}; ///< rows, columns + RowType _flat; ///< flattened 2D span + size_type _size{0, 0}; ///< num rows, num columns }; /** diff --git a/cpp/include/cudf/utilities/traits.hpp b/cpp/include/cudf/utilities/traits.hpp index 3f37ae02151..22a67ca049a 100644 --- a/cpp/include/cudf/utilities/traits.hpp +++ b/cpp/include/cudf/utilities/traits.hpp @@ -22,8 +22,6 @@ #include #include -#include - namespace CUDF_EXPORT cudf { /** @@ -303,6 +301,30 @@ constexpr inline bool is_integral_not_bool() */ bool is_integral_not_bool(data_type type); +/** + * @brief Indicates whether the type `T` is a numeric type but not bool type. + * + * @tparam T The type to verify + * @return true `T` is numeric but not bool + * @return false `T` is not numeric or is bool + */ +template +constexpr inline bool is_numeric_not_bool() +{ + return cudf::is_numeric() and not std::is_same_v; +} + +/** + * @brief Indicates whether `type` is a numeric `data_type` but not BOOL8 + * + * "Numeric" types are integral/floating point types such as `INT*` or `FLOAT*`. + * + * @param type The `data_type` to verify + * @return true `type` is numeric but not bool + * @return false `type` is not numeric or is bool + */ +bool is_numeric_not_bool(data_type type); + /** * @brief Indicates whether the type `T` is a floating point type. 
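The new `is_numeric_not_bool` trait above parallels the existing `is_integral_not_bool` pair; a brief sketch of both forms, taken directly from the declarations in the hunk:

```cpp
#include <cudf/utilities/traits.hpp>

// Compile-time form for templated code paths.
static_assert(cudf::is_numeric_not_bool<int32_t>());
static_assert(cudf::is_numeric_not_bool<double>());
static_assert(not cudf::is_numeric_not_bool<bool>());

// Runtime form when only a data_type is in hand.
bool dispatchable(cudf::data_type t) { return cudf::is_numeric_not_bool(t); }
```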
* diff --git a/cpp/include/cudf/utilities/type_dispatcher.hpp b/cpp/include/cudf/utilities/type_dispatcher.hpp index 15b5f921c1b..6351a84e38f 100644 --- a/cpp/include/cudf/utilities/type_dispatcher.hpp +++ b/cpp/include/cudf/utilities/type_dispatcher.hpp @@ -16,7 +16,6 @@ #pragma once -#include #include #include #include diff --git a/cpp/include/cudf_test/tdigest_utilities.cuh b/cpp/include/cudf_test/tdigest_utilities.cuh index 1758790cd64..c259d61060b 100644 --- a/cpp/include/cudf_test/tdigest_utilities.cuh +++ b/cpp/include/cudf_test/tdigest_utilities.cuh @@ -270,8 +270,8 @@ void tdigest_simple_all_nulls_aggregation(Func op) static_cast(values).type(), tdigest_gen{}, op, values, delta); // NOTE: an empty tdigest column still has 1 row. - auto expected = cudf::tdigest::detail::make_empty_tdigest_column( - cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + auto expected = cudf::tdigest::detail::make_empty_tdigests_column( + 1, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, *expected); } @@ -562,12 +562,12 @@ template void tdigest_merge_empty(MergeFunc merge_op) { // 3 empty tdigests all in the same group - auto a = cudf::tdigest::detail::make_empty_tdigest_column( - cudf::get_default_stream(), cudf::get_current_device_resource_ref()); - auto b = cudf::tdigest::detail::make_empty_tdigest_column( - cudf::get_default_stream(), cudf::get_current_device_resource_ref()); - auto c = cudf::tdigest::detail::make_empty_tdigest_column( - cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + auto a = cudf::tdigest::detail::make_empty_tdigests_column( + 1, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + auto b = cudf::tdigest::detail::make_empty_tdigests_column( + 1, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + auto c = cudf::tdigest::detail::make_empty_tdigests_column( + 1, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); std::vector cols; cols.push_back(*a); cols.push_back(*b); @@ -577,8 +577,8 @@ void tdigest_merge_empty(MergeFunc merge_op) auto const delta = 1000; auto result = merge_op(*values, delta); - auto expected = cudf::tdigest::detail::make_empty_tdigest_column( - cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + auto expected = cudf::tdigest::detail::make_empty_tdigests_column( + 1, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected, *result); } diff --git a/cpp/include/cudf_test/testing_main.hpp b/cpp/include/cudf_test/testing_main.hpp index 272c91133f8..2bd08f410e0 100644 --- a/cpp/include/cudf_test/testing_main.hpp +++ b/cpp/include/cudf_test/testing_main.hpp @@ -16,6 +16,7 @@ #pragma once +#include #include #include @@ -36,6 +37,12 @@ namespace CUDF_EXPORT cudf { namespace test { +struct config { + std::string rmm_mode; + std::string stream_mode; + std::string stream_error_mode; +}; + /// MR factory functions inline auto make_cuda() { return std::make_shared(); } @@ -157,10 +164,9 @@ inline auto parse_cudf_test_opts(int argc, char** argv) * @param cmd_opts Command line options returned by parse_cudf_test_opts * @return Memory resource adaptor */ -inline auto make_memory_resource_adaptor(cxxopts::ParseResult const& cmd_opts) +inline auto make_memory_resource_adaptor(cudf::test::config const& config) { - auto const rmm_mode = cmd_opts["rmm_mode"].as(); - auto resource = cudf::test::create_memory_resource(rmm_mode); + auto resource = 
cudf::test::create_memory_resource(config.rmm_mode); cudf::set_current_device_resource(resource.get()); return resource; } @@ -176,37 +182,54 @@ inline auto make_memory_resource_adaptor(cxxopts::ParseResult const& cmd_opts) * @param cmd_opts Command line options returned by parse_cudf_test_opts * @return Memory resource adaptor */ -inline auto make_stream_mode_adaptor(cxxopts::ParseResult const& cmd_opts) +inline auto make_stream_mode_adaptor(cudf::test::config const& config) { auto resource = cudf::get_current_device_resource_ref(); - auto const stream_mode = cmd_opts["stream_mode"].as<std::string>(); - auto const stream_error_mode = cmd_opts["stream_error_mode"].as<std::string>(); - auto const error_on_invalid_stream = (stream_error_mode == "error"); - auto const check_default_stream = (stream_mode == "new_cudf_default"); + auto const error_on_invalid_stream = (config.stream_error_mode == "error"); + auto const check_default_stream = (config.stream_mode == "new_cudf_default"); auto adaptor = cudf::test::stream_checking_resource_adaptor( resource, error_on_invalid_stream, check_default_stream); - if ((stream_mode == "new_cudf_default") || (stream_mode == "new_testing_default")) { + if ((config.stream_mode == "new_cudf_default") || (config.stream_mode == "new_testing_default")) { cudf::set_current_device_resource(&adaptor); } return adaptor; } +/** + * @brief Should be called in every test program that uses rmm allocators since it maintains the + * lifespan of the rmm default memory resource. This function parses the command line to customize + * test behavior, like the allocation mode used for creating the default memory resource. + * + */ +inline void init_cudf_test(int argc, char** argv, cudf::test::config const& config_override = {}) +{ + // static lifetime to keep rmm resource alive till tests end + auto const cmd_opts = parse_cudf_test_opts(argc, argv); + cudf::test::config config = config_override; + if (config.rmm_mode.empty()) { config.rmm_mode = cmd_opts["rmm_mode"].as<std::string>(); } + + if (config.stream_mode.empty()) { + config.stream_mode = cmd_opts["stream_mode"].as<std::string>(); + } + + if (config.stream_error_mode.empty()) { + config.stream_error_mode = cmd_opts["stream_error_mode"].as<std::string>(); + } + + [[maybe_unused]] static auto mr = make_memory_resource_adaptor(config); + [[maybe_unused]] static auto adaptor = make_stream_mode_adaptor(config); +} + /** * @brief Macro that defines main function for gtest programs that use rmm * - * Should be included in every test program that uses rmm allocators since - * it maintains the lifespan of the rmm default memory resource. * This `main` function is a wrapper around the google test generated `main`, - * maintaining the original functionality. In addition, this custom `main` - * function parses the command line to customize test behavior, like the - * allocation mode used for creating the default memory resource. + * maintaining the original functionality. 
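With `init_cudf_test` factored out of the macro above, a test program can pin part of the configuration in code while leaving the rest to the command line. A sketch of such a custom main; the "pool" choice is an illustrative value, not a requirement:

```cpp
#include <cudf_test/testing_main.hpp>

int main(int argc, char** argv)
{
  ::testing::InitGoogleTest(&argc, argv);
  cudf::test::config cfg{};
  cfg.rmm_mode = "pool";  // fixed here; stream modes still come from argv
  cudf::test::init_cudf_test(argc, argv, cfg);
  return RUN_ALL_TESTS();
}
```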
*/ -#define CUDF_TEST_PROGRAM_MAIN() \ - int main(int argc, char** argv) \ - { \ - ::testing::InitGoogleTest(&argc, argv); \ - auto const cmd_opts = parse_cudf_test_opts(argc, argv); \ - [[maybe_unused]] auto mr = make_memory_resource_adaptor(cmd_opts); \ - [[maybe_unused]] auto adaptor = make_stream_mode_adaptor(cmd_opts); \ - return RUN_ALL_TESTS(); \ +#define CUDF_TEST_PROGRAM_MAIN() \ + int main(int argc, char** argv) \ + { \ + ::testing::InitGoogleTest(&argc, argv); \ + init_cudf_test(argc, argv); \ + return RUN_ALL_TESTS(); \ } diff --git a/cpp/include/cudf_test/type_list_utilities.hpp b/cpp/include/cudf_test/type_list_utilities.hpp index 1793a8ecce0..3c96c59f0b7 100644 --- a/cpp/include/cudf_test/type_list_utilities.hpp +++ b/cpp/include/cudf_test/type_list_utilities.hpp @@ -414,9 +414,8 @@ struct RemoveIfImpl> { template struct RemoveIfImpl> { - using type = - Concat::value, Types<>, Types>::type, - typename RemoveIfImpl>::type>; + using type = Concat::value, Types<>, Types>, + typename RemoveIfImpl>::type>; }; // @endcond diff --git a/cpp/include/nvtext/edit_distance.hpp b/cpp/include/nvtext/edit_distance.hpp index 723ba310a1e..dca590baebf 100644 --- a/cpp/include/nvtext/edit_distance.hpp +++ b/cpp/include/nvtext/edit_distance.hpp @@ -57,7 +57,7 @@ namespace CUDF_EXPORT nvtext { * @param targets Strings to compute edit distance against `input` * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory - * @return New strings columns of with replaced strings + * @return New lists column of edit distance values */ std::unique_ptr edit_distance( cudf::strings_column_view const& input, diff --git a/cpp/include/nvtext/minhash.hpp b/cpp/include/nvtext/minhash.hpp index 7c909f1a948..b2c1a23f57e 100644 --- a/cpp/include/nvtext/minhash.hpp +++ b/cpp/include/nvtext/minhash.hpp @@ -41,6 +41,8 @@ namespace CUDF_EXPORT nvtext { * * This function uses MurmurHash3_x86_32 for the hash algorithm. * + * @deprecated Deprecated in 24.12 + * * @throw std::invalid_argument if the width < 2 * * @param input Strings column to compute minhash @@ -51,7 +53,7 @@ namespace CUDF_EXPORT nvtext { * @param mr Device memory resource used to allocate the returned column's device memory * @return Minhash values for each string in input */ -std::unique_ptr minhash( +[[deprecated]] std::unique_ptr minhash( cudf::strings_column_view const& input, cudf::numeric_scalar seed = 0, cudf::size_type width = 4, @@ -71,6 +73,8 @@ std::unique_ptr minhash( * * Any null row entries result in corresponding null output rows. * + * @deprecated Deprecated in 24.12 - to be replaced in a future release + * * @throw std::invalid_argument if the width < 2 * @throw std::invalid_argument if seeds is empty * @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit @@ -83,13 +87,60 @@ std::unique_ptr minhash( * @param mr Device memory resource used to allocate the returned column's device memory * @return List column of minhash values for each string per seed */ -std::unique_ptr minhash( +[[deprecated]] std::unique_ptr minhash( cudf::strings_column_view const& input, cudf::device_span seeds, cudf::size_type width = 4, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); +/** + * @brief Returns the minhash values for each string + * + * This function uses MurmurHash3_x86_32 for the hash algorithm. 
+ * + * The input strings are first hashed using the given `seed` over substrings + * of `width` characters. These hash values are then combined with the `a` + * and `b` parameter values using the following formula: + * ``` + * max_hash = max of uint32 + * mp = (1 << 61) - 1 + * hv[i] = hash value of a substring at i + * pv[i] = ((hv[i] * a[i] + b[i]) % mp) & max_hash + * ``` + * + * This calculation is performed on each substring and the minimum value is computed + * as follows: + * ``` + * mh[j,i] = min(pv[i]) for all substrings in row j + * and where i=[0,a.size()) + * ``` + * + * Any null row entries result in corresponding null output rows. + * + * @throw std::invalid_argument if the width < 2 + * @throw std::invalid_argument if parameter_a is empty + * @throw std::invalid_argument if `parameter_b.size() != parameter_a.size()` + * @throw std::overflow_error if `parameter_a.size() * input.size()` exceeds the column size limit + * + * @param input Strings column to compute minhash + * @param seed Seed value used for the hash algorithm + * @param parameter_a Values used for the permuted calculation + * @param parameter_b Values used for the permuted calculation + * @param width The character width of substrings to hash for each row + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return List column of minhash values for each string per seed + */ +std::unique_ptr minhash_permuted( + cudf::strings_column_view const& input, + uint32_t seed, + cudf::device_span parameter_a, + cudf::device_span parameter_b, + cudf::size_type width, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); + /** * @brief Returns the minhash value for each string * @@ -102,6 +153,8 @@ std::unique_ptr minhash( * The hash function returns 2 uint64 values but only the first value * is used with the minhash calculation. * + * @deprecated Deprecated in 24.12 + * * @throw std::invalid_argument if the width < 2 * * @param input Strings column to compute minhash @@ -112,7 +165,7 @@ std::unique_ptr minhash( * @param mr Device memory resource used to allocate the returned column's device memory * @return Minhash values as UINT64 for each string in input */ -std::unique_ptr minhash64( +[[deprecated]] std::unique_ptr minhash64( cudf::strings_column_view const& input, cudf::numeric_scalar seed = 0, cudf::size_type width = 4, @@ -132,6 +185,8 @@ std::unique_ptr minhash64( * * Any null row entries result in corresponding null output rows. 
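A host-side sketch of the permuted-minhash arithmetic documented above, for a single row: `hv` holds the substring hash values and `a`/`b` the permutation parameters. This shows only the math from the doc comment, not the device implementation:

```cpp
#include <algorithm>
#include <cstdint>
#include <vector>

std::vector<uint32_t> permuted_minhash(std::vector<uint32_t> const& hv,
                                       std::vector<uint32_t> const& a,
                                       std::vector<uint32_t> const& b)
{
  constexpr uint64_t mp       = (uint64_t{1} << 61) - 1;  // Mersenne prime modulus
  constexpr uint64_t max_hash = 0xFFFFFFFFu;              // max of uint32
  std::vector<uint32_t> mh(a.size(), UINT32_MAX);
  for (std::size_t i = 0; i < a.size(); ++i) {
    for (auto const h : hv) {  // min over all substring hashes in the row
      auto const pv = ((static_cast<uint64_t>(h) * a[i] + b[i]) % mp) & max_hash;
      mh[i] = std::min(mh[i], static_cast<uint32_t>(pv));
    }
  }
  return mh;
}
```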
* + * @deprecated Deprecated in 24.12 - to be replaced in a future release + * * @throw std::invalid_argument if the width < 2 * @throw std::invalid_argument if seeds is empty * @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit @@ -144,13 +199,60 @@ std::unique_ptr minhash64( * @param mr Device memory resource used to allocate the returned column's device memory * @return List column of minhash values for each string per seed */ -std::unique_ptr minhash64( +[[deprecated]] std::unique_ptr minhash64( cudf::strings_column_view const& input, cudf::device_span seeds, cudf::size_type width = 4, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); +/** + * @brief Returns the minhash values for each string + * + * This function uses MurmurHash3_x64_128 for the hash algorithm. + * + * The input strings are first hashed using the given `seed` over substrings + * of `width` characters. These hash values are then combined with the `a` + * and `b` parameter values using the following formula: + * ``` + * max_hash = max of uint64 + * mp = (1 << 61) - 1 + * hv[i] = hash value of a substring at i + * pv[i] = ((hv[i] * a[i] + b[i]) % mp) & max_hash + * ``` + * + * This calculation is performed on each substring and the minimum value is computed + * as follows: + * ``` + * mh[j,i] = min(pv[i]) for all substrings in row j + * and where i=[0,a.size()) + * ``` + * + * Any null row entries result in corresponding null output rows. + * + * @throw std::invalid_argument if the width < 2 + * @throw std::invalid_argument if parameter_a is empty + * @throw std::invalid_argument if `parameter_b.size() != parameter_a.size()` + * @throw std::overflow_error if `parameter_a.size() * input.size()` exceeds the column size limit + * + * @param input Strings column to compute minhash + * @param seed Seed value used for the hash algorithm + * @param parameter_a Values used for the permuted calculation + * @param parameter_b Values used for the permuted calculation + * @param width The character width of substrings to hash for each row + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return List column of minhash values for each string per seed + */ +std::unique_ptr minhash64_permuted( + cudf::strings_column_view const& input, + uint64_t seed, + cudf::device_span parameter_a, + cudf::device_span parameter_b, + cudf::size_type width, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); + /** * @brief Returns the minhash values for each row of strings per seed * @@ -164,6 +266,8 @@ std::unique_ptr minhash64( * * Any null row entries result in corresponding null output rows. 
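Since the seed-span overloads are deprecated in 24.12, migrating means supplying `a`/`b` parameter spans instead of extra seeds. A hypothetical before/after for the 64-bit variant; the spans are assumed to be caller-provided:

```cpp
#include <nvtext/minhash.hpp>

std::unique_ptr<cudf::column> migrated(cudf::strings_column_view const& input,
                                       cudf::device_span<uint64_t const> a,
                                       cudf::device_span<uint64_t const> b)
{
  // was: nvtext::minhash64(input, seeds, 4);  // now [[deprecated]]
  return nvtext::minhash64_permuted(input, /*seed=*/0, a, b, /*width=*/4);
}
```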
* + * @deprecated Deprecated in 24.12 + * * @throw std::invalid_argument if seeds is empty * @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit * @@ -173,7 +277,7 @@ std::unique_ptr minhash64( * @param mr Device memory resource used to allocate the returned column's device memory * @return List column of minhash values for each string per seed */ -std::unique_ptr word_minhash( +[[deprecated]] std::unique_ptr word_minhash( cudf::lists_column_view const& input, cudf::device_span seeds, rmm::cuda_stream_view stream = cudf::get_default_stream(), @@ -193,6 +297,8 @@ std::unique_ptr word_minhash( * * Any null row entries result in corresponding null output rows. * + * @deprecated Deprecated in 24.12 + * * @throw std::invalid_argument if seeds is empty * @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit * @@ -202,7 +308,7 @@ std::unique_ptr word_minhash( * @param mr Device memory resource used to allocate the returned column's device memory * @return List column of minhash values for each string per seed */ -std::unique_ptr word_minhash64( +[[deprecated]] std::unique_ptr word_minhash64( cudf::lists_column_view const& input, cudf::device_span seeds, rmm::cuda_stream_view stream = cudf::get_default_stream(), diff --git a/cpp/include/nvtext/replace.hpp b/cpp/include/nvtext/replace.hpp index bbd0503379b..822edcbdb43 100644 --- a/cpp/include/nvtext/replace.hpp +++ b/cpp/include/nvtext/replace.hpp @@ -82,7 +82,7 @@ namespace CUDF_EXPORT nvtext { * The default of empty string will identify tokens using whitespace. * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory - * @return New strings columns of with replaced strings + * @return New strings column with replaced strings */ std::unique_ptr replace_tokens( cudf::strings_column_view const& input, @@ -131,7 +131,7 @@ std::unique_ptr replace_tokens( * The default of empty string will identify tokens using whitespace. 
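For the `replace_tokens` signature whose return documentation is corrected above, a hedged call sketch; the three strings columns are assumed to exist, and the empty-string delimiter means tokens are split on whitespace:

```cpp
#include <cudf/scalar/scalar.hpp>
#include <nvtext/replace.hpp>

auto replace_demo(cudf::strings_column_view const& input,
                  cudf::strings_column_view const& targets,
                  cudf::strings_column_view const& replacements)
{
  return nvtext::replace_tokens(input, targets, replacements, cudf::string_scalar{""});
}
```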
* @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory - * @return New strings columns of with replaced strings + * @return New strings column of filtered strings */ std::unique_ptr filter_tokens( cudf::strings_column_view const& input, diff --git a/cpp/include/nvtext/stemmer.hpp b/cpp/include/nvtext/stemmer.hpp index 55a4124bfd0..e5b2a4cc21b 100644 --- a/cpp/include/nvtext/stemmer.hpp +++ b/cpp/include/nvtext/stemmer.hpp @@ -51,7 +51,7 @@ enum class letter_type { * * @code{.pseudo} * Example: - * st = ["trouble", "toy", "sygyzy"] + * st = ["trouble", "toy", "syzygy"] * b1 = is_letter(st, VOWEL, 1) * b1 is now [false, true, true] * @endcode @@ -62,7 +62,7 @@ enum class letter_type { * * @code{.pseudo} * Example: - * st = ["trouble", "toy", "sygyzy"] + * st = ["trouble", "toy", "syzygy"] * b2 = is_letter(st, CONSONANT, -1) // last letter checked in each string * b2 is now [false, true, false] * @endcode @@ -99,7 +99,7 @@ std::unique_ptr is_letter( * * @code{.pseudo} * Example: - * st = ["trouble", "toy", "sygyzy"] + * st = ["trouble", "toy", "syzygy"] * ix = [3, 1, 4] * b1 = is_letter(st, VOWEL, ix) * b1 is now [true, true, false] @@ -111,7 +111,7 @@ std::unique_ptr is_letter( * * @code{.pseudo} * Example: - * st = ["trouble", "toy", "sygyzy"] + * st = ["trouble", "toy", "syzygy"] * ix = [3, -2, 4] // 2nd to last character in st[1] is checked * b2 = is_letter(st, CONSONANT, ix) * b2 is now [false, false, true] diff --git a/cpp/include/nvtext/subword_tokenize.hpp b/cpp/include/nvtext/subword_tokenize.hpp index c4210699975..4d06aa5d4bc 100644 --- a/cpp/include/nvtext/subword_tokenize.hpp +++ b/cpp/include/nvtext/subword_tokenize.hpp @@ -62,11 +62,13 @@ struct hashed_vocabulary { * @param filename_hashed_vocabulary A path to the preprocessed vocab.txt file. * Note that this is the file AFTER python/perfect_hash.py has been used * for preprocessing. + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Memory resource to allocate any returned objects. * @return vocabulary hash-table elements */ std::unique_ptr load_vocabulary_file( std::string const& filename_hashed_vocabulary, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -147,6 +149,7 @@ struct tokenizer_result { * @param do_truncate If true, the tokenizer will discard all the token-ids after * `max_sequence_length` for each input string. If false, it will use a new row * in the output token-ids to continue generating the output. + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Memory resource to allocate any returned objects. 
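With the stream parameters added above, vocabulary loading and tokenization can both run on a caller-provided stream. A sketch; the vocabulary file name and the sizing constants are placeholders:

```cpp
#include <nvtext/subword_tokenize.hpp>

auto tokenize_on(cudf::strings_column_view const& input, rmm::cuda_stream_view stream)
{
  auto const vocab = nvtext::load_vocabulary_file("hashed_vocab.txt", stream);
  return nvtext::subword_tokenize(input,
                                  *vocab,
                                  /*max_sequence_length=*/64,
                                  /*stride=*/48,
                                  /*do_lower_case=*/true,
                                  /*do_truncate=*/false,
                                  stream);
}
```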
* @return token-ids, attention-mask, and metadata */ @@ -157,6 +160,7 @@ tokenizer_result subword_tokenize( uint32_t stride, bool do_lower_case, bool do_truncate, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @} */ // end of group diff --git a/cpp/include/nvtext/tokenize.hpp b/cpp/include/nvtext/tokenize.hpp index e61601c6fea..e345587f88b 100644 --- a/cpp/include/nvtext/tokenize.hpp +++ b/cpp/include/nvtext/tokenize.hpp @@ -292,7 +292,7 @@ std::unique_ptr load_vocabulary( * @throw cudf::logic_error if `delimiter` is invalid * * @param input Strings column to tokenize - * @param vocabulary Used to lookup tokens within + * @param vocabulary Used to lookup tokens within `input` * @param delimiter Used to identify tokens within `input` * @param default_id The token id to be used for tokens not found in the `vocabulary`; * Default is -1 diff --git a/cpp/scripts/parse_iwyu_output.py b/cpp/scripts/parse_iwyu_output.py new file mode 100644 index 00000000000..822a980a1a8 --- /dev/null +++ b/cpp/scripts/parse_iwyu_output.py @@ -0,0 +1,170 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +"""Helper script to modify IWYU output to only include removals. + +Lines that are not from include-what-you-use are removed from the output. 
+""" + +import argparse +import re +from enum import Enum + + +class Mode(Enum): + NORMAL = 0 + ADD = 1 + REMOVE = 2 + FULL_INCLUDE_LIST = 3 + + +def extract_include_file(include_line): + """Extract the core file path from an #include directive.""" + match = re.search(r'#include\s+[<"]([^">]+)[">]', include_line) + if match: + return match.group(1) + return None + + +def parse_output(input_stream): + include_modifications = {} + current_file = None + mode = Mode.NORMAL + + for line in input_stream: + if match := re.match(r"(\/\S+) should add these lines:", line): + current_file = match.group(1) + include_modifications.setdefault( + current_file, + { + "add_includes": [], + "remove_includes": [], + "full_include_list": [], + }, + ) + mode = Mode.ADD + elif match := re.match(r"(\/\S+) should remove these lines:", line): + mode = Mode.REMOVE + elif match := re.match(r"The full include-list for (\/\S+):", line): + mode = Mode.FULL_INCLUDE_LIST + elif line.strip() == "---": + current_file = None + mode = Mode.NORMAL + else: + if current_file: + if mode == Mode.ADD: + include_modifications[current_file]["add_includes"].append( + line.strip() + ) + elif mode == Mode.REMOVE: + include_modifications[current_file][ + "remove_includes" + ].append(line.strip()) + elif mode == Mode.FULL_INCLUDE_LIST: + include_modifications[current_file][ + "full_include_list" + ].append(line.strip()) + else: + if ( + line.strip() + and "include-what-you-use reported diagnostics" not in line + and "In file included from" not in line + and "has correct #includes/fwd-decls" not in line + ): + print(line, end="") + + return include_modifications + + +def post_process_includes(include_modifications): + """Deduplicate and remove redundant entries from add and remove includes.""" + for mods in include_modifications.values(): + # Deduplicate add_includes and remove_includes + mods["add_includes"] = list(set(mods["add_includes"])) + mods["remove_includes"] = list(set(mods["remove_includes"])) + + # Extract file paths from add_includes and remove_includes + add_files = { + extract_include_file(line) for line in mods["add_includes"] + } + remove_files = { + extract_include_file(line) for line in mods["remove_includes"] + } + + # Remove entries that exist in both add_includes and remove_includes + common_files = add_files & remove_files + mods["add_includes"] = [ + line + for line in mods["add_includes"] + if extract_include_file(line) not in common_files + ] + mods["remove_includes"] = [ + line + for line in mods["remove_includes"] + if extract_include_file(line) not in common_files + ] + + # Remove entries that exist in add_includes from full_include_list + mods["full_include_list"] = [ + include + for include in mods["full_include_list"] + if extract_include_file(include) not in add_files + ] + + +def write_output(include_modifications, output_stream): + for filename, mods in include_modifications.items(): + if mods["remove_includes"]: + # IWYU requires all sections to exist, so we write out this header even + # though we never write out any actual additions. 
+ output_stream.write(f"{filename} should add these lines:\n\n") + + output_stream.write(f"{filename} should remove these lines:\n") + for line in mods["remove_includes"]: + output_stream.write(line + "\n") + output_stream.write("\n") + + output_stream.write(f"The full include-list for {filename}:\n") + for line in mods["full_include_list"]: + output_stream.write(line + "\n") + output_stream.write("---\n") + + +def main(): + parser = argparse.ArgumentParser( + description="Process include modifications from a build output log." + ) + parser.add_argument( + "input", + nargs="?", + type=argparse.FileType("r"), + default="-", + help="Input file to read (default: stdin)", + ) + parser.add_argument( + "--output", + type=argparse.FileType("w"), + default="iwyu_results.txt", + help="Output file to write (default: iwyu_output.txt)", + ) + args = parser.parse_args() + + include_modifications = parse_output(args.input) + post_process_includes(include_modifications) + write_output(include_modifications, args.output) + + +if __name__ == "__main__": + main() diff --git a/cpp/scripts/run-clang-tidy.py b/cpp/scripts/run-clang-tidy.py deleted file mode 100644 index e5e57dbf562..00000000000 --- a/cpp/scripts/run-clang-tidy.py +++ /dev/null @@ -1,253 +0,0 @@ -# Copyright (c) 2021-2023, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import re -import os -import subprocess -import argparse -import json -import multiprocessing as mp -import shutil - - -EXPECTED_VERSION = "16.0.6" -VERSION_REGEX = re.compile(r" LLVM version ([0-9.]+)") -GPU_ARCH_REGEX = re.compile(r"sm_(\d+)") -SPACES = re.compile(r"\s+") -SEPARATOR = "-" * 16 - - -def parse_args(): - argparser = argparse.ArgumentParser("Runs clang-tidy on a project") - argparser.add_argument("-cdb", type=str, - # TODO This is a hack, needs to be fixed - default="cpp/build/cuda-11.5.0/clang-tidy/release/compile_commands.clangd.json", - help="Path to cmake-generated compilation database" - " file. It is always found inside the root of the " - "cmake build folder. 
So make sure that `cmake` has " - "been run once before running this script!") - argparser.add_argument("-exe", type=str, default="clang-tidy", - help="Path to clang-tidy exe") - argparser.add_argument("-ignore", type=str, default="[.]cu$|examples/kmeans/", - help="Regex used to ignore files from checking") - argparser.add_argument("-select", type=str, default=None, - help="Regex used to select files for checking") - argparser.add_argument("-j", type=int, default=-1, - help="Number of parallel jobs to launch.") - args = argparser.parse_args() - if args.j <= 0: - args.j = mp.cpu_count() - args.ignore_compiled = re.compile(args.ignore) if args.ignore else None - args.select_compiled = re.compile(args.select) if args.select else None - ret = subprocess.check_output("%s --version" % args.exe, shell=True) - ret = ret.decode("utf-8") - version = VERSION_REGEX.search(ret) - if version is None: - raise Exception("Failed to figure out clang-tidy version!") - version = version.group(1) - if version != EXPECTED_VERSION: - raise Exception("clang-tidy exe must be v%s found '%s'" % \ - (EXPECTED_VERSION, version)) - if not os.path.exists(args.cdb): - raise Exception("Compilation database '%s' missing" % args.cdb) - return args - - -def get_all_commands(cdb): - with open(cdb) as fp: - return json.load(fp) - - -def get_gpu_archs(command): - archs = [] - for loc in range(len(command)): - if command[loc] != "-gencode": - continue - arch_flag = command[loc + 1] - match = GPU_ARCH_REGEX.search(arch_flag) - if match is not None: - archs.append("--cuda-gpu-arch=sm_%s" % match.group(1)) - return archs - - -def get_index(arr, item): - try: - return arr.index(item) - except: - return -1 - - -def remove_item(arr, item): - loc = get_index(arr, item) - if loc >= 0: - del arr[loc] - return loc - - -def remove_item_plus_one(arr, item): - loc = get_index(arr, item) - if loc >= 0: - del arr[loc + 1] - del arr[loc] - return loc - - -def get_clang_includes(exe): - dir = os.getenv("CONDA_PREFIX") - if dir is None: - ret = subprocess.check_output("which %s 2>&1" % exe, shell=True) - ret = ret.decode("utf-8") - dir = os.path.dirname(os.path.dirname(ret)) - header = os.path.join(dir, "include", "ClangHeaders") - return ["-I", header] - - -def get_tidy_args(cmd, exe): - command, file = cmd["command"], cmd["file"] - is_cuda = file.endswith(".cu") - command = re.split(SPACES, command) - # compiler is always clang++! - command[0] = "clang++" - # remove compilation and output targets from the original command - remove_item_plus_one(command, "-c") - remove_item_plus_one(command, "-o") - if is_cuda: - # replace nvcc's "-gencode ..." with clang's "--cuda-gpu-arch ..." 
- archs = get_gpu_archs(command) - command.extend(archs) - while True: - loc = remove_item_plus_one(command, "-gencode") - if loc < 0: - break - # "-x cuda" is the right usage in clang - loc = get_index(command, "-x") - if loc >= 0: - command[loc + 1] = "cuda" - remove_item_plus_one(command, "-ccbin") - remove_item(command, "--expt-extended-lambda") - remove_item(command, "--diag_suppress=unrecognized_gcc_pragma") - command.extend(get_clang_includes(exe)) - return command, is_cuda - - -def run_clang_tidy_command(tidy_cmd): - cmd = " ".join(tidy_cmd) - result = subprocess.run(cmd, check=False, shell=True, - stdout=subprocess.PIPE, stderr=subprocess.STDOUT) - status = result.returncode == 0 - if status: - out = "" - else: - out = "CMD: " + cmd - out += result.stdout.decode("utf-8").rstrip() - return status, out - - -def run_clang_tidy(cmd, args): - command, is_cuda = get_tidy_args(cmd, args.exe) - tidy_cmd = [args.exe, - "-header-filter='.*cudf/cpp/(src|include|bench|comms).*'", - cmd["file"], "--", ] - tidy_cmd.extend(command) - status = True - out = "" - if is_cuda: - tidy_cmd.append("--cuda-device-only") - tidy_cmd.append(cmd["file"]) - ret, out1 = run_clang_tidy_command(tidy_cmd) - out += out1 - out += "%s" % SEPARATOR - if not ret: - status = ret - tidy_cmd[-2] = "--cuda-host-only" - ret, out1 = run_clang_tidy_command(tidy_cmd) - if not ret: - status = ret - out += out1 - else: - tidy_cmd.append(cmd["file"]) - ret, out1 = run_clang_tidy_command(tidy_cmd) - if not ret: - status = ret - out += out1 - return status, out, cmd["file"] - - -# yikes! global var :( -results = [] -def collect_result(result): - global results - results.append(result) - - -def print_result(passed, stdout, file): - status_str = "PASSED" if passed else "FAILED" - print(f"{SEPARATOR} File:{file} {status_str} {SEPARATOR}") - if stdout: - print(stdout) - print(f"{SEPARATOR} File:{file} ENDS {SEPARATOR}") - - -def print_results(): - global results - status = True - for passed, stdout, file in results: - print_result(passed, stdout, file) - if not passed: - status = False - return status - - -def run_tidy_for_all_files(args, all_files): - pool = None if args.j == 1 else mp.Pool(args.j) - # actual tidy checker - for cmd in all_files: - # skip files that we don't want to look at - if args.ignore_compiled is not None and \ - re.search(args.ignore_compiled, cmd["file"]) is not None: - continue - if args.select_compiled is not None and \ - re.search(args.select_compiled, cmd["file"]) is None: - continue - if pool is not None: - pool.apply_async(run_clang_tidy, args=(cmd, args), - callback=collect_result) - else: - passed, stdout, file = run_clang_tidy(cmd, args) - collect_result((passed, stdout, file)) - if pool is not None: - pool.close() - pool.join() - return print_results() - - -def main(): - args = parse_args() - # Attempt to making sure that we run this script from root of repo always - if not os.path.exists(".git"): - raise Exception("This needs to always be run from the root of repo") - # Check whether clang-tidy exists - # print(args) - if "exe" not in args and shutil.which("clang-tidy") is not None: - print("clang-tidy not found. Exiting...") - return - all_files = get_all_commands(args.cdb) - status = run_tidy_for_all_files(args, all_files) - if not status: - raise Exception("clang-tidy failed! 
Refer to the errors above.") - - -if __name__ == "__main__": - main() diff --git a/cpp/src/aggregation/aggregation.cu b/cpp/src/aggregation/aggregation.cu index 02998b84ffd..d915c85bf85 100644 --- a/cpp/src/aggregation/aggregation.cu +++ b/cpp/src/aggregation/aggregation.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,9 +15,13 @@ */ #include +#include +#include #include +#include + namespace cudf { namespace detail { void initialize_with_identity(mutable_table_view& table, diff --git a/cpp/src/ast/expression_parser.cpp b/cpp/src/ast/expression_parser.cpp index 3b650d791aa..d0e4c59ca54 100644 --- a/cpp/src/ast/expression_parser.cpp +++ b/cpp/src/ast/expression_parser.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,9 +16,6 @@ #include #include #include -#include -#include -#include #include #include #include @@ -210,7 +207,7 @@ cudf::data_type expression_parser::output_type() const } std::vector expression_parser::visit_operands( - std::vector> operands) + cudf::host_span const> operands) { auto operand_data_reference_indices = std::vector(); for (auto const& operand : operands) { diff --git a/cpp/src/ast/expressions.cpp b/cpp/src/ast/expressions.cpp index b45b9d0c78c..b7e4e4609cb 100644 --- a/cpp/src/ast/expressions.cpp +++ b/cpp/src/ast/expressions.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -17,42 +17,44 @@ #include #include #include -#include -#include -#include #include #include +#include + namespace cudf { namespace ast { -operation::operation(ast_operator op, expression const& input) : op(op), operands({input}) +operation::operation(ast_operator op, expression const& input) : op{op}, operands{input} { - if (cudf::ast::detail::ast_operator_arity(op) != 1) { - CUDF_FAIL("The provided operator is not a unary operator."); - } + CUDF_EXPECTS(cudf::ast::detail::ast_operator_arity(op) == 1, + "The provided operator is not a unary operator.", + std::invalid_argument); } operation::operation(ast_operator op, expression const& left, expression const& right) - : op(op), operands({left, right}) + : op{op}, operands{left, right} { - if (cudf::ast::detail::ast_operator_arity(op) != 2) { - CUDF_FAIL("The provided operator is not a binary operator."); - } + CUDF_EXPECTS(cudf::ast::detail::ast_operator_arity(op) == 2, + "The provided operator is not a binary operator.", + std::invalid_argument); } cudf::size_type literal::accept(detail::expression_parser& visitor) const { return visitor.visit(*this); } + cudf::size_type column_reference::accept(detail::expression_parser& visitor) const { return visitor.visit(*this); } + cudf::size_type operation::accept(detail::expression_parser& visitor) const { return visitor.visit(*this); } + cudf::size_type column_name_reference::accept(detail::expression_parser& visitor) const { return visitor.visit(*this); @@ -63,16 +65,19 @@ auto literal::accept(detail::expression_transformer& visitor) const { return visitor.visit(*this); } + auto column_reference::accept(detail::expression_transformer& visitor) const -> decltype(visitor.visit(*this)) { return visitor.visit(*this); } + auto operation::accept(detail::expression_transformer& visitor) const -> decltype(visitor.visit(*this)) { return visitor.visit(*this); } + auto column_name_reference::accept(detail::expression_transformer& visitor) const -> decltype(visitor.visit(*this)) { diff --git a/cpp/src/binaryop/binaryop.cpp b/cpp/src/binaryop/binaryop.cpp index a6c878efbbc..1b23ea12a5e 100644 --- a/cpp/src/binaryop/binaryop.cpp +++ b/cpp/src/binaryop/binaryop.cpp @@ -27,15 +27,10 @@ #include #include #include -#include #include -#include #include #include -#include -#include #include -#include #include #include diff --git a/cpp/src/binaryop/compiled/binary_ops.cuh b/cpp/src/binaryop/compiled/binary_ops.cuh index c6af0c3c58a..06987139188 100644 --- a/cpp/src/binaryop/compiled/binary_ops.cuh +++ b/cpp/src/binaryop/compiled/binary_ops.cuh @@ -21,7 +21,7 @@ #include #include -#include +#include #include #include @@ -253,16 +253,11 @@ struct binary_op_double_device_dispatcher { template CUDF_KERNEL void for_each_kernel(cudf::size_type size, Functor f) { - int tid = threadIdx.x; - int blkid = blockIdx.x; - int blksz = blockDim.x; - int gridsz = gridDim.x; - - int start = tid + blkid * blksz; - int step = blksz * gridsz; + auto start = cudf::detail::grid_1d::global_thread_id(); + auto const stride = cudf::detail::grid_1d::grid_stride(); #pragma unroll - for (cudf::size_type i = start; i < size; i += step) { + for (auto i = start; i < size; i += stride) { f(i); } } @@ -282,9 +277,9 @@ void for_each(rmm::cuda_stream_view stream, cudf::size_type size, Functor f) int min_grid_size; CUDF_CUDA_TRY( cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size, for_each_kernel)); - // 2 elements per thread. 
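The `for_each_kernel` hunk above replaces hand-rolled thread indexing with the `grid_1d` helpers. For reference, the underlying grid-stride idiom in plain CUDA, which lets any grid size cover any element count; the SAXPY body here is just a stand-in workload:

```cpp
__global__ void saxpy(long long n, float a, float const* x, float* y)
{
  auto const start  = static_cast<long long>(blockIdx.x) * blockDim.x + threadIdx.x;
  auto const stride = static_cast<long long>(blockDim.x) * gridDim.x;
  // Each thread starts at its global id and strides by the whole grid.
  for (auto i = start; i < n; i += stride) {
    y[i] = a * x[i] + y[i];
  }
}
```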
- int const grid_size = util::div_rounding_up_safe(size, 2 * block_size); - for_each_kernel<<>>(size, std::forward(f)); + auto grid = cudf::detail::grid_1d(size, block_size, 2 /* elements_per_thread */); + for_each_kernel<<>>( + size, std::forward(f)); } template diff --git a/cpp/src/bitmask/null_mask.cu b/cpp/src/bitmask/null_mask.cu index 4ca05f9c335..e6659f76c7c 100644 --- a/cpp/src/bitmask/null_mask.cu +++ b/cpp/src/bitmask/null_mask.cu @@ -15,6 +15,7 @@ */ #include +#include #include #include #include @@ -32,7 +33,6 @@ #include #include -#include #include #include @@ -329,7 +329,7 @@ cudf::size_type count_set_bits(bitmask_type const* bitmask, cudf::detail::grid_1d grid(num_words, block_size); - rmm::device_scalar non_zero_count(0, stream); + cudf::detail::device_scalar non_zero_count(0, stream); count_set_bits_kernel <<>>( diff --git a/cpp/src/column/column_factories.cpp b/cpp/src/column/column_factories.cpp index 482413d0ccb..972f97e8668 100644 --- a/cpp/src/column/column_factories.cpp +++ b/cpp/src/column/column_factories.cpp @@ -15,19 +15,13 @@ */ #include -#include #include #include #include -#include #include -#include #include -#include #include -#include - namespace cudf { namespace { struct size_of_helper { diff --git a/cpp/src/column/column_view.cpp b/cpp/src/column/column_view.cpp index 386c5ebe478..e831aa9645d 100644 --- a/cpp/src/column/column_view.cpp +++ b/cpp/src/column/column_view.cpp @@ -15,7 +15,6 @@ */ #include -#include #include #include #include @@ -27,9 +26,7 @@ #include #include -#include #include -#include #include namespace cudf { diff --git a/cpp/src/copying/concatenate.cu b/cpp/src/copying/concatenate.cu index b8e140f1fa5..d8419760120 100644 --- a/cpp/src/copying/concatenate.cu +++ b/cpp/src/copying/concatenate.cu @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -162,7 +163,7 @@ size_type concatenate_masks(device_span d_views, size_type output_size, rmm::cuda_stream_view stream) { - rmm::device_scalar d_valid_count(0, stream); + cudf::detail::device_scalar d_valid_count(0, stream); constexpr size_type block_size{256}; cudf::detail::grid_1d config(output_size, block_size); concatenate_masks_kernel @@ -265,7 +266,7 @@ std::unique_ptr fused_concatenate(host_span views, auto out_view = out_col->mutable_view(); auto d_out_view = mutable_column_device_view::create(out_view, stream); - rmm::device_scalar d_valid_count(0, stream); + cudf::detail::device_scalar d_valid_count(0, stream); // Launch kernel constexpr size_type block_size{256}; diff --git a/cpp/src/copying/copy.cpp b/cpp/src/copying/copy.cpp index d60fb5ce110..5e2065ba844 100644 --- a/cpp/src/copying/copy.cpp +++ b/cpp/src/copying/copy.cpp @@ -20,16 +20,11 @@ #include #include #include -#include #include -#include -#include #include #include -#include - #include namespace cudf { diff --git a/cpp/src/copying/get_element.cu b/cpp/src/copying/get_element.cu index 29a28f81d1a..80b0bd5242f 100644 --- a/cpp/src/copying/get_element.cu +++ b/cpp/src/copying/get_element.cu @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -71,7 +72,7 @@ struct get_element_functor { auto device_col = column_device_view::create(input, stream); rmm::device_scalar temp_data(stream, mr); - rmm::device_scalar temp_valid(stream, mr); + cudf::detail::device_scalar temp_valid(stream, mr); device_single_thread( [buffer = temp_data.data(), @@ -155,8 +156,8 @@ struct get_element_functor { auto device_col = column_device_view::create(input, stream); - rmm::device_scalar 
temp_data(stream, mr); - rmm::device_scalar temp_valid(stream, mr); + cudf::detail::device_scalar temp_data(stream, mr); + cudf::detail::device_scalar temp_valid(stream, mr); device_single_thread( [buffer = temp_data.data(), diff --git a/cpp/src/copying/pack.cpp b/cpp/src/copying/pack.cpp index 1282eec6c44..a001807c82b 100644 --- a/cpp/src/copying/pack.cpp +++ b/cpp/src/copying/pack.cpp @@ -18,7 +18,6 @@ #include #include #include -#include #include diff --git a/cpp/src/copying/split.cpp b/cpp/src/copying/split.cpp index 832a72ed5b0..116e3516460 100644 --- a/cpp/src/copying/split.cpp +++ b/cpp/src/copying/split.cpp @@ -14,10 +14,8 @@ * limitations under the License. */ -#include #include #include -#include #include #include diff --git a/cpp/src/datetime/datetime_ops.cu b/cpp/src/datetime/datetime_ops.cu index ddb0dbcd96d..a497cedb3bc 100644 --- a/cpp/src/datetime/datetime_ops.cu +++ b/cpp/src/datetime/datetime_ops.cu @@ -44,19 +44,6 @@ namespace cudf { namespace datetime { namespace detail { -enum class datetime_component { - INVALID = 0, - YEAR, - MONTH, - DAY, - WEEKDAY, - HOUR, - MINUTE, - SECOND, - MILLISECOND, - MICROSECOND, - NANOSECOND -}; enum class rounding_function { CEIL, ///< Rounds up to the next integer multiple of the provided frequency @@ -453,90 +440,70 @@ std::unique_ptr extract_year(column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - return detail::apply_datetime_op< - detail::extract_component_operator, - cudf::type_id::INT16>(column, stream, mr); + return detail::extract_datetime_component(column, datetime_component::YEAR, stream, mr); } std::unique_ptr extract_month(column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - return detail::apply_datetime_op< - detail::extract_component_operator, - cudf::type_id::INT16>(column, stream, mr); + return detail::extract_datetime_component(column, datetime_component::MONTH, stream, mr); } std::unique_ptr extract_day(column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - return detail::apply_datetime_op< - detail::extract_component_operator, - cudf::type_id::INT16>(column, stream, mr); + return detail::extract_datetime_component(column, datetime_component::DAY, stream, mr); } std::unique_ptr extract_weekday(column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - return detail::apply_datetime_op< - detail::extract_component_operator, - cudf::type_id::INT16>(column, stream, mr); + return detail::extract_datetime_component(column, datetime_component::WEEKDAY, stream, mr); } std::unique_ptr extract_hour(column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - return detail::apply_datetime_op< - detail::extract_component_operator, - cudf::type_id::INT16>(column, stream, mr); + return detail::extract_datetime_component(column, datetime_component::HOUR, stream, mr); } std::unique_ptr extract_minute(column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - return detail::apply_datetime_op< - detail::extract_component_operator, - cudf::type_id::INT16>(column, stream, mr); + return detail::extract_datetime_component(column, datetime_component::MINUTE, stream, mr); } std::unique_ptr extract_second(column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - return detail::apply_datetime_op< - detail::extract_component_operator, - cudf::type_id::INT16>(column, stream, 
mr); + return detail::extract_datetime_component(column, datetime_component::SECOND, stream, mr); } std::unique_ptr extract_millisecond_fraction(column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - return detail::apply_datetime_op< - detail::extract_component_operator, - cudf::type_id::INT16>(column, stream, mr); + return detail::extract_datetime_component(column, datetime_component::MILLISECOND, stream, mr); } std::unique_ptr extract_microsecond_fraction(column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - return detail::apply_datetime_op< - detail::extract_component_operator, - cudf::type_id::INT16>(column, stream, mr); + return detail::extract_datetime_component(column, datetime_component::MICROSECOND, stream, mr); } std::unique_ptr extract_nanosecond_fraction(column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - return detail::apply_datetime_op< - detail::extract_component_operator, - cudf::type_id::INT16>(column, stream, mr); + return detail::extract_datetime_component(column, datetime_component::NANOSECOND, stream, mr); } std::unique_ptr last_day_of_month(column_view const& column, @@ -576,6 +543,32 @@ std::unique_ptr extract_quarter(column_view const& column, return apply_datetime_op(column, stream, mr); } +std::unique_ptr extract_datetime_component(cudf::column_view const& column, + datetime_component component, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ +#define extract(field) \ + case field: \ + return apply_datetime_op, cudf::type_id::INT16>( \ + column, stream, mr) + + switch (component) { + extract(datetime_component::YEAR); + extract(datetime_component::MONTH); + extract(datetime_component::DAY); + extract(datetime_component::WEEKDAY); + extract(datetime_component::HOUR); + extract(datetime_component::MINUTE); + extract(datetime_component::SECOND); + extract(datetime_component::MILLISECOND); + extract(datetime_component::MICROSECOND); + extract(datetime_component::NANOSECOND); + default: CUDF_FAIL("Unsupported datetime component."); + } +#undef extract +} + } // namespace detail std::unique_ptr ceil_datetimes(column_view const& column, @@ -661,6 +654,15 @@ std::unique_ptr extract_second(column_view const& column, return detail::extract_second(column, stream, mr); } +std::unique_ptr extract_datetime_component(cudf::column_view const& column, + datetime_component component, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + return detail::extract_datetime_component(column, component, stream, mr); +} + std::unique_ptr extract_millisecond_fraction(column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) diff --git a/cpp/src/datetime/timezone.cpp b/cpp/src/datetime/timezone.cpp index cf239297255..f786624680c 100644 --- a/cpp/src/datetime/timezone.cpp +++ b/cpp/src/datetime/timezone.cpp @@ -13,12 +13,10 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#include #include #include #include #include -#include #include #include @@ -38,7 +36,7 @@ std::string const tzif_system_directory = "/usr/share/zoneinfo/"; struct timezone_file_header { uint32_t magic; ///< "TZif" uint8_t version; ///< 0:version1, '2':version2, '3':version3 - uint8_t reserved15[15]; ///< unused, reserved for future use + uint8_t reserved15[15]; ///< unused, reserved for future use // NOLINT uint32_t isutccnt; ///< number of UTC/local indicators contained in the body uint32_t isstdcnt; ///< number of standard/wall indicators contained in the body uint32_t leapcnt; ///< number of leap second records contained in the body @@ -138,7 +136,7 @@ struct timezone_file { std::filesystem::path{tzif_dir.value_or(tzif_system_directory)} / timezone_name; std::ifstream fin; fin.open(tz_filename, ios_base::in | ios_base::binary | ios_base::ate); - CUDF_EXPECTS(fin, "Failed to open the timezone file."); + CUDF_EXPECTS(fin, "Failed to open the timezone file '" + tz_filename.string() + "'"); auto const file_size = fin.tellg(); fin.seekg(0); diff --git a/cpp/src/dictionary/dictionary_column_view.cpp b/cpp/src/dictionary/dictionary_column_view.cpp index 4906e5b4f9c..3e4a201bba4 100644 --- a/cpp/src/dictionary/dictionary_column_view.cpp +++ b/cpp/src/dictionary/dictionary_column_view.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -36,8 +36,7 @@ column_view dictionary_column_view::indices() const noexcept { return child(0); column_view dictionary_column_view::get_indices_annotated() const noexcept { - return column_view( - indices().type(), size(), indices().head(), null_mask(), null_count(), offset()); + return {indices().type(), size(), indices().head(), null_mask(), null_count(), offset()}; } column_view dictionary_column_view::keys() const noexcept { return child(1); } diff --git a/cpp/src/groupby/groupby.cu b/cpp/src/groupby/groupby.cu index cc0682b68b9..6eb82618e2a 100644 --- a/cpp/src/groupby/groupby.cu +++ b/cpp/src/groupby/groupby.cu @@ -29,7 +29,6 @@ #include #include #include -#include #include #include #include diff --git a/cpp/src/groupby/hash/compute_aggregations.cu b/cpp/src/groupby/hash/compute_aggregations.cu new file mode 100644 index 00000000000..cac6c2224f0 --- /dev/null +++ b/cpp/src/groupby/hash/compute_aggregations.cu @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "compute_aggregations.cuh" +#include "compute_aggregations.hpp" + +namespace cudf::groupby::detail::hash { +template rmm::device_uvector compute_aggregations( + int64_t num_rows, + bool skip_rows_with_nulls, + bitmask_type const* row_bitmask, + global_set_t& global_set, + cudf::host_span requests, + cudf::detail::result_cache* sparse_results, + rmm::cuda_stream_view stream); +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_aggregations.cuh b/cpp/src/groupby/hash/compute_aggregations.cuh new file mode 100644 index 00000000000..e8b29a0e7a8 --- /dev/null +++ b/cpp/src/groupby/hash/compute_aggregations.cuh @@ -0,0 +1,185 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "compute_aggregations.hpp" +#include "compute_global_memory_aggs.hpp" +#include "compute_mapping_indices.hpp" +#include "compute_shared_memory_aggs.hpp" +#include "create_sparse_results_table.hpp" +#include "flatten_single_pass_aggs.hpp" +#include "helpers.cuh" +#include "single_pass_functors.cuh" + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include + +namespace cudf::groupby::detail::hash { +/** + * @brief Computes all aggregations from `requests` that require a single pass + * over the data and stores the results in `sparse_results` + */ +template +rmm::device_uvector compute_aggregations( + int64_t num_rows, + bool skip_rows_with_nulls, + bitmask_type const* row_bitmask, + SetType& global_set, + cudf::host_span requests, + cudf::detail::result_cache* sparse_results, + rmm::cuda_stream_view stream) +{ + // flatten the aggs to a table that can be operated on by aggregate_row + auto [flattened_values, agg_kinds, aggs] = flatten_single_pass_aggs(requests); + auto const d_agg_kinds = cudf::detail::make_device_uvector_async( + agg_kinds, stream, rmm::mr::get_current_device_resource()); + + auto const grid_size = + max_occupancy_grid_size>(num_rows); + auto const available_shmem_size = get_available_shared_memory_size(grid_size); + auto const has_sufficient_shmem = + available_shmem_size > (compute_shmem_offsets_size(flattened_values.num_columns()) * 2); + auto const has_dictionary_request = std::any_of( + requests.begin(), requests.end(), [](cudf::groupby::aggregation_request const& request) { + return cudf::is_dictionary(request.values.type()); + }); + auto const is_shared_memory_compatible = !has_dictionary_request and has_sufficient_shmem; + + // Performs naive global memory aggregations when the workload is not compatible with shared + // memory, such as when aggregating dictionary columns or when there is insufficient dynamic + // shared memory for shared memory aggregations. 
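The shared-memory gate above leans on `get_available_shared_memory_size`, defined later in this diff via CUDA's occupancy API. A hedged stand-alone sketch of that query (the kernel symbol is a placeholder for the real aggregation kernel):

```cpp
#include <cuda_runtime.h>

#include <cstddef>

__global__ void agg_kernel_stub() {}  // placeholder kernel for the occupancy query

// Dynamic shared memory available to each block if `blocks_per_sm` blocks of
// `block_size` threads are to stay resident on every SM.
std::size_t available_dynamic_shmem(int blocks_per_sm, int block_size)
{
  std::size_t shmem_bytes = 0;
  cudaOccupancyAvailableDynamicSMemPerBlock(
    &shmem_bytes, agg_kernel_stub, blocks_per_sm, block_size);
  return shmem_bytes;
}
```

The diff then keeps roughly half of that budget, rounded down to an 8-byte alignment, and requires it to exceed twice the per-column offsets footprint before taking the shared-memory path.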
+ if (!is_shared_memory_compatible) { + return compute_global_memory_aggs(num_rows, + skip_rows_with_nulls, + row_bitmask, + flattened_values, + d_agg_kinds.data(), + agg_kinds, + global_set, + aggs, + sparse_results, + stream); + } + + // 'populated_keys' contains inserted row_indices (keys) of global hash set + rmm::device_uvector populated_keys(num_rows, stream); + // 'local_mapping_index' maps from the global row index of the input table to its block-wise rank + rmm::device_uvector local_mapping_index(num_rows, stream); + // 'global_mapping_index' maps from the block-wise rank to the row index of global aggregate table + rmm::device_uvector global_mapping_index(grid_size * GROUPBY_SHM_MAX_ELEMENTS, + stream); + rmm::device_uvector block_cardinality(grid_size, stream); + + // Flag indicating whether a global memory aggregation fallback is required or not + rmm::device_scalar needs_global_memory_fallback(stream); + + auto global_set_ref = global_set.ref(cuco::op::insert_and_find); + + compute_mapping_indices(grid_size, + num_rows, + global_set_ref, + row_bitmask, + skip_rows_with_nulls, + local_mapping_index.data(), + global_mapping_index.data(), + block_cardinality.data(), + needs_global_memory_fallback.data(), + stream); + + cuda::std::atomic_flag h_needs_fallback; + // Cannot use `device_scalar::value` as it requires a copy constructor, which + // `atomic_flag` doesn't have. + CUDF_CUDA_TRY(cudaMemcpyAsync(&h_needs_fallback, + needs_global_memory_fallback.data(), + sizeof(cuda::std::atomic_flag), + cudaMemcpyDefault, + stream.value())); + stream.synchronize(); + auto const needs_fallback = h_needs_fallback.test(); + + // make table that will hold sparse results + cudf::table sparse_table = create_sparse_results_table(flattened_values, + d_agg_kinds.data(), + agg_kinds, + needs_fallback, + global_set, + populated_keys, + stream); + // prepare to launch kernel to do the actual aggregation + auto d_values = table_device_view::create(flattened_values, stream); + auto d_sparse_table = mutable_table_device_view::create(sparse_table, stream); + + compute_shared_memory_aggs(grid_size, + available_shmem_size, + num_rows, + row_bitmask, + skip_rows_with_nulls, + local_mapping_index.data(), + global_mapping_index.data(), + block_cardinality.data(), + *d_values, + *d_sparse_table, + d_agg_kinds.data(), + stream); + + // The shared memory groupby is designed so that each thread block can handle up to 128 unique + // keys. When a block reaches this cardinality limit, shared memory becomes insufficient to store + // the temporary aggregation results. In these situations, we must fall back to a global memory + // aggregator to process the remaining aggregation requests. 
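For context on the fallback taken below: every input row is re-dispatched through `thrust::for_each_n` over a counting iterator, with a functor that aggregates directly in global memory. A reduced sketch of that dispatch shape (the functor is a stand-in for `global_memory_fallback_fn`):

```cpp
#include <rmm/cuda_stream_view.hpp>
#include <rmm/exec_policy.hpp>

#include <thrust/for_each.h>
#include <thrust/iterator/counting_iterator.h>

struct per_row_fn {
  int* out;
  // Stand-in for the real per-row aggregation work
  __device__ void operator()(int row) const { out[row] = row; }
};

void process_rows(int* d_out, int num_rows, rmm::cuda_stream_view stream)
{
  thrust::for_each_n(rmm::exec_policy_nosync(stream),
                     thrust::counting_iterator<int>{0},
                     num_rows,
                     per_row_fn{d_out});
}
```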
+ if (needs_fallback) { + auto const stride = GROUPBY_BLOCK_SIZE * grid_size; + thrust::for_each_n(rmm::exec_policy_nosync(stream), + thrust::counting_iterator{0}, + num_rows, + global_memory_fallback_fn{global_set_ref, + *d_values, + *d_sparse_table, + d_agg_kinds.data(), + block_cardinality.data(), + stride, + row_bitmask, + skip_rows_with_nulls}); + extract_populated_keys(global_set, populated_keys, stream); + } + + // Add results back to sparse_results cache + auto sparse_result_cols = sparse_table.release(); + for (size_t i = 0; i < aggs.size(); i++) { + // Note that the cache will make a copy of this temporary aggregation + sparse_results->add_result( + flattened_values.column(i), *aggs[i], std::move(sparse_result_cols[i])); + } + + return populated_keys; +} +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_aggregations.hpp b/cpp/src/groupby/hash/compute_aggregations.hpp new file mode 100644 index 00000000000..829c3c808b0 --- /dev/null +++ b/cpp/src/groupby/hash/compute_aggregations.hpp @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include + +#include +#include + +namespace cudf::groupby::detail::hash { +/** + * @brief Computes all aggregations from `requests` that require a single pass + * over the data and stores the results in `sparse_results` + */ +template +rmm::device_uvector compute_aggregations( + int64_t num_rows, + bool skip_rows_with_nulls, + bitmask_type const* row_bitmask, + SetType& global_set, + cudf::host_span requests, + cudf::detail::result_cache* sparse_results, + rmm::cuda_stream_view stream); +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_aggregations_null.cu b/cpp/src/groupby/hash/compute_aggregations_null.cu new file mode 100644 index 00000000000..1d7184227ea --- /dev/null +++ b/cpp/src/groupby/hash/compute_aggregations_null.cu @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "compute_aggregations.cuh" +#include "compute_aggregations.hpp" + +namespace cudf::groupby::detail::hash { +template rmm::device_uvector compute_aggregations( + int64_t num_rows, + bool skip_rows_with_nulls, + bitmask_type const* row_bitmask, + nullable_global_set_t& global_set, + cudf::host_span requests, + cudf::detail::result_cache* sparse_results, + rmm::cuda_stream_view stream); +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_global_memory_aggs.cu b/cpp/src/groupby/hash/compute_global_memory_aggs.cu new file mode 100644 index 00000000000..6025686953e --- /dev/null +++ b/cpp/src/groupby/hash/compute_global_memory_aggs.cu @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "compute_global_memory_aggs.cuh" +#include "compute_global_memory_aggs.hpp" + +namespace cudf::groupby::detail::hash { +template rmm::device_uvector compute_global_memory_aggs( + cudf::size_type num_rows, + bool skip_rows_with_nulls, + bitmask_type const* row_bitmask, + cudf::table_view const& flattened_values, + cudf::aggregation::Kind const* d_agg_kinds, + std::vector const& agg_kinds, + global_set_t& global_set, + std::vector>& aggregations, + cudf::detail::result_cache* sparse_results, + rmm::cuda_stream_view stream); +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_global_memory_aggs.cuh b/cpp/src/groupby/hash/compute_global_memory_aggs.cuh new file mode 100644 index 00000000000..00db149c6d9 --- /dev/null +++ b/cpp/src/groupby/hash/compute_global_memory_aggs.cuh @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include "compute_global_memory_aggs.hpp" +#include "create_sparse_results_table.hpp" +#include "flatten_single_pass_aggs.hpp" +#include "helpers.cuh" +#include "single_pass_functors.cuh" + +#include +#include +#include +#include + +#include +#include +#include + +#include +#include + +#include +#include + +namespace cudf::groupby::detail::hash { +template +rmm::device_uvector compute_global_memory_aggs( + cudf::size_type num_rows, + bool skip_rows_with_nulls, + bitmask_type const* row_bitmask, + cudf::table_view const& flattened_values, + cudf::aggregation::Kind const* d_agg_kinds, + std::vector const& agg_kinds, + SetType& global_set, + std::vector>& aggregations, + cudf::detail::result_cache* sparse_results, + rmm::cuda_stream_view stream) +{ + auto constexpr uses_global_memory_aggs = true; + // 'populated_keys' contains inserted row_indices (keys) of global hash set + rmm::device_uvector populated_keys(num_rows, stream); + + // make table that will hold sparse results + cudf::table sparse_table = create_sparse_results_table(flattened_values, + d_agg_kinds, + agg_kinds, + uses_global_memory_aggs, + global_set, + populated_keys, + stream); + + // prepare to launch kernel to do the actual aggregation + auto d_values = table_device_view::create(flattened_values, stream); + auto d_sparse_table = mutable_table_device_view::create(sparse_table, stream); + auto global_set_ref = global_set.ref(cuco::op::insert_and_find); + + thrust::for_each_n( + rmm::exec_policy_nosync(stream), + thrust::counting_iterator{0}, + num_rows, + hash::compute_single_pass_aggs_fn{ + global_set_ref, *d_values, *d_sparse_table, d_agg_kinds, row_bitmask, skip_rows_with_nulls}); + extract_populated_keys(global_set, populated_keys, stream); + + // Add results back to sparse_results cache + auto sparse_result_cols = sparse_table.release(); + for (size_t i = 0; i < aggregations.size(); i++) { + // Note that the cache will make a copy of this temporary aggregation + sparse_results->add_result( + flattened_values.column(i), *aggregations[i], std::move(sparse_result_cols[i])); + } + + return populated_keys; +} +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_global_memory_aggs.hpp b/cpp/src/groupby/hash/compute_global_memory_aggs.hpp new file mode 100644 index 00000000000..0777b9ffd93 --- /dev/null +++ b/cpp/src/groupby/hash/compute_global_memory_aggs.hpp @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include +#include +#include +#include + +#include +#include + +#include +#include + +namespace cudf::groupby::detail::hash { +template +rmm::device_uvector compute_global_memory_aggs( + cudf::size_type num_rows, + bool skip_rows_with_nulls, + bitmask_type const* row_bitmask, + cudf::table_view const& flattened_values, + cudf::aggregation::Kind const* d_agg_kinds, + std::vector const& agg_kinds, + SetType& global_set, + std::vector>& aggregations, + cudf::detail::result_cache* sparse_results, + rmm::cuda_stream_view stream); +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_global_memory_aggs_null.cu b/cpp/src/groupby/hash/compute_global_memory_aggs_null.cu new file mode 100644 index 00000000000..209e2b7f20a --- /dev/null +++ b/cpp/src/groupby/hash/compute_global_memory_aggs_null.cu @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "compute_global_memory_aggs.cuh" +#include "compute_global_memory_aggs.hpp" + +namespace cudf::groupby::detail::hash { +template rmm::device_uvector compute_global_memory_aggs( + cudf::size_type num_rows, + bool skip_rows_with_nulls, + bitmask_type const* row_bitmask, + cudf::table_view const& flattened_values, + cudf::aggregation::Kind const* d_agg_kinds, + std::vector const& agg_kinds, + nullable_global_set_t& global_set, + std::vector>& aggregations, + cudf::detail::result_cache* sparse_results, + rmm::cuda_stream_view stream); +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_groupby.cu b/cpp/src/groupby/hash/compute_groupby.cu new file mode 100644 index 00000000000..e1dbf2a3d9e --- /dev/null +++ b/cpp/src/groupby/hash/compute_groupby.cu @@ -0,0 +1,119 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "compute_aggregations.hpp" +#include "compute_groupby.hpp" +#include "helpers.cuh" +#include "sparse_to_dense_results.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +#include +#include + +namespace cudf::groupby::detail::hash { +template +std::unique_ptr
compute_groupby(table_view const& keys,
+                                      host_span requests,
+                                      bool skip_rows_with_nulls,
+                                      Equal const& d_row_equal,
+                                      Hash const& d_row_hash,
+                                      cudf::detail::result_cache* cache,
+                                      rmm::cuda_stream_view stream,
+                                      rmm::device_async_resource_ref mr)
+{
+  // convert to int64_t to avoid potential overflow with large `keys`
+  auto const num_keys = static_cast(keys.num_rows());
+
+  // Cache of sparse results where the location of aggregate value in each
+  // column is indexed by the hash set
+  cudf::detail::result_cache sparse_results(requests.size());
+
+  auto set = cuco::static_set{
+    cuco::extent{num_keys},
+    cudf::detail::CUCO_DESIRED_LOAD_FACTOR,  // 50% load factor
+    cuco::empty_key{cudf::detail::CUDF_SIZE_TYPE_SENTINEL},
+    d_row_equal,
+    probing_scheme_t{d_row_hash},
+    cuco::thread_scope_device,
+    cuco::storage{},
+    cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream},
+    stream.value()};
+
+  auto row_bitmask =
+    skip_rows_with_nulls
+      ? cudf::bitmask_and(keys, stream, cudf::get_current_device_resource_ref()).first
+      : rmm::device_buffer{};
+
+  // Compute all single pass aggs first
+  auto gather_map = compute_aggregations(num_keys,
+                                         skip_rows_with_nulls,
+                                         static_cast(row_bitmask.data()),
+                                         set,
+                                         requests,
+                                         &sparse_results,
+                                         stream);
+
+  // Compact all results from sparse_results and insert into cache
+  sparse_to_dense_results(requests,
+                          &sparse_results,
+                          cache,
+                          gather_map,
+                          set.ref(cuco::find),
+                          static_cast(row_bitmask.data()),
+                          stream,
+                          mr);
+
+  return cudf::detail::gather(keys,
+                              gather_map,
+                              out_of_bounds_policy::DONT_CHECK,
+                              cudf::detail::negative_index_policy::NOT_ALLOWED,
+                              stream,
+                              mr);
+}
+
+template std::unique_ptr<table> compute_groupby(
+  table_view const& keys,
+  host_span requests,
+  bool skip_rows_with_nulls,
+  row_comparator_t const& d_row_equal,
+  row_hash_t const& d_row_hash,
+  cudf::detail::result_cache* cache,
+  rmm::cuda_stream_view stream,
+  rmm::device_async_resource_ref mr);
+
+template std::unique_ptr<table> compute_groupby(
+  table_view const& keys,
+  host_span requests,
+  bool skip_rows_with_nulls,
+  nullable_row_comparator_t const& d_row_equal,
+  row_hash_t const& d_row_hash,
+  cudf::detail::result_cache* cache,
+  rmm::cuda_stream_view stream,
+  rmm::device_async_resource_ref mr);
+}  // namespace cudf::groupby::detail::hash
diff --git a/cpp/src/groupby/hash/compute_groupby.hpp b/cpp/src/groupby/hash/compute_groupby.hpp
new file mode 100644
index 00000000000..77243dc0a4f
--- /dev/null
+++ b/cpp/src/groupby/hash/compute_groupby.hpp
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+#include
+
+namespace cudf::groupby::detail::hash {
+/**
+ * @brief Computes groupby using hash table.
+ *
+ * First, we create a hash table that stores the indices of unique rows in
+ * `keys`. The upper limit on the number of values in this map is the number
+ * of rows in `keys`.
+ *
+ * To store the results of aggregations, we create temporary sparse columns
+ * which have the same size as input value columns. Using the hash map, we
+ * determine the location within the sparse column to write the result of the
+ * aggregation into.
+ *
+ * The sparse column results of all aggregations are stored into the cache
+ * `sparse_results`. This enables the use of previously calculated results in
+ * other aggregations.
+ *
+ * All the aggregations that can be computed in a single pass are computed
+ * first, in a combined kernel. Then, using these results, the aggregations
+ * that require multiple passes are computed.
+ *
+ * Finally, using the hash map, we generate a vector of indices of populated
+ * values in sparse result columns. Then, for each aggregation originally
+ * requested in `requests`, we gather sparse results into a column of dense
+ * results using the aforementioned index vector. Dense results are stored into
+ * the in/out parameter `cache`.
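+ *
+ * Condensed to its call sequence, the implementation above is roughly (an
+ * editorial sketch; all names appear in compute_groupby.cu):
+ * @code
+ * auto set        = cuco::static_set{...};            // unique key rows
+ * auto gather_map = compute_aggregations(...);        // single-pass sparse aggs
+ * sparse_to_dense_results(...);                       // compact into `cache`
+ * return cudf::detail::gather(keys, gather_map, ...); // table of unique keys
+ * @endcode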
+ * + * @tparam Equal Device row comparator type + * @tparam Hash Device row hasher type + * + * @param keys Table whose rows act as the groupby keys + * @param requests The set of columns to aggregate and the aggregations to perform + * @param skip_rows_with_nulls Flag indicating whether to ignore nulls or not + * @param d_row_equal Device row comparator + * @param d_row_hash Device row hasher + * @param cache Dense aggregation results + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned table + * @return Table of unique keys + */ +template +std::unique_ptr compute_groupby(table_view const& keys, + host_span requests, + bool skip_rows_with_nulls, + Equal const& d_row_equal, + Hash const& d_row_hash, + cudf::detail::result_cache* cache, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_mapping_indices.cu b/cpp/src/groupby/hash/compute_mapping_indices.cu new file mode 100644 index 00000000000..519d7cd2f1c --- /dev/null +++ b/cpp/src/groupby/hash/compute_mapping_indices.cu @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "compute_mapping_indices.cuh" +#include "compute_mapping_indices.hpp" + +namespace cudf::groupby::detail::hash { +template cudf::size_type max_occupancy_grid_size>( + cudf::size_type n); + +template void compute_mapping_indices>( + cudf::size_type grid_size, + cudf::size_type num, + hash_set_ref_t global_set, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, + cudf::size_type* local_mapping_index, + cudf::size_type* global_mapping_index, + cudf::size_type* block_cardinality, + cuda::std::atomic_flag* needs_global_memory_fallback, + rmm::cuda_stream_view stream); +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_mapping_indices.cuh b/cpp/src/groupby/hash/compute_mapping_indices.cuh new file mode 100644 index 00000000000..d353830780f --- /dev/null +++ b/cpp/src/groupby/hash/compute_mapping_indices.cuh @@ -0,0 +1,192 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include "compute_mapping_indices.hpp" +#include "helpers.cuh" + +#include +#include +#include +#include + +#include + +#include +#include +#include +#include + +#include + +namespace cudf::groupby::detail::hash { +template +__device__ void find_local_mapping(cooperative_groups::thread_block const& block, + cudf::size_type idx, + cudf::size_type num_input_rows, + SetType shared_set, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, + cudf::size_type* cardinality, + cudf::size_type* local_mapping_index, + cudf::size_type* shared_set_indices) +{ + auto const is_valid_input = + idx < num_input_rows and (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, idx)); + auto const [result_idx, inserted] = [&]() { + if (is_valid_input) { + auto const result = shared_set.insert_and_find(idx); + auto const matched_idx = *result.first; + auto const inserted = result.second; + // inserted a new element + if (result.second) { + auto const shared_set_index = atomicAdd(cardinality, 1); + shared_set_indices[shared_set_index] = idx; + local_mapping_index[idx] = shared_set_index; + } + return cuda::std::pair{matched_idx, inserted}; + } + return cuda::std::pair{0, false}; // dummy values + }(); + // Syncing the thread block is needed so that updates in `local_mapping_index` are visible to all + // threads in the thread block. + block.sync(); + if (is_valid_input) { + // element was already in set + if (!inserted) { local_mapping_index[idx] = local_mapping_index[result_idx]; } + } +} + +template +__device__ void find_global_mapping(cooperative_groups::thread_block const& block, + cudf::size_type cardinality, + SetRef global_set, + cudf::size_type* shared_set_indices, + cudf::size_type* global_mapping_index) +{ + // for all unique keys in shared memory hash set, stores their matches in + // global hash set to `global_mapping_index` + for (auto idx = block.thread_rank(); idx < cardinality; idx += block.num_threads()) { + auto const input_idx = shared_set_indices[idx]; + global_mapping_index[block.group_index().x * GROUPBY_SHM_MAX_ELEMENTS + idx] = + *global_set.insert_and_find(input_idx).first; + } +} + +/* + * @brief Inserts keys into the shared memory hash set, and stores the block-wise rank for a given + * row index in `local_mapping_index`. If the number of unique keys found in a threadblock exceeds + * `GROUPBY_CARDINALITY_THRESHOLD`, the threads in that block will exit without updating + * `global_set` or setting `global_mapping_index`. Else, we insert the unique keys found to the + * global hash set, and save the row index of the global sparse table in `global_mapping_index`. 
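+ *
+ * Per-thread insert step, reduced to a sketch (see find_local_mapping above for
+ * the full logic, including the shared_set_indices bookkeeping):
+ * @code
+ * auto [slot, inserted] = shared_set.insert_and_find(idx);
+ * if (inserted) {
+ *   auto rank = atomicAdd(&cardinality, 1);  // block-wise rank of the new key
+ *   local_mapping_index[idx] = rank;
+ * } else {
+ *   local_mapping_index[idx] = local_mapping_index[*slot];  // reuse prior rank
+ * }
+ * @endcode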
+ */ +template +CUDF_KERNEL void mapping_indices_kernel(cudf::size_type num_input_rows, + SetRef global_set, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, + cudf::size_type* local_mapping_index, + cudf::size_type* global_mapping_index, + cudf::size_type* block_cardinality, + cuda::std::atomic_flag* needs_global_memory_fallback) +{ + __shared__ cudf::size_type shared_set_indices[GROUPBY_SHM_MAX_ELEMENTS]; + + // Shared set initialization + __shared__ cuco::window windows[window_extent.value()]; + + auto raw_set = cuco::static_set_ref{ + cuco::empty_key{cudf::detail::CUDF_SIZE_TYPE_SENTINEL}, + global_set.key_eq(), + probing_scheme_t{global_set.hash_function()}, + cuco::thread_scope_block, + cuco::aow_storage_ref{ + window_extent, windows}}; + auto shared_set = raw_set.rebind_operators(cuco::insert_and_find); + + auto const block = cooperative_groups::this_thread_block(); + shared_set.initialize(block); + + __shared__ cudf::size_type cardinality; + if (block.thread_rank() == 0) { cardinality = 0; } + block.sync(); + + auto const stride = cudf::detail::grid_1d::grid_stride(); + + for (auto idx = cudf::detail::grid_1d::global_thread_id(); + idx - block.thread_rank() < num_input_rows; + idx += stride) { + find_local_mapping(block, + idx, + num_input_rows, + shared_set, + row_bitmask, + skip_rows_with_nulls, + &cardinality, + local_mapping_index, + shared_set_indices); + + block.sync(); + + if (cardinality >= GROUPBY_CARDINALITY_THRESHOLD) { + if (block.thread_rank() == 0) { needs_global_memory_fallback->test_and_set(); } + break; + } + } + + // Insert unique keys from shared to global hash set if block-cardinality + // doesn't exceed the threshold upper-limit + if (cardinality < GROUPBY_CARDINALITY_THRESHOLD) { + find_global_mapping(block, cardinality, global_set, shared_set_indices, global_mapping_index); + } + + if (block.thread_rank() == 0) { block_cardinality[block.group_index().x] = cardinality; } +} + +template +cudf::size_type max_occupancy_grid_size(cudf::size_type n) +{ + cudf::size_type max_active_blocks{-1}; + CUDF_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &max_active_blocks, mapping_indices_kernel, GROUPBY_BLOCK_SIZE, 0)); + auto const grid_size = max_active_blocks * cudf::detail::num_multiprocessors(); + auto const num_blocks = cudf::util::div_rounding_up_safe(n, GROUPBY_BLOCK_SIZE); + return std::min(grid_size, num_blocks); +} + +template +void compute_mapping_indices(cudf::size_type grid_size, + cudf::size_type num, + SetRef global_set, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, + cudf::size_type* local_mapping_index, + cudf::size_type* global_mapping_index, + cudf::size_type* block_cardinality, + cuda::std::atomic_flag* needs_global_memory_fallback, + rmm::cuda_stream_view stream) +{ + mapping_indices_kernel<<>>( + num, + global_set, + row_bitmask, + skip_rows_with_nulls, + local_mapping_index, + global_mapping_index, + block_cardinality, + needs_global_memory_fallback); +} +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_mapping_indices.hpp b/cpp/src/groupby/hash/compute_mapping_indices.hpp new file mode 100644 index 00000000000..473ad99e650 --- /dev/null +++ b/cpp/src/groupby/hash/compute_mapping_indices.hpp @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include + +#include + +#include + +namespace cudf::groupby::detail::hash { +/* + * @brief Computes the maximum number of active blocks of the given kernel that can be executed on + * the underlying device + */ +template +[[nodiscard]] cudf::size_type max_occupancy_grid_size(cudf::size_type n); + +template +void compute_mapping_indices(cudf::size_type grid_size, + cudf::size_type num, + SetRef global_set, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, + cudf::size_type* local_mapping_index, + cudf::size_type* global_mapping_index, + cudf::size_type* block_cardinality, + cuda::std::atomic_flag* needs_global_memory_fallback, + rmm::cuda_stream_view stream); +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_mapping_indices_null.cu b/cpp/src/groupby/hash/compute_mapping_indices_null.cu new file mode 100644 index 00000000000..81c3c9e456f --- /dev/null +++ b/cpp/src/groupby/hash/compute_mapping_indices_null.cu @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "compute_mapping_indices.cuh" +#include "compute_mapping_indices.hpp" + +namespace cudf::groupby::detail::hash { +template cudf::size_type +max_occupancy_grid_size>(cudf::size_type n); + +template void compute_mapping_indices>( + cudf::size_type grid_size, + cudf::size_type num, + nullable_hash_set_ref_t global_set, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, + cudf::size_type* local_mapping_index, + cudf::size_type* global_mapping_index, + cudf::size_type* block_cardinality, + cuda::std::atomic_flag* needs_global_memory_fallback, + rmm::cuda_stream_view stream); +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_shared_memory_aggs.cu b/cpp/src/groupby/hash/compute_shared_memory_aggs.cu new file mode 100644 index 00000000000..f0361ccced2 --- /dev/null +++ b/cpp/src/groupby/hash/compute_shared_memory_aggs.cu @@ -0,0 +1,320 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "compute_shared_memory_aggs.hpp" +#include "global_memory_aggregator.cuh" +#include "helpers.cuh" +#include "shared_memory_aggregator.cuh" +#include "single_pass_functors.cuh" + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +namespace cudf::groupby::detail::hash { +namespace { +/// Functor used by type dispatcher returning the size of the underlying C++ type +struct size_of_functor { + template + __device__ constexpr cudf::size_type operator()() + { + return sizeof(T); + } +}; + +/// Shared memory data alignment +CUDF_HOST_DEVICE cudf::size_type constexpr ALIGNMENT = 8; + +// Allocates shared memory required for output columns. Exits if there is insufficient memory to +// perform shared memory aggregation for the current output column. +__device__ void calculate_columns_to_aggregate(cudf::size_type& col_start, + cudf::size_type& col_end, + cudf::mutable_table_device_view output_values, + cudf::size_type output_size, + cudf::size_type* shmem_agg_res_offsets, + cudf::size_type* shmem_agg_mask_offsets, + cudf::size_type cardinality, + cudf::size_type total_agg_size) +{ + col_start = col_end; + cudf::size_type bytes_allocated = 0; + + auto const valid_col_size = + cudf::util::round_up_safe(static_cast(sizeof(bool) * cardinality), ALIGNMENT); + + while (bytes_allocated < total_agg_size && col_end < output_size) { + auto const col_idx = col_end; + auto const next_col_size = + cudf::util::round_up_safe(cudf::type_dispatcher( + output_values.column(col_idx).type(), size_of_functor{}) * + cardinality, + ALIGNMENT); + auto const next_col_total_size = next_col_size + valid_col_size; + + if (bytes_allocated + next_col_total_size > total_agg_size) { break; } + + shmem_agg_res_offsets[col_end] = bytes_allocated; + shmem_agg_mask_offsets[col_end] = bytes_allocated + next_col_size; + + bytes_allocated += next_col_total_size; + ++col_end; + } +} + +// Each block initialize its own shared memory aggregation results +__device__ void initialize_shmem_aggregations(cooperative_groups::thread_block const& block, + cudf::size_type col_start, + cudf::size_type col_end, + cudf::mutable_table_device_view output_values, + cuda::std::byte* shmem_agg_storage, + cudf::size_type* shmem_agg_res_offsets, + cudf::size_type* shmem_agg_mask_offsets, + cudf::size_type cardinality, + cudf::aggregation::Kind const* d_agg_kinds) +{ + for (auto col_idx = col_start; col_idx < col_end; col_idx++) { + for (auto idx = block.thread_rank(); idx < cardinality; idx += block.num_threads()) { + auto target = + reinterpret_cast(shmem_agg_storage + shmem_agg_res_offsets[col_idx]); + auto target_mask = + reinterpret_cast(shmem_agg_storage + shmem_agg_mask_offsets[col_idx]); + cudf::detail::dispatch_type_and_aggregation(output_values.column(col_idx).type(), + d_agg_kinds[col_idx], + initialize_shmem{}, + target, + target_mask, + idx); + } + } + block.sync(); +} + +__device__ void compute_pre_aggregrations(cudf::size_type col_start, + cudf::size_type col_end, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, + cudf::table_device_view source, + cudf::size_type num_input_rows, + cudf::size_type* local_mapping_index, + cuda::std::byte* shmem_agg_storage, + cudf::size_type* shmem_agg_res_offsets, + cudf::size_type* shmem_agg_mask_offsets, + cudf::aggregation::Kind const* d_agg_kinds) +{ + // Aggregates global memory sources to shared memory targets + for 
(auto source_idx = cudf::detail::grid_1d::global_thread_id(); source_idx < num_input_rows; + source_idx += cudf::detail::grid_1d::grid_stride()) { + if (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, source_idx)) { + auto const target_idx = local_mapping_index[source_idx]; + for (auto col_idx = col_start; col_idx < col_end; col_idx++) { + auto const source_col = source.column(col_idx); + + cuda::std::byte* target = + reinterpret_cast(shmem_agg_storage + shmem_agg_res_offsets[col_idx]); + bool* target_mask = + reinterpret_cast(shmem_agg_storage + shmem_agg_mask_offsets[col_idx]); + + cudf::detail::dispatch_type_and_aggregation(source_col.type(), + d_agg_kinds[col_idx], + shmem_element_aggregator{}, + target, + target_mask, + target_idx, + source_col, + source_idx); + } + } + } +} + +__device__ void compute_final_aggregations(cooperative_groups::thread_block const& block, + cudf::size_type col_start, + cudf::size_type col_end, + cudf::table_device_view input_values, + cudf::mutable_table_device_view target, + cudf::size_type cardinality, + cudf::size_type* global_mapping_index, + cuda::std::byte* shmem_agg_storage, + cudf::size_type* agg_res_offsets, + cudf::size_type* agg_mask_offsets, + cudf::aggregation::Kind const* d_agg_kinds) +{ + // Aggregates shared memory sources to global memory targets + for (auto idx = block.thread_rank(); idx < cardinality; idx += block.num_threads()) { + auto const target_idx = + global_mapping_index[block.group_index().x * GROUPBY_SHM_MAX_ELEMENTS + idx]; + for (auto col_idx = col_start; col_idx < col_end; col_idx++) { + auto target_col = target.column(col_idx); + + cuda::std::byte* source = + reinterpret_cast(shmem_agg_storage + agg_res_offsets[col_idx]); + bool* source_mask = reinterpret_cast(shmem_agg_storage + agg_mask_offsets[col_idx]); + + cudf::detail::dispatch_type_and_aggregation(input_values.column(col_idx).type(), + d_agg_kinds[col_idx], + gmem_element_aggregator{}, + target_col, + target_idx, + input_values.column(col_idx), + source, + source_mask, + idx); + } + } + block.sync(); +} + +/* Takes the local_mapping_index and global_mapping_index to compute + * pre (shared) and final (global) aggregates*/ +CUDF_KERNEL void single_pass_shmem_aggs_kernel(cudf::size_type num_rows, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, + cudf::size_type* local_mapping_index, + cudf::size_type* global_mapping_index, + cudf::size_type* block_cardinality, + cudf::table_device_view input_values, + cudf::mutable_table_device_view output_values, + cudf::aggregation::Kind const* d_agg_kinds, + cudf::size_type total_agg_size, + cudf::size_type offsets_size) +{ + auto const block = cooperative_groups::this_thread_block(); + auto const cardinality = block_cardinality[block.group_index().x]; + if (cardinality >= GROUPBY_CARDINALITY_THRESHOLD) { return; } + + auto const num_cols = output_values.num_columns(); + + __shared__ cudf::size_type col_start; + __shared__ cudf::size_type col_end; + extern __shared__ cuda::std::byte shmem_agg_storage[]; + + cudf::size_type* shmem_agg_res_offsets = + reinterpret_cast(shmem_agg_storage + total_agg_size); + cudf::size_type* shmem_agg_mask_offsets = + reinterpret_cast(shmem_agg_storage + total_agg_size + offsets_size); + + if (block.thread_rank() == 0) { + col_start = 0; + col_end = 0; + } + block.sync(); + + while (col_end < num_cols) { + if (block.thread_rank() == 0) { + calculate_columns_to_aggregate(col_start, + col_end, + output_values, + num_cols, + shmem_agg_res_offsets, + shmem_agg_mask_offsets, + 
cardinality,
+                                     total_agg_size);
+    }
+    block.sync();
+
+    initialize_shmem_aggregations(block,
+                                  col_start,
+                                  col_end,
+                                  output_values,
+                                  shmem_agg_storage,
+                                  shmem_agg_res_offsets,
+                                  shmem_agg_mask_offsets,
+                                  cardinality,
+                                  d_agg_kinds);
+
+    compute_pre_aggregrations(col_start,
+                              col_end,
+                              row_bitmask,
+                              skip_rows_with_nulls,
+                              input_values,
+                              num_rows,
+                              local_mapping_index,
+                              shmem_agg_storage,
+                              shmem_agg_res_offsets,
+                              shmem_agg_mask_offsets,
+                              d_agg_kinds);
+    block.sync();
+
+    compute_final_aggregations(block,
+                               col_start,
+                               col_end,
+                               input_values,
+                               output_values,
+                               cardinality,
+                               global_mapping_index,
+                               shmem_agg_storage,
+                               shmem_agg_res_offsets,
+                               shmem_agg_mask_offsets,
+                               d_agg_kinds);
+  }
+}
+}  // namespace
+
+std::size_t get_available_shared_memory_size(cudf::size_type grid_size)
+{
+  auto const active_blocks_per_sm =
+    cudf::util::div_rounding_up_safe(grid_size, cudf::detail::num_multiprocessors());
+
+  size_t dynamic_shmem_size = 0;
+  CUDF_CUDA_TRY(cudaOccupancyAvailableDynamicSMemPerBlock(
+    &dynamic_shmem_size, single_pass_shmem_aggs_kernel, active_blocks_per_sm, GROUPBY_BLOCK_SIZE));
+  return cudf::util::round_down_safe(static_cast(0.5 * dynamic_shmem_size),
+                                     ALIGNMENT);
+}
+
+void compute_shared_memory_aggs(cudf::size_type grid_size,
+                                std::size_t available_shmem_size,
+                                cudf::size_type num_input_rows,
+                                bitmask_type const* row_bitmask,
+                                bool skip_rows_with_nulls,
+                                cudf::size_type* local_mapping_index,
+                                cudf::size_type* global_mapping_index,
+                                cudf::size_type* block_cardinality,
+                                cudf::table_device_view input_values,
+                                cudf::mutable_table_device_view output_values,
+                                cudf::aggregation::Kind const* d_agg_kinds,
+                                rmm::cuda_stream_view stream)
+{
+  // For each aggregation, we need one offset determining where the aggregation
+  // is performed and another indicating the validity of the aggregation
+  auto const offsets_size = compute_shmem_offsets_size(output_values.num_columns());
+  // The rest of shmem is utilized for the actual arrays in shmem
+  CUDF_EXPECTS(available_shmem_size > offsets_size * 2,
+               "Not enough space for shared memory aggregations");
+  auto const shmem_agg_size = available_shmem_size - offsets_size * 2;
+  single_pass_shmem_aggs_kernel<<>>(
+    num_input_rows,
+    row_bitmask,
+    skip_rows_with_nulls,
+    local_mapping_index,
+    global_mapping_index,
+    block_cardinality,
+    input_values,
+    output_values,
+    d_agg_kinds,
+    shmem_agg_size,
+    offsets_size);
+}
+}  // namespace cudf::groupby::detail::hash
diff --git a/cpp/src/groupby/hash/compute_shared_memory_aggs.hpp b/cpp/src/groupby/hash/compute_shared_memory_aggs.hpp
new file mode 100644
index 00000000000..346956cdab0
--- /dev/null
+++ b/cpp/src/groupby/hash/compute_shared_memory_aggs.hpp
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ +#pragma once + +#include +#include +#include + +#include + +namespace cudf::groupby::detail::hash { +std::size_t get_available_shared_memory_size(cudf::size_type grid_size); + +std::size_t constexpr compute_shmem_offsets_size(cudf::size_type num_cols) +{ + return sizeof(cudf::size_type) * num_cols; +} + +void compute_shared_memory_aggs(cudf::size_type grid_size, + std::size_t available_shmem_size, + cudf::size_type num_input_rows, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, + cudf::size_type* local_mapping_index, + cudf::size_type* global_mapping_index, + cudf::size_type* block_cardinality, + cudf::table_device_view input_values, + cudf::mutable_table_device_view output_values, + cudf::aggregation::Kind const* d_agg_kinds, + rmm::cuda_stream_view stream); +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/create_sparse_results_table.cu b/cpp/src/groupby/hash/create_sparse_results_table.cu new file mode 100644 index 00000000000..bc32e306b3f --- /dev/null +++ b/cpp/src/groupby/hash/create_sparse_results_table.cu @@ -0,0 +1,124 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "create_sparse_results_table.hpp" +#include "helpers.cuh" +#include "single_pass_functors.cuh" + +#include +#include +#include +#include +#include + +#include +#include + +#include + +#include +#include +#include + +namespace cudf::groupby::detail::hash { +template +void extract_populated_keys(SetType const& key_set, + rmm::device_uvector& populated_keys, + rmm::cuda_stream_view stream) +{ + auto const keys_end = key_set.retrieve_all(populated_keys.begin(), stream.value()); + + populated_keys.resize(std::distance(populated_keys.begin(), keys_end), stream); +} + +// make table that will hold sparse results +template +cudf::table create_sparse_results_table(cudf::table_view const& flattened_values, + cudf::aggregation::Kind const* d_agg_kinds, + std::vector agg_kinds, + bool direct_aggregations, + GlobalSetType const& global_set, + rmm::device_uvector& populated_keys, + rmm::cuda_stream_view stream) +{ + // TODO single allocation - room for performance improvement + std::vector> sparse_columns; + std::transform(flattened_values.begin(), + flattened_values.end(), + agg_kinds.begin(), + std::back_inserter(sparse_columns), + [stream](auto const& col, auto const& agg) { + auto const nullable = + (agg == cudf::aggregation::COUNT_VALID or agg == cudf::aggregation::COUNT_ALL) + ? false + : (col.has_nulls() or agg == cudf::aggregation::VARIANCE or + agg == cudf::aggregation::STD); + auto const mask_flag = + (nullable) ? cudf::mask_state::ALL_NULL : cudf::mask_state::UNALLOCATED; + auto const col_type = cudf::is_dictionary(col.type()) + ? 
cudf::dictionary_column_view(col).keys().type() + : col.type(); + return make_fixed_width_column( + cudf::detail::target_type(col_type, agg), col.size(), mask_flag, stream); + }); + cudf::table sparse_table(std::move(sparse_columns)); + // If no direct aggregations, initialize the sparse table + // only for the keys inserted in global hash set + if (!direct_aggregations) { + auto d_sparse_table = cudf::mutable_table_device_view::create(sparse_table, stream); + extract_populated_keys(global_set, populated_keys, stream); + thrust::for_each_n( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + populated_keys.size(), + initialize_sparse_table{populated_keys.data(), *d_sparse_table, d_agg_kinds}); + } + // Else initialize the whole table + else { + cudf::mutable_table_view sparse_table_view = sparse_table.mutable_view(); + cudf::detail::initialize_with_identity(sparse_table_view, agg_kinds, stream); + } + return sparse_table; +} + +template void extract_populated_keys( + global_set_t const& key_set, + rmm::device_uvector& populated_keys, + rmm::cuda_stream_view stream); + +template void extract_populated_keys( + nullable_global_set_t const& key_set, + rmm::device_uvector& populated_keys, + rmm::cuda_stream_view stream); + +template cudf::table create_sparse_results_table( + cudf::table_view const& flattened_values, + cudf::aggregation::Kind const* d_agg_kinds, + std::vector agg_kinds, + bool direct_aggregations, + global_set_t const& global_set, + rmm::device_uvector& populated_keys, + rmm::cuda_stream_view stream); + +template cudf::table create_sparse_results_table( + cudf::table_view const& flattened_values, + cudf::aggregation::Kind const* d_agg_kinds, + std::vector agg_kinds, + bool direct_aggregations, + nullable_global_set_t const& global_set, + rmm::device_uvector& populated_keys, + rmm::cuda_stream_view stream); +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/create_sparse_results_table.hpp b/cpp/src/groupby/hash/create_sparse_results_table.hpp new file mode 100644 index 00000000000..8155ce852e0 --- /dev/null +++ b/cpp/src/groupby/hash/create_sparse_results_table.hpp @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include +#include + +#include +#include + +#include + +namespace cudf::groupby::detail::hash { +/** + * @brief Computes and returns a device vector containing all populated keys in + * `key_set`. 
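A quick usage sketch for this helper, assuming a populated global hash set `key_set` and `num_keys` input rows, mirroring how the deleted groupby.cu code called it (illustrative, not part of the patch):

  // Upper-bound the output by the number of input rows; the helper
  // shrinks it to the actual number of distinct keys retrieved.
  rmm::device_uvector<cudf::size_type> populated_keys(num_keys, stream);
  extract_populated_keys(key_set, populated_keys, stream);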
+ * + * @tparam SetType Type of the key hash set + * + * @param key_set Key hash set + * @param[out] populated_keys Array of unique keys retrieved from `key_set`, resized to the number of keys found + * @param stream CUDA stream used for device memory operations and kernel launches + */ +template <typename SetType> +void extract_populated_keys(SetType const& key_set, + rmm::device_uvector<cudf::size_type>& populated_keys, + rmm::cuda_stream_view stream); + +// make table that will hold sparse results +template <typename GlobalSetType> +cudf::table create_sparse_results_table(cudf::table_view const& flattened_values, + cudf::aggregation::Kind const* d_agg_kinds, + std::vector<cudf::aggregation::Kind> agg_kinds, + bool direct_aggregations, + GlobalSetType const& global_set, + rmm::device_uvector<cudf::size_type>& populated_keys, + rmm::cuda_stream_view stream); +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/flatten_single_pass_aggs.cpp b/cpp/src/groupby/hash/flatten_single_pass_aggs.cpp new file mode 100644 index 00000000000..b2048a9fbb8 --- /dev/null +++ b/cpp/src/groupby/hash/flatten_single_pass_aggs.cpp @@ -0,0 +1,139 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "flatten_single_pass_aggs.hpp" + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +namespace cudf::groupby::detail::hash { + +class groupby_simple_aggregations_collector final + : public cudf::detail::simple_aggregations_collector { + public: + using cudf::detail::simple_aggregations_collector::visit; + + std::vector<std::unique_ptr<aggregation>> visit(data_type col_type, + cudf::detail::min_aggregation const&) override + { + std::vector<std::unique_ptr<aggregation>> aggs; + aggs.push_back(col_type.id() == type_id::STRING ? make_argmin_aggregation() + : make_min_aggregation()); + return aggs; + } + + std::vector<std::unique_ptr<aggregation>> visit(data_type col_type, + cudf::detail::max_aggregation const&) override + { + std::vector<std::unique_ptr<aggregation>> aggs; + aggs.push_back(col_type.id() == type_id::STRING ? 
make_argmax_aggregation() + : make_max_aggregation()); + return aggs; + } + + std::vector> visit(data_type col_type, + cudf::detail::mean_aggregation const&) override + { + (void)col_type; + CUDF_EXPECTS(is_fixed_width(col_type), "MEAN aggregation expects fixed width type"); + std::vector> aggs; + aggs.push_back(make_sum_aggregation()); + // COUNT_VALID + aggs.push_back(make_count_aggregation()); + + return aggs; + } + + std::vector> visit(data_type, + cudf::detail::var_aggregation const&) override + { + std::vector> aggs; + aggs.push_back(make_sum_aggregation()); + // COUNT_VALID + aggs.push_back(make_count_aggregation()); + + return aggs; + } + + std::vector> visit(data_type, + cudf::detail::std_aggregation const&) override + { + std::vector> aggs; + aggs.push_back(make_sum_aggregation()); + // COUNT_VALID + aggs.push_back(make_count_aggregation()); + + return aggs; + } + + std::vector> visit( + data_type, cudf::detail::correlation_aggregation const&) override + { + std::vector> aggs; + aggs.push_back(make_sum_aggregation()); + // COUNT_VALID + aggs.push_back(make_count_aggregation()); + + return aggs; + } +}; + +// flatten aggs to filter in single pass aggs +std::tuple, std::vector>> +flatten_single_pass_aggs(host_span requests) +{ + std::vector columns; + std::vector> aggs; + std::vector agg_kinds; + + for (auto const& request : requests) { + auto const& agg_v = request.aggregations; + + std::unordered_set agg_kinds_set; + auto insert_agg = [&](column_view const& request_values, std::unique_ptr&& agg) { + if (agg_kinds_set.insert(agg->kind).second) { + agg_kinds.push_back(agg->kind); + aggs.push_back(std::move(agg)); + columns.push_back(request_values); + } + }; + + auto values_type = cudf::is_dictionary(request.values.type()) + ? cudf::dictionary_column_view(request.values).keys().type() + : request.values.type(); + for (auto&& agg : agg_v) { + groupby_simple_aggregations_collector collector; + + for (auto& agg_s : agg->get_simple_aggregations(values_type, collector)) { + insert_agg(request.values, std::move(agg_s)); + } + } + } + + return std::make_tuple(table_view(columns), std::move(agg_kinds), std::move(aggs)); +} + +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/flatten_single_pass_aggs.hpp b/cpp/src/groupby/hash/flatten_single_pass_aggs.hpp new file mode 100644 index 00000000000..dfad51f27d4 --- /dev/null +++ b/cpp/src/groupby/hash/flatten_single_pass_aggs.hpp @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include +#include +#include + +#include +#include +#include + +namespace cudf::groupby::detail::hash { + +// flatten aggs to filter in single pass aggs +std::tuple, std::vector>> +flatten_single_pass_aggs(host_span requests); + +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/global_memory_aggregator.cuh b/cpp/src/groupby/hash/global_memory_aggregator.cuh new file mode 100644 index 00000000000..50e89c727ff --- /dev/null +++ b/cpp/src/groupby/hash/global_memory_aggregator.cuh @@ -0,0 +1,277 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include +#include + +#include +#include + +namespace cudf::groupby::detail::hash { +template +struct update_target_element_gmem { + __device__ void operator()(cudf::mutable_column_device_view, + cudf::size_type, + cudf::column_device_view, + cuda::std::byte*, + cudf::size_type) const noexcept + { + CUDF_UNREACHABLE("Invalid source type and aggregation combination."); + } +}; + +template +struct update_target_element_gmem< + Source, + cudf::aggregation::MIN, + cuda::std::enable_if_t() && cudf::has_atomic_support()>> { + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index, + cudf::column_device_view source_column, + cuda::std::byte* source, + cudf::size_type source_index) const noexcept + { + using DeviceType = cudf::detail::underlying_target_t; + DeviceType* source_casted = reinterpret_cast(source); + cudf::detail::atomic_min(&target.element(target_index), + static_cast(source_casted[source_index])); + + if (target.is_null(target_index)) { target.set_valid(target_index); } + } +}; + +template +struct update_target_element_gmem< + Source, + cudf::aggregation::MAX, + cuda::std::enable_if_t() && cudf::has_atomic_support()>> { + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index, + cudf::column_device_view source_column, + cuda::std::byte* source, + cudf::size_type source_index) const noexcept + { + using DeviceType = cudf::detail::underlying_target_t; + DeviceType* source_casted = reinterpret_cast(source); + cudf::detail::atomic_max(&target.element(target_index), + static_cast(source_casted[source_index])); + + if (target.is_null(target_index)) { target.set_valid(target_index); } + } +}; + +template +struct update_target_element_gmem< + Source, + cudf::aggregation::SUM, + cuda::std::enable_if_t() && cudf::has_atomic_support() && + !cudf::is_timestamp()>> { + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index, + cudf::column_device_view source_column, + cuda::std::byte* source, + cudf::size_type source_index) const noexcept + { + using DeviceType = cudf::detail::underlying_target_t; + DeviceType* source_casted = reinterpret_cast(source); + cudf::detail::atomic_add(&target.element(target_index), + static_cast(source_casted[source_index])); + + if 
(target.is_null(target_index)) { target.set_valid(target_index); } + } +}; + +// The shared memory will already have it squared +template +struct update_target_element_gmem< + Source, + cudf::aggregation::SUM_OF_SQUARES, + cuda::std::enable_if_t()>> { + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index, + cudf::column_device_view source_column, + cuda::std::byte* source, + cudf::size_type source_index) const noexcept + { + using Target = cudf::detail::target_type_t; + + Target* source_casted = reinterpret_cast(source); + Target value = static_cast(source_casted[source_index]); + + cudf::detail::atomic_add(&target.element(target_index), value); + + if (target.is_null(target_index)) { target.set_valid(target_index); } + } +}; + +template +struct update_target_element_gmem< + Source, + cudf::aggregation::PRODUCT, + cuda::std::enable_if_t()>> { + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index, + cudf::column_device_view source_column, + cuda::std::byte* source, + cudf::size_type source_index) const noexcept + { + using Target = cudf::detail::target_type_t; + + Target* source_casted = reinterpret_cast(source); + cudf::detail::atomic_mul(&target.element(target_index), + static_cast(source_casted[source_index])); + + if (target.is_null(target_index)) { target.set_valid(target_index); } + } +}; + +// Assuming that the target column of COUNT_VALID, COUNT_ALL would be using fixed_width column and +// non-fixed point column +template +struct update_target_element_gmem< + Source, + cudf::aggregation::COUNT_VALID, + cuda::std::enable_if_t< + cudf::detail::is_valid_aggregation()>> { + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index, + cudf::column_device_view source_column, + cuda::std::byte* source, + cudf::size_type source_index) const noexcept + { + using Target = cudf::detail::target_type_t; + + Target* source_casted = reinterpret_cast(source); + cudf::detail::atomic_add(&target.element(target_index), + static_cast(source_casted[source_index])); + + // It is assumed the output for COUNT_VALID is initialized to be all valid + } +}; + +template +struct update_target_element_gmem< + Source, + cudf::aggregation::COUNT_ALL, + cuda::std::enable_if_t< + cudf::detail::is_valid_aggregation()>> { + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index, + cudf::column_device_view source_column, + cuda::std::byte* source, + cudf::size_type source_index) const noexcept + { + using Target = cudf::detail::target_type_t; + + Target* source_casted = reinterpret_cast(source); + cudf::detail::atomic_add(&target.element(target_index), + static_cast(source_casted[source_index])); + + // It is assumed the output for COUNT_ALL is initialized to be all valid + } +}; + +template +struct update_target_element_gmem< + Source, + cudf::aggregation::ARGMAX, + cuda::std::enable_if_t() and + cudf::is_relationally_comparable()>> { + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index, + cudf::column_device_view source_column, + cuda::std::byte* source, + cudf::size_type source_index) const noexcept + { + using Target = cudf::detail::target_type_t; + Target* source_casted = reinterpret_cast(source); + auto source_argmax_index = source_casted[source_index]; + auto old = cudf::detail::atomic_cas( + &target.element(target_index), cudf::detail::ARGMAX_SENTINEL, source_argmax_index); + if (old != 
cudf::detail::ARGMAX_SENTINEL) { + while (source_column.element(source_argmax_index) > + source_column.element(old)) { + old = + cudf::detail::atomic_cas(&target.element(target_index), old, source_argmax_index); + } + } + + if (target.is_null(target_index)) { target.set_valid(target_index); } + } +}; +template +struct update_target_element_gmem< + Source, + cudf::aggregation::ARGMIN, + cuda::std::enable_if_t() and + cudf::is_relationally_comparable()>> { + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index, + cudf::column_device_view source_column, + cuda::std::byte* source, + cudf::size_type source_index) const noexcept + { + using Target = cudf::detail::target_type_t; + Target* source_casted = reinterpret_cast(source); + auto source_argmin_index = source_casted[source_index]; + auto old = cudf::detail::atomic_cas( + &target.element(target_index), cudf::detail::ARGMIN_SENTINEL, source_argmin_index); + if (old != cudf::detail::ARGMIN_SENTINEL) { + while (source_column.element(source_argmin_index) < + source_column.element(old)) { + old = + cudf::detail::atomic_cas(&target.element(target_index), old, source_argmin_index); + } + } + + if (target.is_null(target_index)) { target.set_valid(target_index); } + } +}; + +/** + * @brief A functor that updates a single element in the target column stored in global memory by + * applying an aggregation operation to a corresponding element from a source column in shared + * memory. + * + * This functor can NOT be used for dictionary columns. + * + * This is a redundant copy replicating the behavior of `elementwise_aggregator` from + * `cudf/detail/aggregation/device_aggregators.cuh`. The key difference is that this functor accepts + * a pointer to raw bytes as the source, as `column_device_view` cannot yet be constructed from + * shared memory. + */ +struct gmem_element_aggregator { + template + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index, + cudf::column_device_view source_column, + cuda::std::byte* source, + bool* source_mask, + cudf::size_type source_index) const noexcept + { + // Early exit for all aggregation kinds since shared memory aggregation of + // `COUNT_ALL` is always valid + if (!source_mask[source_index]) { return; } + + update_target_element_gmem{}( + target, target_index, source_column, source, source_index); + } +}; +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index f9a80a048b5..30e1d52fdbf 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -14,58 +14,32 @@ * limitations under the License. 
*/ +#include "compute_groupby.hpp" #include "groupby/common/utils.hpp" -#include "groupby/hash/groupby_kernels.cuh" +#include "helpers.cuh" #include -#include -#include -#include -#include -#include #include -#include -#include -#include #include -#include -#include -#include +#include #include #include -#include #include #include -#include #include #include -#include #include #include #include -#include -#include -#include - +#include #include -#include #include +#include -namespace cudf { -namespace groupby { -namespace detail { -namespace hash { +namespace cudf::groupby::detail::hash { namespace { - -// TODO: similar to `contains_table`, using larger CG size like 2 or 4 for nested -// types and `cg_size = 1`for flat data to improve performance -using probing_scheme_type = cuco::linear_probing< - 1, ///< Number of threads used to handle each input key - cudf::experimental::row::hash::device_row_hasher>; - /** * @brief List of aggregation operations that can be computed with a hash-based * implementation. @@ -110,517 +84,33 @@ bool constexpr is_hash_aggregation(aggregation::Kind t) return array_contains(hash_aggregations, t); } -class groupby_simple_aggregations_collector final - : public cudf::detail::simple_aggregations_collector { - public: - using cudf::detail::simple_aggregations_collector::visit; - - std::vector> visit(data_type col_type, - cudf::detail::min_aggregation const&) override - { - std::vector> aggs; - aggs.push_back(col_type.id() == type_id::STRING ? make_argmin_aggregation() - : make_min_aggregation()); - return aggs; - } - - std::vector> visit(data_type col_type, - cudf::detail::max_aggregation const&) override - { - std::vector> aggs; - aggs.push_back(col_type.id() == type_id::STRING ? make_argmax_aggregation() - : make_max_aggregation()); - return aggs; - } - - std::vector> visit(data_type col_type, - cudf::detail::mean_aggregation const&) override - { - (void)col_type; - CUDF_EXPECTS(is_fixed_width(col_type), "MEAN aggregation expects fixed width type"); - std::vector> aggs; - aggs.push_back(make_sum_aggregation()); - // COUNT_VALID - aggs.push_back(make_count_aggregation()); - - return aggs; - } - - std::vector> visit(data_type, - cudf::detail::var_aggregation const&) override - { - std::vector> aggs; - aggs.push_back(make_sum_aggregation()); - // COUNT_VALID - aggs.push_back(make_count_aggregation()); - - return aggs; - } - - std::vector> visit(data_type, - cudf::detail::std_aggregation const&) override - { - std::vector> aggs; - aggs.push_back(make_sum_aggregation()); - // COUNT_VALID - aggs.push_back(make_count_aggregation()); - - return aggs; - } - - std::vector> visit( - data_type, cudf::detail::correlation_aggregation const&) override - { - std::vector> aggs; - aggs.push_back(make_sum_aggregation()); - // COUNT_VALID - aggs.push_back(make_count_aggregation()); - - return aggs; - } -}; - -template -class hash_compound_agg_finalizer final : public cudf::detail::aggregation_finalizer { - column_view col; - data_type result_type; - cudf::detail::result_cache* sparse_results; - cudf::detail::result_cache* dense_results; - device_span gather_map; - SetType set; - bitmask_type const* __restrict__ row_bitmask; - rmm::cuda_stream_view stream; - rmm::device_async_resource_ref mr; - - public: - using cudf::detail::aggregation_finalizer::visit; - - hash_compound_agg_finalizer(column_view col, - cudf::detail::result_cache* sparse_results, - cudf::detail::result_cache* dense_results, - device_span gather_map, - SetType set, - bitmask_type const* row_bitmask, - 
rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) - : col(col), - sparse_results(sparse_results), - dense_results(dense_results), - gather_map(gather_map), - set(set), - row_bitmask(row_bitmask), - stream(stream), - mr(mr) - { - result_type = cudf::is_dictionary(col.type()) ? cudf::dictionary_column_view(col).keys().type() - : col.type(); - } - - auto to_dense_agg_result(cudf::aggregation const& agg) - { - auto s = sparse_results->get_result(col, agg); - auto dense_result_table = cudf::detail::gather(table_view({std::move(s)}), - gather_map, - out_of_bounds_policy::DONT_CHECK, - cudf::detail::negative_index_policy::NOT_ALLOWED, - stream, - mr); - return std::move(dense_result_table->release()[0]); - } - - // Enables conversion of ARGMIN/ARGMAX into MIN/MAX - auto gather_argminmax(aggregation const& agg) - { - auto arg_result = to_dense_agg_result(agg); - // We make a view of ARG(MIN/MAX) result without a null mask and gather - // using this map. The values in data buffer of ARG(MIN/MAX) result - // corresponding to null values was initialized to ARG(MIN/MAX)_SENTINEL - // which is an out of bounds index value (-1) and causes the gathered - // value to be null. - column_view null_removed_map( - data_type(type_to_id()), - arg_result->size(), - static_cast(arg_result->view().template data()), - nullptr, - 0); - auto gather_argminmax = - cudf::detail::gather(table_view({col}), - null_removed_map, - arg_result->nullable() ? cudf::out_of_bounds_policy::NULLIFY - : cudf::out_of_bounds_policy::DONT_CHECK, - cudf::detail::negative_index_policy::NOT_ALLOWED, - stream, - mr); - return std::move(gather_argminmax->release()[0]); - } - - // Declare overloads for each kind of aggregation to dispatch - void visit(cudf::aggregation const& agg) override - { - if (dense_results->has_result(col, agg)) return; - dense_results->add_result(col, agg, to_dense_agg_result(agg)); - } - - void visit(cudf::detail::min_aggregation const& agg) override - { - if (dense_results->has_result(col, agg)) return; - if (result_type.id() == type_id::STRING) { - auto transformed_agg = make_argmin_aggregation(); - dense_results->add_result(col, agg, gather_argminmax(*transformed_agg)); - } else { - dense_results->add_result(col, agg, to_dense_agg_result(agg)); - } - } - - void visit(cudf::detail::max_aggregation const& agg) override - { - if (dense_results->has_result(col, agg)) return; - - if (result_type.id() == type_id::STRING) { - auto transformed_agg = make_argmax_aggregation(); - dense_results->add_result(col, agg, gather_argminmax(*transformed_agg)); - } else { - dense_results->add_result(col, agg, to_dense_agg_result(agg)); - } - } - - void visit(cudf::detail::mean_aggregation const& agg) override - { - if (dense_results->has_result(col, agg)) return; - - auto sum_agg = make_sum_aggregation(); - auto count_agg = make_count_aggregation(); - this->visit(*sum_agg); - this->visit(*count_agg); - column_view sum_result = dense_results->get_result(col, *sum_agg); - column_view count_result = dense_results->get_result(col, *count_agg); - - auto result = - cudf::detail::binary_operation(sum_result, - count_result, - binary_operator::DIV, - cudf::detail::target_type(result_type, aggregation::MEAN), - stream, - mr); - dense_results->add_result(col, agg, std::move(result)); - } - - void visit(cudf::detail::var_aggregation const& agg) override - { - if (dense_results->has_result(col, agg)) return; - - auto sum_agg = make_sum_aggregation(); - auto count_agg = make_count_aggregation(); - this->visit(*sum_agg); - 
this->visit(*count_agg); - column_view sum_result = sparse_results->get_result(col, *sum_agg); - column_view count_result = sparse_results->get_result(col, *count_agg); - - auto values_view = column_device_view::create(col, stream); - auto sum_view = column_device_view::create(sum_result, stream); - auto count_view = column_device_view::create(count_result, stream); - - auto var_result = make_fixed_width_column( - cudf::detail::target_type(result_type, agg.kind), col.size(), mask_state::ALL_NULL, stream); - auto var_result_view = mutable_column_device_view::create(var_result->mutable_view(), stream); - mutable_table_view var_table_view{{var_result->mutable_view()}}; - cudf::detail::initialize_with_identity(var_table_view, {agg.kind}, stream); - - thrust::for_each_n( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - col.size(), - ::cudf::detail::var_hash_functor{ - set, row_bitmask, *var_result_view, *values_view, *sum_view, *count_view, agg._ddof}); - sparse_results->add_result(col, agg, std::move(var_result)); - dense_results->add_result(col, agg, to_dense_agg_result(agg)); - } - - void visit(cudf::detail::std_aggregation const& agg) override - { - if (dense_results->has_result(col, agg)) return; - auto var_agg = make_variance_aggregation(agg._ddof); - this->visit(*dynamic_cast(var_agg.get())); - column_view variance = dense_results->get_result(col, *var_agg); - - auto result = cudf::detail::unary_operation(variance, unary_operator::SQRT, stream, mr); - dense_results->add_result(col, agg, std::move(result)); - } -}; -// flatten aggs to filter in single pass aggs -std::tuple, std::vector>> -flatten_single_pass_aggs(host_span requests) +std::unique_ptr
dispatch_groupby(table_view const& keys, + host_span requests, + cudf::detail::result_cache* cache, + bool const keys_have_nulls, + null_policy const include_null_keys, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) { - std::vector columns; - std::vector> aggs; - std::vector agg_kinds; - - for (auto const& request : requests) { - auto const& agg_v = request.aggregations; - - std::unordered_set agg_kinds_set; - auto insert_agg = [&](column_view const& request_values, std::unique_ptr&& agg) { - if (agg_kinds_set.insert(agg->kind).second) { - agg_kinds.push_back(agg->kind); - aggs.push_back(std::move(agg)); - columns.push_back(request_values); - } - }; - - auto values_type = cudf::is_dictionary(request.values.type()) - ? cudf::dictionary_column_view(request.values).keys().type() - : request.values.type(); - for (auto&& agg : agg_v) { - groupby_simple_aggregations_collector collector; - - for (auto& agg_s : agg->get_simple_aggregations(values_type, collector)) { - insert_agg(request.values, std::move(agg_s)); - } - } - } - - return std::make_tuple(table_view(columns), std::move(agg_kinds), std::move(aggs)); -} - -/** - * @brief Gather sparse results into dense using `gather_map` and add to - * `dense_cache` - * - * @see groupby_null_templated() - */ -template -void sparse_to_dense_results(table_view const& keys, - host_span requests, - cudf::detail::result_cache* sparse_results, - cudf::detail::result_cache* dense_results, - device_span gather_map, - SetType set, - bool keys_have_nulls, - null_policy include_null_keys, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - auto row_bitmask = - cudf::detail::bitmask_and(keys, stream, cudf::get_current_device_resource_ref()).first; - bool skip_key_rows_with_nulls = keys_have_nulls and include_null_keys == null_policy::EXCLUDE; - bitmask_type const* row_bitmask_ptr = - skip_key_rows_with_nulls ? static_cast(row_bitmask.data()) : nullptr; - - for (auto const& request : requests) { - auto const& agg_v = request.aggregations; - auto const& col = request.values; - - // Given an aggregation, this will get the result from sparse_results and - // convert and return dense, compacted result - auto finalizer = hash_compound_agg_finalizer( - col, sparse_results, dense_results, gather_map, set, row_bitmask_ptr, stream, mr); - for (auto&& agg : agg_v) { - agg->finalize(finalizer); - } - } -} - -// make table that will hold sparse results -auto create_sparse_results_table(table_view const& flattened_values, - std::vector aggs, - rmm::cuda_stream_view stream) -{ - // TODO single allocation - room for performance improvement - std::vector> sparse_columns; - std::transform( - flattened_values.begin(), - flattened_values.end(), - aggs.begin(), - std::back_inserter(sparse_columns), - [stream](auto const& col, auto const& agg) { - bool nullable = - (agg == aggregation::COUNT_VALID or agg == aggregation::COUNT_ALL) - ? false - : (col.has_nulls() or agg == aggregation::VARIANCE or agg == aggregation::STD); - auto mask_flag = (nullable) ? mask_state::ALL_NULL : mask_state::UNALLOCATED; - - auto col_type = cudf::is_dictionary(col.type()) - ? 
cudf::dictionary_column_view(col).keys().type() - : col.type(); - - return make_fixed_width_column( - cudf::detail::target_type(col_type, agg), col.size(), mask_flag, stream); - }); - - table sparse_table(std::move(sparse_columns)); - mutable_table_view table_view = sparse_table.mutable_view(); - cudf::detail::initialize_with_identity(table_view, aggs, stream); - return sparse_table; -} - -/** - * @brief Computes all aggregations from `requests` that require a single pass - * over the data and stores the results in `sparse_results` - */ -template -void compute_single_pass_aggs(table_view const& keys, - host_span requests, - cudf::detail::result_cache* sparse_results, - SetType set, - bool keys_have_nulls, - null_policy include_null_keys, - rmm::cuda_stream_view stream) -{ - // flatten the aggs to a table that can be operated on by aggregate_row - auto const [flattened_values, agg_kinds, aggs] = flatten_single_pass_aggs(requests); - - // make table that will hold sparse results - table sparse_table = create_sparse_results_table(flattened_values, agg_kinds, stream); - // prepare to launch kernel to do the actual aggregation - auto d_sparse_table = mutable_table_device_view::create(sparse_table, stream); - auto d_values = table_device_view::create(flattened_values, stream); - auto const d_aggs = cudf::detail::make_device_uvector_async( - agg_kinds, stream, cudf::get_current_device_resource_ref()); - auto const skip_key_rows_with_nulls = - keys_have_nulls and include_null_keys == null_policy::EXCLUDE; - - auto row_bitmask = - skip_key_rows_with_nulls - ? cudf::detail::bitmask_and(keys, stream, cudf::get_current_device_resource_ref()).first - : rmm::device_buffer{}; - - thrust::for_each_n( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - keys.num_rows(), - hash::compute_single_pass_aggs_fn{set, - *d_values, - *d_sparse_table, - d_aggs.data(), - static_cast(row_bitmask.data()), - skip_key_rows_with_nulls}); - // Add results back to sparse_results cache - auto sparse_result_cols = sparse_table.release(); - for (size_t i = 0; i < aggs.size(); i++) { - // Note that the cache will make a copy of this temporary aggregation - sparse_results->add_result( - flattened_values.column(i), *aggs[i], std::move(sparse_result_cols[i])); - } -} - -/** - * @brief Computes and returns a device vector containing all populated keys in - * `map`. - */ -template -rmm::device_uvector extract_populated_keys(SetType const& key_set, - size_type num_keys, - rmm::cuda_stream_view stream) -{ - rmm::device_uvector populated_keys(num_keys, stream); - auto const keys_end = key_set.retrieve_all(populated_keys.begin(), stream.value()); - - populated_keys.resize(std::distance(populated_keys.begin(), keys_end), stream); - return populated_keys; -} - -/** - * @brief Computes groupby using hash table. - * - * First, we create a hash table that stores the indices of unique rows in - * `keys`. The upper limit on the number of values in this map is the number - * of rows in `keys`. - * - * To store the results of aggregations, we create temporary sparse columns - * which have the same size as input value columns. Using the hash map, we - * determine the location within the sparse column to write the result of the - * aggregation into. - * - * The sparse column results of all aggregations are stored into the cache - * `sparse_results`. This enables the use of previously calculated results in - * other aggregations. 
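(For orientation, the sparse-to-dense step this comment describes boils down to a single gather. A minimal sketch, assuming one sparse result column `sparse`, the populated-key index vector `gather_map`, and the usual `stream`/`mr` arguments; the call shape is the same `cudf::detail::gather` used by `to_dense_agg_result` elsewhere in this patch:

  // Gather the sparse-result rows at the populated-key indices; the
  // output is dense, with exactly one row per distinct key.
  auto dense_table = cudf::detail::gather(cudf::table_view({sparse}),
                                          gather_map,
                                          cudf::out_of_bounds_policy::DONT_CHECK,
                                          cudf::detail::negative_index_policy::NOT_ALLOWED,
                                          stream,
                                          mr);
  auto dense_column = std::move(dense_table->release()[0]);

)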
- * - * All the aggregations which can be computed in a single pass are computed - * first, in a combined kernel. Then using these results, aggregations that - * require multiple passes, will be computed. - * - * Finally, using the hash map, we generate a vector of indices of populated - * values in sparse result columns. Then, for each aggregation originally - * requested in `requests`, we gather sparse results into a column of dense - * results using the aforementioned index vector. Dense results are stored into - * the in/out parameter `cache`. - */ -std::unique_ptr
groupby(table_view const& keys, - host_span requests, - cudf::detail::result_cache* cache, - bool const keys_have_nulls, - null_policy const include_null_keys, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - // convert to int64_t to avoid potential overflow with large `keys` - auto const num_keys = static_cast(keys.num_rows()); - auto const null_keys_are_equal = null_equality::EQUAL; - auto const has_null = nullate::DYNAMIC{cudf::has_nested_nulls(keys)}; + auto const null_keys_are_equal = null_equality::EQUAL; + auto const has_null = nullate::DYNAMIC{cudf::has_nested_nulls(keys)}; + auto const skip_rows_with_nulls = keys_have_nulls and include_null_keys == null_policy::EXCLUDE; auto preprocessed_keys = cudf::experimental::row::hash::preprocessed_table::create(keys, stream); auto const comparator = cudf::experimental::row::equality::self_comparator{preprocessed_keys}; auto const row_hash = cudf::experimental::row::hash::row_hasher{std::move(preprocessed_keys)}; auto const d_row_hash = row_hash.device_hasher(has_null); - // Cache of sparse results where the location of aggregate value in each - // column is indexed by the hash set - cudf::detail::result_cache sparse_results(requests.size()); - - auto const comparator_helper = [&](auto const d_key_equal) { - auto const set = cuco::static_set{ - num_keys, - 0.5, // desired load factor - cuco::empty_key{cudf::detail::CUDF_SIZE_TYPE_SENTINEL}, - d_key_equal, - probing_scheme_type{d_row_hash}, - cuco::thread_scope_device, - cuco::storage<1>{}, - cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream}, - stream.value()}; - - // Compute all single pass aggs first - compute_single_pass_aggs(keys, - requests, - &sparse_results, - set.ref(cuco::insert_and_find), - keys_have_nulls, - include_null_keys, - stream); - - // Extract the populated indices from the hash set and create a gather map. - // Gathering using this map from sparse results will give dense results. - auto gather_map = extract_populated_keys(set, keys.num_rows(), stream); - - // Compact all results from sparse_results and insert into cache - sparse_to_dense_results(keys, - requests, - &sparse_results, - cache, - gather_map, - set.ref(cuco::find), - keys_have_nulls, - include_null_keys, - stream, - mr); - - return cudf::detail::gather(keys, - gather_map, - out_of_bounds_policy::DONT_CHECK, - cudf::detail::negative_index_policy::NOT_ALLOWED, - stream, - mr); - }; - if (cudf::detail::has_nested_columns(keys)) { - auto const d_key_equal = comparator.equal_to(has_null, null_keys_are_equal); - return comparator_helper(d_key_equal); + auto const d_row_equal = comparator.equal_to(has_null, null_keys_are_equal); + return compute_groupby( + keys, requests, skip_rows_with_nulls, d_row_equal, d_row_hash, cache, stream, mr); } else { - auto const d_key_equal = comparator.equal_to(has_null, null_keys_are_equal); - return comparator_helper(d_key_equal); + auto const d_row_equal = comparator.equal_to(has_null, null_keys_are_equal); + return compute_groupby( + keys, requests, skip_rows_with_nulls, d_row_equal, d_row_hash, cache, stream, mr); } } - } // namespace /** @@ -661,11 +151,8 @@ std::pair, std::vector> groupby( cudf::detail::result_cache cache(requests.size()); std::unique_ptr
unique_keys = - groupby(keys, requests, &cache, cudf::has_nulls(keys), include_null_keys, stream, mr); + dispatch_groupby(keys, requests, &cache, cudf::has_nulls(keys), include_null_keys, stream, mr); return std::pair(std::move(unique_keys), extract_results(requests, cache, stream, mr)); } -} // namespace hash -} // namespace detail -} // namespace groupby -} // namespace cudf +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/groupby_kernels.cuh b/cpp/src/groupby/hash/groupby_kernels.cuh deleted file mode 100644 index 9abfe22950a..00000000000 --- a/cpp/src/groupby/hash/groupby_kernels.cuh +++ /dev/null @@ -1,111 +0,0 @@ -/* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include "multi_pass_kernels.cuh" - -#include -#include -#include -#include - -#include - -namespace cudf { -namespace groupby { -namespace detail { -namespace hash { -/** - * @brief Computes single-pass aggregations and store results into a sparse `output_values` table, - * and populate `set` with indices of unique keys - * - * The hash set is built by inserting every row index `i` from the `keys` and `values` tables. If - * the index was not present in the set, insert they index and then copy it to the output. If the - * key was already present in the set, then the inserted index is aggregated with the existing row. - * This aggregation is done for every element `j` in the row by applying aggregation operation `j` - * between the new and existing element. - * - * Instead of storing the entire rows from `input_keys` and `input_values` in - * the hashset, we instead store the row indices. For example, when inserting - * row at index `i` from `input_keys` into the hash set, the value `i` is what - * gets stored for the hash set's "key". It is assumed the `set` was constructed - * with a custom comparator that uses these row indices to check for equality - * between key rows. For example, comparing two keys `k0` and `k1` will compare - * the two rows `input_keys[k0] ?= input_keys[k1]` - * - * The exact size of the result is not known a priori, but can be upper bounded - * by the number of rows in `input_keys` & `input_values`. Therefore, it is - * assumed `output_values` has sufficient storage for an equivalent number of - * rows. In this way, after all rows are aggregated, `output_values` will likely - * be "sparse", meaning that not all rows contain the result of an aggregation. - * - * @tparam SetType The type of the hash set device ref - */ -template -struct compute_single_pass_aggs_fn { - SetType set; - table_device_view input_values; - mutable_table_device_view output_values; - aggregation::Kind const* __restrict__ aggs; - bitmask_type const* __restrict__ row_bitmask; - bool skip_rows_with_nulls; - - /** - * @brief Construct a new compute_single_pass_aggs_fn functor object - * - * @param set_ref Hash set object to insert key,value pairs into. 
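(To make the row-indices-as-keys idea above concrete: key "equality" is delegated to a device row comparator over the keys table, so comparing two stored indices compares the underlying key rows. A sketch of the comparator shape, borrowing the `row_comparator_t` alias this patch introduces in helpers.cuh; the wrapper struct is illustrative, not part of the diff:

  // The set stores bare row indices; two indices are "equal" exactly
  // when the corresponding key rows are equal: input_keys[lhs] == input_keys[rhs].
  struct index_key_equal {
    row_comparator_t d_equal;  // device row comparator built over the keys table
    __device__ bool operator()(cudf::size_type lhs, cudf::size_type rhs) const noexcept
    {
      return d_equal(lhs, rhs);
    }
  };

)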
- * @param input_values The table whose rows will be aggregated in the values - * of the hash set - * @param output_values Table that stores the results of aggregating rows of - * `input_values`. - * @param aggs The set of aggregation operations to perform across the - * columns of the `input_values` rows - * @param row_bitmask Bitmask where bit `i` indicates the presence of a null - * value in row `i` of input keys. Only used if `skip_rows_with_nulls` is `true` - * @param skip_rows_with_nulls Indicates if rows in `input_keys` containing - * null values should be skipped. It `true`, it is assumed `row_bitmask` is a - * bitmask where bit `i` indicates the presence of a null value in row `i`. - */ - compute_single_pass_aggs_fn(SetType set, - table_device_view input_values, - mutable_table_device_view output_values, - aggregation::Kind const* aggs, - bitmask_type const* row_bitmask, - bool skip_rows_with_nulls) - : set(set), - input_values(input_values), - output_values(output_values), - aggs(aggs), - row_bitmask(row_bitmask), - skip_rows_with_nulls(skip_rows_with_nulls) - { - } - - __device__ void operator()(size_type i) - { - if (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, i)) { - auto const result = set.insert_and_find(i); - - cudf::detail::aggregate_row(output_values, *result.first, input_values, i, aggs); - } - } -}; - -} // namespace hash -} // namespace detail -} // namespace groupby -} // namespace cudf diff --git a/cpp/src/groupby/hash/hash_compound_agg_finalizer.cu b/cpp/src/groupby/hash/hash_compound_agg_finalizer.cu new file mode 100644 index 00000000000..37a61c1a22c --- /dev/null +++ b/cpp/src/groupby/hash/hash_compound_agg_finalizer.cu @@ -0,0 +1,199 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "hash_compound_agg_finalizer.hpp" +#include "helpers.cuh" +#include "var_hash_functor.cuh" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +namespace cudf::groupby::detail::hash { +template +hash_compound_agg_finalizer::hash_compound_agg_finalizer( + column_view col, + cudf::detail::result_cache* sparse_results, + cudf::detail::result_cache* dense_results, + device_span gather_map, + SetType set, + bitmask_type const* row_bitmask, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) + : col(col), + sparse_results(sparse_results), + dense_results(dense_results), + gather_map(gather_map), + set(set), + row_bitmask(row_bitmask), + stream(stream), + mr(mr) +{ + result_type = + cudf::is_dictionary(col.type()) ? 
cudf::dictionary_column_view(col).keys().type() : col.type(); +} + +template +auto hash_compound_agg_finalizer::to_dense_agg_result(cudf::aggregation const& agg) +{ + auto s = sparse_results->get_result(col, agg); + auto dense_result_table = cudf::detail::gather(table_view({std::move(s)}), + gather_map, + out_of_bounds_policy::DONT_CHECK, + cudf::detail::negative_index_policy::NOT_ALLOWED, + stream, + mr); + return std::move(dense_result_table->release()[0]); +} + +template +auto hash_compound_agg_finalizer::gather_argminmax(aggregation const& agg) +{ + auto arg_result = to_dense_agg_result(agg); + // We make a view of ARG(MIN/MAX) result without a null mask and gather + // using this map. The values in data buffer of ARG(MIN/MAX) result + // corresponding to null values was initialized to ARG(MIN/MAX)_SENTINEL + // which is an out of bounds index value (-1) and causes the gathered + // value to be null. + column_view null_removed_map( + data_type(type_to_id()), + arg_result->size(), + static_cast(arg_result->view().template data()), + nullptr, + 0); + auto gather_argminmax = + cudf::detail::gather(table_view({col}), + null_removed_map, + arg_result->nullable() ? cudf::out_of_bounds_policy::NULLIFY + : cudf::out_of_bounds_policy::DONT_CHECK, + cudf::detail::negative_index_policy::NOT_ALLOWED, + stream, + mr); + return std::move(gather_argminmax->release()[0]); +} + +template +void hash_compound_agg_finalizer::visit(cudf::aggregation const& agg) +{ + if (dense_results->has_result(col, agg)) return; + dense_results->add_result(col, agg, to_dense_agg_result(agg)); +} + +template +void hash_compound_agg_finalizer::visit(cudf::detail::min_aggregation const& agg) +{ + if (dense_results->has_result(col, agg)) return; + if (result_type.id() == type_id::STRING) { + auto transformed_agg = make_argmin_aggregation(); + dense_results->add_result(col, agg, gather_argminmax(*transformed_agg)); + } else { + dense_results->add_result(col, agg, to_dense_agg_result(agg)); + } +} + +template +void hash_compound_agg_finalizer::visit(cudf::detail::max_aggregation const& agg) +{ + if (dense_results->has_result(col, agg)) return; + + if (result_type.id() == type_id::STRING) { + auto transformed_agg = make_argmax_aggregation(); + dense_results->add_result(col, agg, gather_argminmax(*transformed_agg)); + } else { + dense_results->add_result(col, agg, to_dense_agg_result(agg)); + } +} + +template +void hash_compound_agg_finalizer::visit(cudf::detail::mean_aggregation const& agg) +{ + if (dense_results->has_result(col, agg)) return; + + auto sum_agg = make_sum_aggregation(); + auto count_agg = make_count_aggregation(); + this->visit(*sum_agg); + this->visit(*count_agg); + column_view sum_result = dense_results->get_result(col, *sum_agg); + column_view count_result = dense_results->get_result(col, *count_agg); + + auto result = + cudf::detail::binary_operation(sum_result, + count_result, + binary_operator::DIV, + cudf::detail::target_type(result_type, aggregation::MEAN), + stream, + mr); + dense_results->add_result(col, agg, std::move(result)); +} + +template +void hash_compound_agg_finalizer::visit(cudf::detail::var_aggregation const& agg) +{ + if (dense_results->has_result(col, agg)) return; + + auto sum_agg = make_sum_aggregation(); + auto count_agg = make_count_aggregation(); + this->visit(*sum_agg); + this->visit(*count_agg); + column_view sum_result = sparse_results->get_result(col, *sum_agg); + column_view count_result = sparse_results->get_result(col, *count_agg); + + auto values_view = 
column_device_view::create(col, stream); + auto sum_view = column_device_view::create(sum_result, stream); + auto count_view = column_device_view::create(count_result, stream); + + auto var_result = make_fixed_width_column( + cudf::detail::target_type(result_type, agg.kind), col.size(), mask_state::ALL_NULL, stream); + auto var_result_view = mutable_column_device_view::create(var_result->mutable_view(), stream); + mutable_table_view var_table_view{{var_result->mutable_view()}}; + cudf::detail::initialize_with_identity(var_table_view, {agg.kind}, stream); + + thrust::for_each_n( + rmm::exec_policy_nosync(stream), + thrust::make_counting_iterator(0), + col.size(), + var_hash_functor{ + set, row_bitmask, *var_result_view, *values_view, *sum_view, *count_view, agg._ddof}); + sparse_results->add_result(col, agg, std::move(var_result)); + dense_results->add_result(col, agg, to_dense_agg_result(agg)); +} + +template +void hash_compound_agg_finalizer::visit(cudf::detail::std_aggregation const& agg) +{ + if (dense_results->has_result(col, agg)) return; + auto var_agg = make_variance_aggregation(agg._ddof); + this->visit(*dynamic_cast(var_agg.get())); + column_view variance = dense_results->get_result(col, *var_agg); + + auto result = cudf::detail::unary_operation(variance, unary_operator::SQRT, stream, mr); + dense_results->add_result(col, agg, std::move(result)); +} + +template class hash_compound_agg_finalizer>; +template class hash_compound_agg_finalizer>; +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/hash_compound_agg_finalizer.hpp b/cpp/src/groupby/hash/hash_compound_agg_finalizer.hpp new file mode 100644 index 00000000000..8bee1a92c40 --- /dev/null +++ b/cpp/src/groupby/hash/hash_compound_agg_finalizer.hpp @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include +#include +#include +#include + +#include +#include + +namespace cudf::groupby::detail::hash { +template <typename SetType> +class hash_compound_agg_finalizer final : public cudf::detail::aggregation_finalizer { + column_view col; + data_type result_type; + cudf::detail::result_cache* sparse_results; + cudf::detail::result_cache* dense_results; + device_span<size_type const> gather_map; + SetType set; + bitmask_type const* __restrict__ row_bitmask; + rmm::cuda_stream_view stream; + rmm::device_async_resource_ref mr; + + public: + using cudf::detail::aggregation_finalizer::visit; + + hash_compound_agg_finalizer(column_view col, + cudf::detail::result_cache* sparse_results, + cudf::detail::result_cache* dense_results, + device_span<size_type const> gather_map, + SetType set, + bitmask_type const* row_bitmask, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); + + auto to_dense_agg_result(cudf::aggregation const& agg); + + // Enables conversion of ARGMIN/ARGMAX into MIN/MAX + auto gather_argminmax(cudf::aggregation const& agg); + + // Declare overloads for each kind of aggregation to dispatch + void visit(cudf::aggregation const& agg) override; + + void visit(cudf::detail::min_aggregation const& agg) override; + + void visit(cudf::detail::max_aggregation const& agg) override; + + void visit(cudf::detail::mean_aggregation const& agg) override; + + void visit(cudf::detail::var_aggregation const& agg) override; + + void visit(cudf::detail::std_aggregation const& agg) override; +}; +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/helpers.cuh b/cpp/src/groupby/hash/helpers.cuh new file mode 100644 index 00000000000..f950e03e0fb --- /dev/null +++ b/cpp/src/groupby/hash/helpers.cuh @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include + +#include + +namespace cudf::groupby::detail::hash { +/// Number of threads to handle each input element
CUDF_HOST_DEVICE auto constexpr GROUPBY_CG_SIZE = 1; + +/// Number of slots per thread +CUDF_HOST_DEVICE auto constexpr GROUPBY_WINDOW_SIZE = 1; + +/// Thread block size +CUDF_HOST_DEVICE auto constexpr GROUPBY_BLOCK_SIZE = 128; + +/// Threshold cardinality to switch between shared memory aggregations and global memory +/// aggregations +CUDF_HOST_DEVICE auto constexpr GROUPBY_CARDINALITY_THRESHOLD = 128; + +// We add an additional `block_size` because, once the number of elements in the local hash set +// exceeds the threshold, every thread in the thread block can still insert one more element.
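With the constants above this bound works out to 128 + 128 = 256 entries per block, and the extent defined below scales it by 1.43 (roughly 1/0.7) to hold the load factor near 0.7: static_cast<cudf::size_type>(256 * 1.43) = 366 slots, which cuco::make_window_extent then rounds up to a valid window extent. A compile-time sanity check of that arithmetic (a sketch, not part of the patch):

  static_assert(GROUPBY_SHM_MAX_ELEMENTS == GROUPBY_CARDINALITY_THRESHOLD + GROUPBY_BLOCK_SIZE);
  static_assert(static_cast<cudf::size_type>(GROUPBY_SHM_MAX_ELEMENTS * 1.43) == 366);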
+/// The maximum number of elements handled per block +CUDF_HOST_DEVICE auto constexpr GROUPBY_SHM_MAX_ELEMENTS = + GROUPBY_CARDINALITY_THRESHOLD + GROUPBY_BLOCK_SIZE; + +// GROUPBY_SHM_MAX_ELEMENTS with 0.7 occupancy +/// Shared memory hash set extent type +using shmem_extent_t = + cuco::extent(static_cast(GROUPBY_SHM_MAX_ELEMENTS) * 1.43)>; + +/// Number of windows needed by each shared memory hash set +CUDF_HOST_DEVICE auto constexpr window_extent = + cuco::make_window_extent(shmem_extent_t{}); + +using row_hash_t = + cudf::experimental::row::hash::device_row_hasher; + +/// Probing scheme type used by groupby hash table +using probing_scheme_t = cuco::linear_probing; + +using row_comparator_t = cudf::experimental::row::equality::device_row_comparator< + false, + cudf::nullate::DYNAMIC, + cudf::experimental::row::equality::nan_equal_physical_equality_comparator>; + +using nullable_row_comparator_t = cudf::experimental::row::equality::device_row_comparator< + true, + cudf::nullate::DYNAMIC, + cudf::experimental::row::equality::nan_equal_physical_equality_comparator>; + +using global_set_t = cuco::static_set, + cuda::thread_scope_device, + row_comparator_t, + probing_scheme_t, + cudf::detail::cuco_allocator, + cuco::storage>; + +using nullable_global_set_t = cuco::static_set, + cuda::thread_scope_device, + nullable_row_comparator_t, + probing_scheme_t, + cudf::detail::cuco_allocator, + cuco::storage>; + +template +using hash_set_ref_t = cuco::static_set_ref< + cudf::size_type, + cuda::thread_scope_device, + row_comparator_t, + probing_scheme_t, + cuco::aow_storage_ref>, + Op>; + +template +using nullable_hash_set_ref_t = cuco::static_set_ref< + cudf::size_type, + cuda::thread_scope_device, + nullable_row_comparator_t, + probing_scheme_t, + cuco::aow_storage_ref>, + Op>; +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/shared_memory_aggregator.cuh b/cpp/src/groupby/hash/shared_memory_aggregator.cuh new file mode 100644 index 00000000000..9cbeeb34b86 --- /dev/null +++ b/cpp/src/groupby/hash/shared_memory_aggregator.cuh @@ -0,0 +1,263 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include +#include +#include +#include +#include + +#include +#include + +namespace cudf::groupby::detail::hash { +template +struct update_target_element_shmem { + __device__ void operator()( + cuda::std::byte*, bool*, cudf::size_type, cudf::column_device_view, cudf::size_type) const + { + CUDF_UNREACHABLE("Invalid source type and aggregation combination."); + } +}; + +template +struct update_target_element_shmem< + Source, + cudf::aggregation::MIN, + cuda::std::enable_if_t() && cudf::has_atomic_support()>> { + __device__ void operator()(cuda::std::byte* target, + bool* target_mask, + cudf::size_type target_index, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + using DeviceTarget = cudf::detail::underlying_target_t; + using DeviceSource = cudf::detail::underlying_source_t; + + DeviceTarget* target_casted = reinterpret_cast(target); + cudf::detail::atomic_min(&target_casted[target_index], + static_cast(source.element(source_index))); + + if (!target_mask[target_index]) { target_mask[target_index] = true; } + } +}; + +template +struct update_target_element_shmem< + Source, + cudf::aggregation::MAX, + cuda::std::enable_if_t() && cudf::has_atomic_support()>> { + __device__ void operator()(cuda::std::byte* target, + bool* target_mask, + cudf::size_type target_index, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + using DeviceTarget = cudf::detail::underlying_target_t; + using DeviceSource = cudf::detail::underlying_source_t; + + DeviceTarget* target_casted = reinterpret_cast(target); + cudf::detail::atomic_max(&target_casted[target_index], + static_cast(source.element(source_index))); + + if (!target_mask[target_index]) { target_mask[target_index] = true; } + } +}; + +template +struct update_target_element_shmem< + Source, + cudf::aggregation::SUM, + cuda::std::enable_if_t() && cudf::has_atomic_support() && + !cudf::is_timestamp()>> { + __device__ void operator()(cuda::std::byte* target, + bool* target_mask, + cudf::size_type target_index, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + using DeviceTarget = cudf::detail::underlying_target_t; + using DeviceSource = cudf::detail::underlying_source_t; + + DeviceTarget* target_casted = reinterpret_cast(target); + cudf::detail::atomic_add(&target_casted[target_index], + static_cast(source.element(source_index))); + + if (!target_mask[target_index]) { target_mask[target_index] = true; } + } +}; + +template +struct update_target_element_shmem< + Source, + cudf::aggregation::SUM_OF_SQUARES, + cuda::std::enable_if_t()>> { + __device__ void operator()(cuda::std::byte* target, + bool* target_mask, + cudf::size_type target_index, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + using Target = cudf::detail::target_type_t; + Target* target_casted = reinterpret_cast(target); + auto value = static_cast(source.element(source_index)); + cudf::detail::atomic_add(&target_casted[target_index], value * value); + + if (!target_mask[target_index]) { target_mask[target_index] = true; } + } +}; + +template +struct update_target_element_shmem< + Source, + cudf::aggregation::PRODUCT, + cuda::std::enable_if_t()>> { + __device__ void operator()(cuda::std::byte* target, + bool* target_mask, + cudf::size_type target_index, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + using Target = cudf::detail::target_type_t; + Target* target_casted = 
reinterpret_cast(target); + cudf::detail::atomic_mul(&target_casted[target_index], + static_cast(source.element(source_index))); + + if (!target_mask[target_index]) { target_mask[target_index] = true; } + } +}; + +template +struct update_target_element_shmem< + Source, + cudf::aggregation::COUNT_VALID, + cuda::std::enable_if_t< + cudf::detail::is_valid_aggregation()>> { + __device__ void operator()(cuda::std::byte* target, + bool* target_mask, + cudf::size_type target_index, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + // The nullability was checked prior to this call in the `shmem_element_aggregator` functor + using Target = cudf::detail::target_type_t; + Target* target_casted = reinterpret_cast(target); + cudf::detail::atomic_add(&target_casted[target_index], Target{1}); + } +}; + +template +struct update_target_element_shmem< + Source, + cudf::aggregation::COUNT_ALL, + cuda::std::enable_if_t< + cudf::detail::is_valid_aggregation()>> { + __device__ void operator()(cuda::std::byte* target, + bool* target_mask, + cudf::size_type target_index, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + using Target = cudf::detail::target_type_t; + Target* target_casted = reinterpret_cast(target); + cudf::detail::atomic_add(&target_casted[target_index], Target{1}); + + // Assumes target is already set to be valid + } +}; + +template +struct update_target_element_shmem< + Source, + cudf::aggregation::ARGMAX, + cuda::std::enable_if_t() and + cudf::is_relationally_comparable()>> { + __device__ void operator()(cuda::std::byte* target, + bool* target_mask, + cudf::size_type target_index, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + using Target = cudf::detail::target_type_t; + Target* target_casted = reinterpret_cast(target); + auto old = cudf::detail::atomic_cas( + &target_casted[target_index], cudf::detail::ARGMAX_SENTINEL, source_index); + if (old != cudf::detail::ARGMAX_SENTINEL) { + while (source.element(source_index) > source.element(old)) { + old = cudf::detail::atomic_cas(&target_casted[target_index], old, source_index); + } + } + + if (!target_mask[target_index]) { target_mask[target_index] = true; } + } +}; + +template +struct update_target_element_shmem< + Source, + cudf::aggregation::ARGMIN, + cuda::std::enable_if_t() and + cudf::is_relationally_comparable()>> { + __device__ void operator()(cuda::std::byte* target, + bool* target_mask, + cudf::size_type target_index, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + using Target = cudf::detail::target_type_t; + Target* target_casted = reinterpret_cast(target); + auto old = cudf::detail::atomic_cas( + &target_casted[target_index], cudf::detail::ARGMIN_SENTINEL, source_index); + if (old != cudf::detail::ARGMIN_SENTINEL) { + while (source.element(source_index) < source.element(old)) { + old = cudf::detail::atomic_cas(&target_casted[target_index], old, source_index); + } + } + + if (!target_mask[target_index]) { target_mask[target_index] = true; } + } +}; + +/** + * @brief A functor that updates a single element in the target column stored in shared memory by + * applying an aggregation operation to a corresponding element from a source column in global + * memory. + * + * This functor can NOT be used for dictionary columns. + * + * This is a redundant copy replicating the behavior of `elementwise_aggregator` from + * `cudf/detail/aggregation/device_aggregators.cuh`. 
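The ARGMAX/ARGMIN specializations above use a sentinel-plus-CAS pattern: the first thread to arrive publishes its row index, and later threads retry only while the value at their candidate index strictly beats the value at the currently stored index. A standalone CUDA sketch of that loop, with hypothetical names (the real code goes through cudf::detail::atomic_cas and typed sentinels):

```cuda
#include <cuda_runtime.h>

// Hypothetical sentinel; the real code uses cudf::detail::ARGMAX_SENTINEL.
constexpr int ARGMAX_SENTINEL = -1;

__device__ void atomic_argmax(int* slot, int candidate, float const* values)
{
  // Publish our candidate if the slot is still untouched.
  int old = atomicCAS(slot, ARGMAX_SENTINEL, candidate);
  if (old != ARGMAX_SENTINEL) {
    while (values[candidate] > values[old]) {
      // If the CAS fails, `old` is refreshed with the current winner and the
      // comparison is re-evaluated; if it succeeds, the next iteration fails
      // the comparison against our own value and the loop exits.
      old = atomicCAS(slot, old, candidate);
    }
  }
}
```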
The key difference is that this functor accepts + * a pointer to raw bytes as the source, as `column_device_view` cannot yet be constructed from + * shared memory. + */ +struct shmem_element_aggregator { + template + __device__ void operator()(cuda::std::byte* target, + bool* target_mask, + cudf::size_type target_index, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + // Check nullability for all aggregation kinds but `COUNT_ALL` + if constexpr (k != cudf::aggregation::COUNT_ALL) { + if (source.is_null(source_index)) { return; } + } + update_target_element_shmem{}( + target, target_mask, target_index, source, source_index); + } +}; +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/single_pass_functors.cuh b/cpp/src/groupby/hash/single_pass_functors.cuh new file mode 100644 index 00000000000..048c9252773 --- /dev/null +++ b/cpp/src/groupby/hash/single_pass_functors.cuh @@ -0,0 +1,292 @@ +/* + * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "helpers.cuh" + +#include +#include +#include +#include + +#include + +namespace cudf::groupby::detail::hash { +// TODO: TO BE REMOVED issue tracked via #17171 +template +__device__ constexpr bool is_supported() +{ + return cudf::is_fixed_width() and + ((k == cudf::aggregation::SUM) or (k == cudf::aggregation::SUM_OF_SQUARES) or + (k == cudf::aggregation::MIN) or (k == cudf::aggregation::MAX) or + (k == cudf::aggregation::COUNT_VALID) or (k == cudf::aggregation::COUNT_ALL) or + (k == cudf::aggregation::ARGMIN) or (k == cudf::aggregation::ARGMAX) or + (k == cudf::aggregation::STD) or (k == cudf::aggregation::VARIANCE) or + (k == cudf::aggregation::PRODUCT) and cudf::detail::is_product_supported()); +} + +template +__device__ std::enable_if_t, void>, T> +identity_from_operator() +{ + using DeviceType = cudf::device_storage_type_t; + return cudf::detail::corresponding_operator_t::template identity(); +} + +template +__device__ std::enable_if_t, void>, T> +identity_from_operator() +{ + CUDF_UNREACHABLE("Unable to get identity/sentinel from device operator"); +} + +template +__device__ T get_identity() +{ + if ((k == cudf::aggregation::ARGMAX) or (k == cudf::aggregation::ARGMIN)) { + if constexpr (cudf::is_timestamp()) { + return k == cudf::aggregation::ARGMAX + ? T{typename T::duration(cudf::detail::ARGMAX_SENTINEL)} + : T{typename T::duration(cudf::detail::ARGMIN_SENTINEL)}; + } else { + using DeviceType = cudf::device_storage_type_t; + return k == cudf::aggregation::ARGMAX + ? 
static_cast(cudf::detail::ARGMAX_SENTINEL) + : static_cast(cudf::detail::ARGMIN_SENTINEL); + } + } + return identity_from_operator(); +} + +template +struct initialize_target_element { + __device__ void operator()(cuda::std::byte* target, + bool* target_mask, + cudf::size_type idx) const noexcept + { + CUDF_UNREACHABLE("Invalid source type and aggregation combination."); + } +}; + +template +struct initialize_target_element()>> { + __device__ void operator()(cuda::std::byte* target, + bool* target_mask, + cudf::size_type idx) const noexcept + { + using DeviceType = cudf::device_storage_type_t; + DeviceType* target_casted = reinterpret_cast(target); + + target_casted[idx] = get_identity(); + + target_mask[idx] = (k == cudf::aggregation::COUNT_ALL) or (k == cudf::aggregation::COUNT_VALID); + } +}; + +struct initialize_shmem { + template + __device__ void operator()(cuda::std::byte* target, + bool* target_mask, + cudf::size_type idx) const noexcept + { + initialize_target_element{}(target, target_mask, idx); + } +}; + +template +struct initialize_target_element_gmem { + __device__ void operator()(cudf::mutable_column_device_view, cudf::size_type) const noexcept + { + CUDF_UNREACHABLE("Invalid source type and aggregation combination."); + } +}; + +template +struct initialize_target_element_gmem< + Target, + k, + std::enable_if_t() && cudf::is_fixed_width() && + !cudf::is_fixed_point()>> { + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index) const noexcept + { + using DeviceType = cudf::device_storage_type_t; + target.element(target_index) = get_identity(); + } +}; + +template +struct initialize_target_element_gmem< + Target, + k, + std::enable_if_t() && cudf::is_fixed_point()>> { + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index) const noexcept + { + using DeviceType = cudf::device_storage_type_t; + target.element(target_index) = get_identity(); + } +}; + +struct initialize_gmem { + template + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index) const noexcept + { + initialize_target_element_gmem{}(target, target_index); + } +}; + +struct initialize_sparse_table { + cudf::size_type const* row_indices; + cudf::mutable_table_device_view sparse_table; + cudf::aggregation::Kind const* __restrict__ aggs; + initialize_sparse_table(cudf::size_type const* row_indices, + cudf::mutable_table_device_view sparse_table, + cudf::aggregation::Kind const* aggs) + : row_indices(row_indices), sparse_table(sparse_table), aggs(aggs) + { + } + __device__ void operator()(cudf::size_type i) + { + auto key_idx = row_indices[i]; + for (auto col_idx = 0; col_idx < sparse_table.num_columns(); col_idx++) { + cudf::detail::dispatch_type_and_aggregation(sparse_table.column(col_idx).type(), + aggs[col_idx], + initialize_gmem{}, + sparse_table.column(col_idx), + key_idx); + } + } +}; + +template +struct global_memory_fallback_fn { + SetType set; + cudf::table_device_view input_values; + cudf::mutable_table_device_view output_values; + cudf::aggregation::Kind const* __restrict__ aggs; + cudf::size_type* block_cardinality; + cudf::size_type stride; + bitmask_type const* __restrict__ row_bitmask; + bool skip_rows_with_nulls; + + global_memory_fallback_fn(SetType set, + cudf::table_device_view input_values, + cudf::mutable_table_device_view output_values, + cudf::aggregation::Kind const* aggs, + cudf::size_type* block_cardinality, + cudf::size_type stride, + bitmask_type const* 
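The get_identity() helper above seeds each target element with the identity of its device operator, falling back to the ARGMIN/ARGMAX sentinels for index-producing aggregations. A host-side sketch of what those identities look like for plain numeric types (hypothetical enum and dispatch; the real code goes through corresponding_operator_t):

```cpp
#include <limits>

// Each aggregation starts from the value that is a no-op for its operator.
enum class agg_kind { SUM, MIN, MAX, COUNT };

template <typename T>
constexpr T identity_for(agg_kind k)
{
  switch (k) {
    case agg_kind::SUM: return T{0};                              // x + 0 == x
    case agg_kind::MIN: return std::numeric_limits<T>::max();     // min(x, max) == x
    case agg_kind::MAX: return std::numeric_limits<T>::lowest();  // max(x, lowest) == x
    case agg_kind::COUNT: return T{0};                            // counts start at zero
  }
  return T{};
}

static_assert(identity_for<int>(agg_kind::MIN) == std::numeric_limits<int>::max());
```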
row_bitmask,
+                            bool skip_rows_with_nulls)
+    : set(set),
+      input_values(input_values),
+      output_values(output_values),
+      aggs(aggs),
+      block_cardinality(block_cardinality),
+      stride(stride),
+      row_bitmask(row_bitmask),
+      skip_rows_with_nulls(skip_rows_with_nulls)
+  {
+  }
+
+  __device__ void operator()(cudf::size_type i)
+  {
+    auto const block_id = (i % stride) / GROUPBY_BLOCK_SIZE;
+    if (block_cardinality[block_id] >= GROUPBY_CARDINALITY_THRESHOLD and
+        (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, i))) {
+      auto const result = set.insert_and_find(i);
+      cudf::detail::aggregate_row(output_values, *result.first, input_values, i, aggs);
+    }
+  }
+};
+
+/**
+ * @brief Computes single-pass aggregations, stores the results into a sparse `output_values`
+ * table, and populates `set` with the indices of unique keys
+ *
+ * The hash set is built by inserting every row index `i` from the `keys` and `values` tables. If
+ * the index was not present in the set, insert the index and then copy it to the output. If the
+ * key was already present in the set, then the inserted index is aggregated with the existing row.
+ * This aggregation is done for every element `j` in the row by applying aggregation operation `j`
+ * between the new and existing element.
+ *
+ * Instead of storing the entire rows from `input_keys` and `input_values` in
+ * the hash set, we instead store the row indices. For example, when inserting
+ * row at index `i` from `input_keys` into the hash set, the value `i` is what
+ * gets stored for the hash set's "key". It is assumed the `set` was constructed
+ * with a custom comparator that uses these row indices to check for equality
+ * between key rows. For example, comparing two keys `k0` and `k1` will compare
+ * the two rows `input_keys[k0] ?= input_keys[k1]`
+ *
+ * The exact size of the result is not known a priori, but can be upper bounded
+ * by the number of rows in `input_keys` & `input_values`. Therefore, it is
+ * assumed `output_values` has sufficient storage for an equivalent number of
+ * rows. In this way, after all rows are aggregated, `output_values` will likely
+ * be "sparse", meaning that not all rows contain the result of an aggregation.
+ *
+ * @tparam SetType The type of the hash set device ref
+ */
+template <typename SetType>
+struct compute_single_pass_aggs_fn {
+  SetType set;
+  table_device_view input_values;
+  mutable_table_device_view output_values;
+  aggregation::Kind const* __restrict__ aggs;
+  bitmask_type const* __restrict__ row_bitmask;
+  bool skip_rows_with_nulls;
+
+  /**
+   * @brief Construct a new compute_single_pass_aggs_fn functor object
+   *
+   * @param set Hash set object to insert key,value pairs into.
+   * @param input_values The table whose rows will be aggregated in the values
+   * of the hash set
+   * @param output_values Table that stores the results of aggregating rows of
+   * `input_values`.
+   * @param aggs The set of aggregation operations to perform across the
+   * columns of the `input_values` rows
+   * @param row_bitmask Bitmask where bit `i` indicates the presence of a null
+   * value in row `i` of input keys. Only used if `skip_rows_with_nulls` is `true`
+   * @param skip_rows_with_nulls Indicates if rows in `input_keys` containing
+   * null values should be skipped. If `true`, it is assumed `row_bitmask` is a
+   * bitmask where bit `i` indicates the presence of a null value in row `i`.
+ */ + compute_single_pass_aggs_fn(SetType set, + table_device_view input_values, + mutable_table_device_view output_values, + aggregation::Kind const* aggs, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls) + : set(set), + input_values(input_values), + output_values(output_values), + aggs(aggs), + row_bitmask(row_bitmask), + skip_rows_with_nulls(skip_rows_with_nulls) + { + } + + __device__ void operator()(size_type i) + { + if (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, i)) { + auto const result = set.insert_and_find(i); + + cudf::detail::aggregate_row(output_values, *result.first, input_values, i, aggs); + } + } +}; +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/sparse_to_dense_results.cu b/cpp/src/groupby/hash/sparse_to_dense_results.cu new file mode 100644 index 00000000000..e1c2cd22309 --- /dev/null +++ b/cpp/src/groupby/hash/sparse_to_dense_results.cu @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "hash_compound_agg_finalizer.hpp" +#include "helpers.cuh" + +#include +#include +#include +#include + +#include +#include + +namespace cudf::groupby::detail::hash { +template +void sparse_to_dense_results(host_span requests, + cudf::detail::result_cache* sparse_results, + cudf::detail::result_cache* dense_results, + device_span gather_map, + SetRef set, + bitmask_type const* row_bitmask, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + for (auto const& request : requests) { + auto const& agg_v = request.aggregations; + auto const& col = request.values; + + // Given an aggregation, this will get the result from sparse_results and + // convert and return dense, compacted result + auto finalizer = hash_compound_agg_finalizer( + col, sparse_results, dense_results, gather_map, set, row_bitmask, stream, mr); + for (auto&& agg : agg_v) { + agg->finalize(finalizer); + } + } +} + +template void sparse_to_dense_results>( + host_span requests, + cudf::detail::result_cache* sparse_results, + cudf::detail::result_cache* dense_results, + device_span gather_map, + hash_set_ref_t set, + bitmask_type const* row_bitmask, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); + +template void sparse_to_dense_results>( + host_span requests, + cudf::detail::result_cache* sparse_results, + cudf::detail::result_cache* dense_results, + device_span gather_map, + nullable_hash_set_ref_t set, + bitmask_type const* row_bitmask, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/sparse_to_dense_results.hpp b/cpp/src/groupby/hash/sparse_to_dense_results.hpp new file mode 100644 index 00000000000..3a2b3090b99 --- /dev/null +++ b/cpp/src/groupby/hash/sparse_to_dense_results.hpp @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. 
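The row-index-as-key scheme documented in compute_single_pass_aggs_fn can be mimicked on the host with a standard container: hash and equality both dereference into the key column, so inserting an index is equivalent to inserting the row it points at. A sketch with hypothetical data (this is an illustration, not the device code path):

```cpp
#include <cstdio>
#include <unordered_set>
#include <vector>

int main()
{
  std::vector<int> keys{7, 3, 7, 9, 3};

  // Hash and compare row indices by looking through them into the key column,
  // mirroring the device row hasher/comparator the set is constructed with.
  auto hash = [&](int i) { return std::hash<int>{}(keys[i]); };
  auto eq   = [&](int a, int b) { return keys[a] == keys[b]; };
  std::unordered_set<int, decltype(hash), decltype(eq)> set(8, hash, eq);

  for (int i = 0; i < static_cast<int>(keys.size()); ++i) {
    auto [it, inserted] = set.insert(i);  // analogous to insert_and_find(i)
    std::printf("row %d -> group at row %d (%s)\n", i, *it, inserted ? "new" : "existing");
  }
}
```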
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include + +#include +#include + +namespace cudf::groupby::detail::hash { +/** + * @brief Gather sparse aggregation results into dense using `gather_map` and add to + * `dense_results` + * + * @tparam SetRef Device hash set ref type + * + * @param[in] requests The set of columns to aggregate and the aggregations to perform + * @param[in] sparse_results Sparse aggregation results + * @param[out] dense_results Dense aggregation results + * @param[in] gather_map Gather map indicating valid elements in `sparse_results` + * @param[in] set Device hash set ref + * @param[in] row_bitmask Bitmask indicating the validity of input keys + * @param[in] stream CUDA stream used for device memory operations and kernel launches + * @param[in] mr Device memory resource used to allocate the returned table + */ +template +void sparse_to_dense_results(host_span requests, + cudf::detail::result_cache* sparse_results, + cudf::detail::result_cache* dense_results, + device_span gather_map, + SetRef set, + bitmask_type const* row_bitmask, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/multi_pass_kernels.cuh b/cpp/src/groupby/hash/var_hash_functor.cuh similarity index 69% rename from cpp/src/groupby/hash/multi_pass_kernels.cuh rename to cpp/src/groupby/hash/var_hash_functor.cuh index 7043eafdc10..bb55cc9188c 100644 --- a/cpp/src/groupby/hash/multi_pass_kernels.cuh +++ b/cpp/src/groupby/hash/var_hash_functor.cuh @@ -13,7 +13,6 @@ * See the License for the specific language governing permissions and * limitations under the License. 
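The gather step this header describes is conceptually a single thrust::gather: per-group results sit at the row index of each group's first occurrence, and gather_map compacts them into one dense row per group. A small sketch with hypothetical values:

```cpp
#include <thrust/device_vector.h>
#include <thrust/gather.h>

#include <vector>

int main()
{
  // Hypothetical sparse layout: group results live at rows 2 and 4.
  std::vector<int> h_sparse{0, 0, 42, 0, 7, 0};
  std::vector<int> h_map{2, 4};  // row index of each group's first occurrence

  thrust::device_vector<int> sparse(h_sparse.begin(), h_sparse.end());
  thrust::device_vector<int> gather_map(h_map.begin(), h_map.end());
  thrust::device_vector<int> dense(gather_map.size());

  // Compact one dense row per group: dense[i] = sparse[gather_map[i]].
  thrust::gather(gather_map.begin(), gather_map.end(), sparse.begin(), dense.begin());
  // dense == {42, 7}
}
```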
*/ - #pragma once #include @@ -21,17 +20,14 @@ #include #include #include -#include #include +#include #include +#include -#include - -namespace cudf { -namespace detail { - -template +namespace cudf::groupby::detail::hash { +template struct var_hash_functor { SetType set; bitmask_type const* __restrict__ row_bitmask; @@ -47,13 +43,13 @@ struct var_hash_functor { column_device_view sum, column_device_view count, size_type ddof) - : set(set), - row_bitmask(row_bitmask), - target(target), - source(source), - sum(sum), - count(count), - ddof(ddof) + : set{set}, + row_bitmask{row_bitmask}, + target{target}, + source{source}, + sum{sum}, + count{count}, + ddof{ddof} { } @@ -64,23 +60,21 @@ struct var_hash_functor { } template - __device__ std::enable_if_t()> operator()(column_device_view const& source, - size_type source_index, - size_type target_index) noexcept + __device__ cuda::std::enable_if_t()> operator()( + column_device_view const& source, size_type source_index, size_type target_index) noexcept { CUDF_UNREACHABLE("Invalid source type for std, var aggregation combination."); } template - __device__ std::enable_if_t()> operator()(column_device_view const& source, - size_type source_index, - size_type target_index) noexcept + __device__ cuda::std::enable_if_t()> operator()( + column_device_view const& source, size_type source_index, size_type target_index) noexcept { - using Target = target_type_t; - using SumType = target_type_t; - using CountType = target_type_t; + using Target = cudf::detail::target_type_t; + using SumType = cudf::detail::target_type_t; + using CountType = cudf::detail::target_type_t; - if (source_has_nulls and source.is_null(source_index)) return; + if (source.is_null(source_index)) return; CountType group_size = count.element(target_index); if (group_size == 0 or group_size - ddof <= 0) return; @@ -91,8 +85,9 @@ struct var_hash_functor { ref.fetch_add(result, cuda::std::memory_order_relaxed); // STD sqrt is applied in finalize() - if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } + if (target.is_null(target_index)) { target.set_valid(target_index); } } + __device__ inline void operator()(size_type source_index) { if (row_bitmask == nullptr or cudf::bit_is_set(row_bitmask, source_index)) { @@ -110,6 +105,4 @@ struct var_hash_functor { } } }; - -} // namespace detail -} // namespace cudf +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/sort/aggregate.cpp b/cpp/src/groupby/sort/aggregate.cpp index a9085a1f1fd..3041e261945 100644 --- a/cpp/src/groupby/sort/aggregate.cpp +++ b/cpp/src/groupby/sort/aggregate.cpp @@ -26,7 +26,6 @@ #include #include #include -#include #include #include #include diff --git a/cpp/src/groupby/sort/group_quantiles.cu b/cpp/src/groupby/sort/group_quantiles.cu index 82d557b9f7e..d6c900fb689 100644 --- a/cpp/src/groupby/sort/group_quantiles.cu +++ b/cpp/src/groupby/sort/group_quantiles.cu @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -108,7 +109,7 @@ struct quantiles_functor { auto values_view = column_device_view::create(values, stream); auto group_size_view = column_device_view::create(group_sizes, stream); auto result_view = mutable_column_device_view::create(result->mutable_view(), stream); - auto null_count = rmm::device_scalar(0, stream, mr); + auto null_count = cudf::detail::device_scalar(0, stream, mr); // For each group, calculate quantile if (!cudf::is_dictionary(values.type())) { diff --git 
a/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh b/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh index 2358f47bbbb..f9adfc6060e 100644 --- a/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh +++ b/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include diff --git a/cpp/src/groupby/sort/group_std.cu b/cpp/src/groupby/sort/group_std.cu index 86ee20dbbe2..c3dfac46502 100644 --- a/cpp/src/groupby/sort/group_std.cu +++ b/cpp/src/groupby/sort/group_std.cu @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -27,7 +28,6 @@ #include #include -#include #include #include @@ -134,7 +134,7 @@ struct var_functor { // set nulls auto result_view = mutable_column_device_view::create(*result, stream); - auto null_count = rmm::device_scalar(0, stream, mr); + auto null_count = cudf::detail::device_scalar(0, stream, mr); auto d_null_count = null_count.data(); thrust::for_each_n( rmm::exec_policy(stream), diff --git a/cpp/src/hash/md5_hash.cu b/cpp/src/hash/md5_hash.cu index c7bfd4aecf4..a0c51940c87 100644 --- a/cpp/src/hash/md5_hash.cu +++ b/cpp/src/hash/md5_hash.cu @@ -302,7 +302,8 @@ std::unique_ptr md5(table_view const& input, } return md5_leaf_type_check(col.type()); }), - "Unsupported column type for hash function."); + "Unsupported column type for hash function.", + cudf::data_type_error); // Digest size in bytes auto constexpr digest_size = 32; diff --git a/cpp/src/hash/sha_hash.cuh b/cpp/src/hash/sha_hash.cuh index ebaec8e2775..eb002cf9c6f 100644 --- a/cpp/src/hash/sha_hash.cuh +++ b/cpp/src/hash/sha_hash.cuh @@ -513,7 +513,8 @@ std::unique_ptr sha_hash(table_view const& input, CUDF_EXPECTS( std::all_of( input.begin(), input.end(), [](auto const& col) { return sha_leaf_type_check(col.type()); }), - "Unsupported column type for hash function."); + "Unsupported column type for hash function.", + cudf::data_type_error); // Result column allocation and creation auto begin = thrust::make_constant_iterator(Hasher::digest_size); diff --git a/cpp/src/interop/arrow_utilities.cpp b/cpp/src/interop/arrow_utilities.cpp index a99262fb3bf..c69ebe12d2c 100644 --- a/cpp/src/interop/arrow_utilities.cpp +++ b/cpp/src/interop/arrow_utilities.cpp @@ -20,11 +20,6 @@ #include #include -#include - -#include -#include - #include namespace cudf { diff --git a/cpp/src/interop/arrow_utilities.hpp b/cpp/src/interop/arrow_utilities.hpp index 1b79fbf9eda..e4bdedf6603 100644 --- a/cpp/src/interop/arrow_utilities.hpp +++ b/cpp/src/interop/arrow_utilities.hpp @@ -17,7 +17,6 @@ #pragma once #include -#include #include #include diff --git a/cpp/src/interop/dlpack.cpp b/cpp/src/interop/dlpack.cpp index ba5b11b90d8..4395b741e53 100644 --- a/cpp/src/interop/dlpack.cpp +++ b/cpp/src/interop/dlpack.cpp @@ -16,11 +16,8 @@ #include #include #include -#include -#include #include #include -#include #include #include #include @@ -118,8 +115,8 @@ DLDataType data_type_to_DLDataType(data_type type) // Context object to own memory allocated for DLManagedTensor struct dltensor_context { - int64_t shape[2]; - int64_t strides[2]; + int64_t shape[2]; // NOLINT + int64_t strides[2]; // NOLINT rmm::device_buffer buffer; static void deleter(DLManagedTensor* arg) diff --git a/cpp/src/interop/to_arrow_device.cu b/cpp/src/interop/to_arrow_device.cu index a2874b46b06..fc1b0226a48 100644 --- a/cpp/src/interop/to_arrow_device.cu +++ b/cpp/src/interop/to_arrow_device.cu @@ -19,6 +19,7 @@ #include #include +#include 
#include #include #include @@ -35,7 +36,6 @@ #include #include -#include #include #include @@ -60,7 +60,7 @@ template struct is_device_scalar : public std::false_type {}; template -struct is_device_scalar> : public std::true_type {}; +struct is_device_scalar> : public std::true_type {}; template struct is_device_uvector : public std::false_type {}; @@ -232,10 +232,10 @@ int dispatch_to_arrow_device::operator()(cudf::column&& colum // in the offsets buffer. While some arrow implementations may accept a zero-sized // offsets buffer, best practices would be to allocate the buffer with the single value. if (nanoarrow_type == NANOARROW_TYPE_STRING) { - auto zero = std::make_unique>(0, stream, mr); + auto zero = std::make_unique>(0, stream, mr); NANOARROW_RETURN_NOT_OK(set_buffer(std::move(zero), fixed_width_data_buffer_idx, tmp.get())); } else { - auto zero = std::make_unique>(0, stream, mr); + auto zero = std::make_unique>(0, stream, mr); NANOARROW_RETURN_NOT_OK(set_buffer(std::move(zero), fixed_width_data_buffer_idx, tmp.get())); } @@ -466,10 +466,10 @@ int dispatch_to_arrow_device_view::operator()(ArrowArray* out if (column.size() == 0) { // https://github.com/rapidsai/cudf/pull/15047#discussion_r1546528552 if (nanoarrow_type == NANOARROW_TYPE_LARGE_STRING) { - auto zero = std::make_unique>(0, stream, mr); + auto zero = std::make_unique>(0, stream, mr); NANOARROW_RETURN_NOT_OK(set_buffer(std::move(zero), fixed_width_data_buffer_idx, tmp.get())); } else { - auto zero = std::make_unique>(0, stream, mr); + auto zero = std::make_unique>(0, stream, mr); NANOARROW_RETURN_NOT_OK(set_buffer(std::move(zero), fixed_width_data_buffer_idx, tmp.get())); } diff --git a/cpp/src/interop/to_arrow_host.cu b/cpp/src/interop/to_arrow_host.cu index 79fb7550044..8ec0904f1ba 100644 --- a/cpp/src/interop/to_arrow_host.cu +++ b/cpp/src/interop/to_arrow_host.cu @@ -44,6 +44,7 @@ #include #include #include +#include #include @@ -52,6 +53,30 @@ namespace detail { namespace { +/* + Enable Transparent Huge Pages (THP) for large (>4MB) allocations. + `buf` is returned untouched. + Enabling THP can improve performance of device-host memory transfers + significantly, see . +*/ +void enable_hugepage(ArrowBuffer* buffer) +{ + if (buffer->size_bytes < (1u << 22u)) { // Smaller than 4 MB + return; + } + +#ifdef MADV_HUGEPAGE + auto const pagesize = sysconf(_SC_PAGESIZE); + void* addr = const_cast(buffer->data); + auto length{static_cast(buffer->size_bytes)}; + if (std::align(pagesize, pagesize, addr, length)) { + // Intentionally not checking for errors that may be returned by older kernel versions; + // optimistically tries enabling huge pages. + madvise(addr, length, MADV_HUGEPAGE); + } +#endif +} + struct dispatch_to_arrow_host { cudf::column_view column; rmm::cuda_stream_view stream; @@ -62,6 +87,7 @@ struct dispatch_to_arrow_host { if (!column.has_nulls()) { return NANOARROW_OK; } NANOARROW_RETURN_NOT_OK(ArrowBitmapResize(bitmap, static_cast(column.size()), 0)); + enable_hugepage(&bitmap->buffer); CUDF_CUDA_TRY(cudaMemcpyAsync(bitmap->buffer.data, (column.offset() > 0) ? 
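One subtlety in enable_hugepage above: MADV_HUGEPAGE must be applied to a page-aligned range, which is what the std::align call arranges before madvise. A standalone, Linux-only sketch of the same alignment step (hypothetical buffer in place of the ArrowBuffer):

```cpp
#include <cstdio>
#include <memory>

#include <sys/mman.h>
#include <unistd.h>

int main()
{
  auto const pagesize = static_cast<std::size_t>(sysconf(_SC_PAGESIZE));

  // Hypothetical stand-in for an ArrowBuffer allocation.
  static char storage[1 << 23];  // 8 MiB, above the 4 MB threshold
  void* addr         = storage;
  std::size_t length = sizeof(storage);

  // std::align bumps addr up to the next page boundary and shrinks length to
  // match; it returns nullptr when the region is too small to align.
  if (std::align(pagesize, pagesize, addr, length)) {
#ifdef MADV_HUGEPAGE
    madvise(addr, length, MADV_HUGEPAGE);  // advisory only; errors ignored
#endif
    std::printf("advised %zu bytes starting at %p\n", length, addr);
  }
}
```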
cudf::detail::copy_bitmask(column, stream, mr).data() @@ -76,6 +102,7 @@ struct dispatch_to_arrow_host { int populate_data_buffer(device_span input, ArrowBuffer* buffer) const { NANOARROW_RETURN_NOT_OK(ArrowBufferResize(buffer, input.size_bytes(), 1)); + enable_hugepage(buffer); CUDF_CUDA_TRY(cudaMemcpyAsync( buffer->data, input.data(), input.size_bytes(), cudaMemcpyDefault, stream.value())); return NANOARROW_OK; diff --git a/cpp/src/io/avro/avro.cpp b/cpp/src/io/avro/avro.cpp index 2041f03cd81..b3fcca62314 100644 --- a/cpp/src/io/avro/avro.cpp +++ b/cpp/src/io/avro/avro.cpp @@ -16,7 +16,7 @@ #include "avro.hpp" -#include +#include #include namespace cudf { @@ -199,7 +199,7 @@ bool container::parse(file_metadata* md, size_t max_num_rows, size_t first_row) // Read the next sync markers and ensure they match the first ones we // encountered. If they don't, we have to assume the data is corrupted, // and thus, we terminate processing immediately. - uint64_t const sync_marker[] = {get_raw(), get_raw()}; + std::array const sync_marker = {get_raw(), get_raw()}; bool valid_sync_markers = ((sync_marker[0] == md->sync_marker[0]) && (sync_marker[1] == md->sync_marker[1])); if (!valid_sync_markers) { return false; } @@ -302,7 +302,7 @@ bool schema_parser::parse(std::vector& schema, std::string const& // Empty schema if (json_str == "[]") return true; - char depthbuf[MAX_SCHEMA_DEPTH]; + std::array depthbuf; int depth = 0, parent_idx = -1, entry_idx = -1; json_state_e state = state_attrname; std::string str; diff --git a/cpp/src/io/avro/avro.hpp b/cpp/src/io/avro/avro.hpp index f2813a1ba51..fd2c781b8a1 100644 --- a/cpp/src/io/avro/avro.hpp +++ b/cpp/src/io/avro/avro.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,10 +18,9 @@ #include "avro_common.hpp" -#include +#include #include #include -#include #include #include #include @@ -100,15 +99,15 @@ struct column_desc { */ struct file_metadata { std::map user_data; - std::string codec = ""; - uint64_t sync_marker[2] = {0, 0}; - size_t metadata_size = 0; - size_t total_data_size = 0; - size_t selected_data_size = 0; - size_type num_rows = 0; - size_type skip_rows = 0; - size_type total_num_rows = 0; - uint32_t max_block_size = 0; + std::string codec = ""; + std::array sync_marker = {0, 0}; + size_t metadata_size = 0; + size_t total_data_size = 0; + size_t selected_data_size = 0; + size_type num_rows = 0; + size_type skip_rows = 0; + size_type total_num_rows = 0; + uint32_t max_block_size = 0; std::vector schema; std::vector block_list; std::vector columns; diff --git a/cpp/src/io/comp/comp.cpp b/cpp/src/io/comp/comp.cpp new file mode 100644 index 00000000000..b26a6292806 --- /dev/null +++ b/cpp/src/io/comp/comp.cpp @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2018-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
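In compress_gzip above, OR-ing 16 into windowBits (15 | 16 = 31) is the zlib idiom for "wrap the DEFLATE stream in a GZIP header and trailer". For reference, the matching host-side decompressor passes 15 + 16 on the inflate side (15 + 32 would auto-detect zlib vs. gzip); this sketch simplifies sizing and error handling:

```cpp
#include <cassert>
#include <cstddef>
#include <vector>

#include <zlib.h>

std::vector<unsigned char> decompress_gzip_host(unsigned char const* src, std::size_t len)
{
  z_stream zs{};
  zs.next_in  = const_cast<unsigned char*>(src);
  zs.avail_in = static_cast<uInt>(len);

  // 15 = max window size; +16 = expect a GZIP wrapper.
  int ret = inflateInit2(&zs, 15 + 16);
  assert(ret == Z_OK);

  std::vector<unsigned char> dst(4 * len + 4096);  // crude ~4:1 initial guess
  zs.next_out  = dst.data();
  zs.avail_out = static_cast<uInt>(dst.size());

  ret = inflate(&zs, Z_FINISH);
  assert(ret == Z_STREAM_END);  // this sketch assumes dst was large enough
  dst.resize(zs.total_out);
  inflateEnd(&zs);
  return dst;
}
```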
+ */ + +#include "comp.hpp" + +#include "io/utilities/hostdevice_vector.hpp" +#include "nvcomp_adapter.hpp" + +#include +#include +#include +#include +#include +#include + +#include // GZIP compression + +namespace cudf::io::detail { + +namespace { + +/** + * @brief GZIP host compressor (includes header) + */ +std::vector compress_gzip(host_span src) +{ + z_stream zs; + zs.zalloc = Z_NULL; + zs.zfree = Z_NULL; + zs.opaque = Z_NULL; + zs.avail_in = src.size(); + zs.next_in = reinterpret_cast(const_cast(src.data())); + + std::vector dst; + zs.avail_out = 0; + zs.next_out = nullptr; + + int windowbits = 15; + int gzip_encoding = 16; + int ret = deflateInit2( + &zs, Z_DEFAULT_COMPRESSION, Z_DEFLATED, windowbits | gzip_encoding, 8, Z_DEFAULT_STRATEGY); + CUDF_EXPECTS(ret == Z_OK, "GZIP DEFLATE compression initialization failed."); + + uint32_t estcomplen = deflateBound(&zs, src.size()); + dst.resize(estcomplen); + zs.avail_out = estcomplen; + zs.next_out = dst.data(); + + ret = deflate(&zs, Z_FINISH); + CUDF_EXPECTS(ret == Z_STREAM_END, "GZIP DEFLATE compression failed due to insufficient space!"); + dst.resize(std::distance(dst.data(), zs.next_out)); + + ret = deflateEnd(&zs); + CUDF_EXPECTS(ret == Z_OK, "GZIP DEFLATE compression failed at deallocation"); + + return dst; +} + +/** + * @brief SNAPPY device compressor + */ +std::vector compress_snappy(host_span src, + rmm::cuda_stream_view stream) +{ + auto const d_src = + cudf::detail::make_device_uvector_async(src, stream, cudf::get_current_device_resource_ref()); + cudf::detail::hostdevice_vector> inputs(1, stream); + inputs[0] = d_src; + inputs.host_to_device_async(stream); + + auto dst_size = compress_max_output_chunk_size(nvcomp::compression_type::SNAPPY, src.size()); + rmm::device_uvector d_dst(dst_size, stream); + cudf::detail::hostdevice_vector> outputs(1, stream); + outputs[0] = d_dst; + outputs.host_to_device_async(stream); + + cudf::detail::hostdevice_vector hd_status(1, stream); + hd_status[0] = {}; + hd_status.host_to_device_async(stream); + + nvcomp::batched_compress(nvcomp::compression_type::SNAPPY, inputs, outputs, hd_status, stream); + + hd_status.device_to_host_sync(stream); + CUDF_EXPECTS(hd_status[0].status == cudf::io::compression_status::SUCCESS, + "snappy compression failed"); + return cudf::detail::make_std_vector_sync(d_dst, stream); +} + +} // namespace + +std::vector compress(compression_type compression, + host_span src, + rmm::cuda_stream_view stream) +{ + CUDF_FUNC_RANGE(); + switch (compression) { + case compression_type::GZIP: return compress_gzip(src); + case compression_type::SNAPPY: return compress_snappy(src, stream); + default: CUDF_FAIL("Unsupported compression type"); + } +} + +} // namespace cudf::io::detail diff --git a/cpp/src/io/comp/comp.hpp b/cpp/src/io/comp/comp.hpp new file mode 100644 index 00000000000..652abbbeda6 --- /dev/null +++ b/cpp/src/io/comp/comp.hpp @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include + +#include +#include +#include + +namespace CUDF_EXPORT cudf { +namespace io::detail { + +/** + * @brief Compresses a system memory buffer. + * + * @param compression Type of compression of the input data + * @param src Decompressed host buffer + * @param stream CUDA stream used for device memory operations and kernel launches + * + * @return Vector containing the Compressed output + */ +std::vector compress(compression_type compression, + host_span src, + rmm::cuda_stream_view stream); + +} // namespace io::detail +} // namespace CUDF_EXPORT cudf diff --git a/cpp/src/io/comp/gpuinflate.cu b/cpp/src/io/comp/gpuinflate.cu index fff1cf0c96a..090ea1430b5 100644 --- a/cpp/src/io/comp/gpuinflate.cu +++ b/cpp/src/io/comp/gpuinflate.cu @@ -980,27 +980,27 @@ __device__ int parse_gzip_header(uint8_t const* src, size_t src_size) { uint8_t flags = src[3]; hdr_len = 10; - if (flags & GZIPHeaderFlag::fextra) // Extra fields present + if (flags & detail::GZIPHeaderFlag::fextra) // Extra fields present { int xlen = src[hdr_len] | (src[hdr_len + 1] << 8); hdr_len += xlen; if (hdr_len >= src_size) return -1; } - if (flags & GZIPHeaderFlag::fname) // Original file name present + if (flags & detail::GZIPHeaderFlag::fname) // Original file name present { // Skip zero-terminated string do { if (hdr_len >= src_size) return -1; } while (src[hdr_len++] != 0); } - if (flags & GZIPHeaderFlag::fcomment) // Comment present + if (flags & detail::GZIPHeaderFlag::fcomment) // Comment present { // Skip zero-terminated string do { if (hdr_len >= src_size) return -1; } while (src[hdr_len++] != 0); } - if (flags & GZIPHeaderFlag::fhcrc) // Header CRC present + if (flags & detail::GZIPHeaderFlag::fhcrc) // Header CRC present { hdr_len += 2; } diff --git a/cpp/src/io/comp/io_uncomp.hpp b/cpp/src/io/comp/io_uncomp.hpp index 1c9578fa5c0..ca722a9b7ee 100644 --- a/cpp/src/io/comp/io_uncomp.hpp +++ b/cpp/src/io/comp/io_uncomp.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * Copyright (c) 2018-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -25,8 +25,8 @@ using cudf::host_span; -namespace cudf { -namespace io { +namespace CUDF_EXPORT cudf { +namespace io::detail { /** * @brief Decompresses a system memory buffer. @@ -36,13 +36,35 @@ namespace io { * * @return Vector containing the Decompressed output */ -std::vector decompress(compression_type compression, host_span src); +[[nodiscard]] std::vector decompress(compression_type compression, + host_span src); +/** + * @brief Decompresses a system memory buffer. + * + * @param compression Type of compression of the input data + * @param src Compressed host buffer + * @param dst Destination host span to place decompressed buffer + * @param stream CUDA stream used for device memory operations and kernel launches + * + * @return Size of decompressed output + */ size_t decompress(compression_type compression, host_span src, host_span dst, rmm::cuda_stream_view stream); +/** + * @brief Without actually decompressing the compressed input buffer passed, return the size of + * decompressed output. If the decompressed size cannot be extracted apriori, return zero. 
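Taken together, the declarations above allow an internal compress/decompress round trip. A hedged usage sketch (these are detail headers with repo-relative include paths, not public cudf API, so treat this as illustrative):

```cpp
#include "io/comp/comp.hpp"
#include "io/comp/io_uncomp.hpp"

#include <cudf/utilities/span.hpp>

#include <rmm/cuda_stream_view.hpp>

#include <cstdint>
#include <vector>

std::vector<std::uint8_t> round_trip(cudf::host_span<std::uint8_t const> src,
                                     rmm::cuda_stream_view stream)
{
  auto const compressed =
    cudf::io::detail::compress(cudf::io::compression_type::GZIP, src, stream);
  auto const comp_span =
    cudf::host_span<std::uint8_t const>{compressed.data(), compressed.size()};

  // GZIP records the original size (ISIZE), so this should report src.size().
  [[maybe_unused]] auto const expected =
    cudf::io::detail::get_uncompressed_size(cudf::io::compression_type::GZIP, comp_span);

  return cudf::io::detail::decompress(cudf::io::compression_type::GZIP, comp_span);
}
```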
+ * + * @param compression Type of compression of the input data + * @param src Compressed host buffer + * + * @return Size of decompressed output + */ +size_t get_uncompressed_size(compression_type compression, host_span src); + /** * @brief GZIP header flags * See https://tools.ietf.org/html/rfc1952 @@ -55,5 +77,5 @@ constexpr uint8_t fname = 0x08; // Original file name present constexpr uint8_t fcomment = 0x10; // Comment present }; // namespace GZIPHeaderFlag -} // namespace io -} // namespace cudf +} // namespace io::detail +} // namespace CUDF_EXPORT cudf diff --git a/cpp/src/io/comp/nvcomp_adapter.hpp b/cpp/src/io/comp/nvcomp_adapter.hpp index 583bd6a3523..2e1cda2d6b7 100644 --- a/cpp/src/io/comp/nvcomp_adapter.hpp +++ b/cpp/src/io/comp/nvcomp_adapter.hpp @@ -18,9 +18,7 @@ #include "gpuinflate.hpp" -#include #include -#include #include #include diff --git a/cpp/src/io/comp/uncomp.cpp b/cpp/src/io/comp/uncomp.cpp index 602ff1734b6..b3d43fa786a 100644 --- a/cpp/src/io/comp/uncomp.cpp +++ b/cpp/src/io/comp/uncomp.cpp @@ -24,25 +24,22 @@ #include #include -#include - #include // uncompress #include // memset -using cudf::host_span; - -namespace cudf { -namespace io { +namespace cudf::io::detail { #pragma pack(push, 1) +namespace { + struct gz_file_header_s { uint8_t id1; // 0x1f uint8_t id2; // 0x8b uint8_t comp_mthd; // compression method (0-7=reserved, 8=deflate) uint8_t flags; // flags (GZIPHeaderFlag) - uint8_t mtime[4]; // If non-zero: modification time (Unix format) + uint8_t mtime[4]; // If non-zero: modification time (Unix format) // NOLINT uint8_t xflags; // Extra compressor-specific flags uint8_t os; // OS id }; @@ -103,7 +100,7 @@ struct zip_lfh_s { }; struct bz2_file_header_s { - uint8_t sig[3]; // "BZh" + uint8_t sig[3]; // "BZh" // NOLINT uint8_t blksz; // block size 1..9 in 100kB units (post-RLE) }; @@ -263,7 +260,7 @@ void cpu_inflate_vector(std::vector& dst, uint8_t const* comp_data, siz strm.avail_out = dst.size(); strm.total_out = 0; auto zerr = inflateInit2(&strm, -15); // -15 for raw data without GZIP headers - CUDF_EXPECTS(zerr == 0, "Error in DEFLATE stream"); + CUDF_EXPECTS(zerr == 0, "Error in DEFLATE stream: inflateInit2 failed"); do { if (strm.avail_out == 0) { dst.resize(strm.total_out + (1 << 30)); @@ -275,125 +272,7 @@ void cpu_inflate_vector(std::vector& dst, uint8_t const* comp_data, siz strm.total_out == dst.size()); dst.resize(strm.total_out); inflateEnd(&strm); - CUDF_EXPECTS(zerr == Z_STREAM_END, "Error in DEFLATE stream"); -} - -std::vector decompress(compression_type compression, host_span src) -{ - CUDF_EXPECTS(src.data() != nullptr, "Decompression: Source cannot be nullptr"); - CUDF_EXPECTS(not src.empty(), "Decompression: Source size cannot be 0"); - - auto raw = src.data(); - uint8_t const* comp_data = nullptr; - size_t comp_len = 0; - size_t uncomp_len = 0; - - switch (compression) { - case compression_type::AUTO: - case compression_type::GZIP: { - gz_archive_s gz; - if (ParseGZArchive(&gz, raw, src.size())) { - compression = compression_type::GZIP; - comp_data = gz.comp_data; - comp_len = gz.comp_len; - uncomp_len = gz.isize; - } - if (compression != compression_type::AUTO) break; - [[fallthrough]]; - } - case compression_type::ZIP: { - zip_archive_s za; - if (OpenZipArchive(&za, raw, src.size())) { - size_t cdfh_ofs = 0; - for (int i = 0; i < za.eocd->num_entries; i++) { - auto const* cdfh = reinterpret_cast( - reinterpret_cast(za.cdfh) + cdfh_ofs); - int cdfh_len = sizeof(zip_cdfh_s) + cdfh->fname_len + cdfh->extra_len + cdfh->comment_len; - 
if (cdfh_ofs + cdfh_len > za.eocd->cdir_size || cdfh->sig != 0x0201'4b50) { - // Bad cdir - break; - } - // For now, only accept with non-zero file sizes and DEFLATE - if (cdfh->comp_method == 8 && cdfh->comp_size > 0 && cdfh->uncomp_size > 0) { - size_t lfh_ofs = cdfh->hdr_ofs; - auto const* lfh = reinterpret_cast(raw + lfh_ofs); - if (lfh_ofs + sizeof(zip_lfh_s) <= src.size() && lfh->sig == 0x0403'4b50 && - lfh_ofs + sizeof(zip_lfh_s) + lfh->fname_len + lfh->extra_len <= src.size()) { - if (lfh->comp_method == 8 && lfh->comp_size > 0 && lfh->uncomp_size > 0) { - size_t file_start = lfh_ofs + sizeof(zip_lfh_s) + lfh->fname_len + lfh->extra_len; - size_t file_end = file_start + lfh->comp_size; - if (file_end <= src.size()) { - // Pick the first valid file of non-zero size (only 1 file expected in archive) - compression = compression_type::ZIP; - comp_data = raw + file_start; - comp_len = lfh->comp_size; - uncomp_len = lfh->uncomp_size; - break; - } - } - } - } - cdfh_ofs += cdfh_len; - } - } - } - if (compression != compression_type::AUTO) break; - [[fallthrough]]; - case compression_type::BZIP2: - if (src.size() > 4) { - auto const* fhdr = reinterpret_cast(raw); - // Check for BZIP2 file signature "BZh1" to "BZh9" - if (fhdr->sig[0] == 'B' && fhdr->sig[1] == 'Z' && fhdr->sig[2] == 'h' && - fhdr->blksz >= '1' && fhdr->blksz <= '9') { - compression = compression_type::BZIP2; - comp_data = raw; - comp_len = src.size(); - uncomp_len = 0; - } - } - if (compression != compression_type::AUTO) break; - [[fallthrough]]; - default: CUDF_FAIL("Unsupported compressed stream type"); - } - - CUDF_EXPECTS(comp_data != nullptr and comp_len > 0, "Unsupported compressed stream type"); - - if (uncomp_len <= 0) { - uncomp_len = comp_len * 4 + 4096; // In case uncompressed size isn't known in advance, assume - // ~4:1 compression for initial size - } - - if (compression == compression_type::GZIP || compression == compression_type::ZIP) { - // INFLATE - std::vector dst(uncomp_len); - cpu_inflate_vector(dst, comp_data, comp_len); - return dst; - } - if (compression == compression_type::BZIP2) { - size_t src_ofs = 0; - size_t dst_ofs = 0; - int bz_err = 0; - std::vector dst(uncomp_len); - do { - size_t dst_len = uncomp_len - dst_ofs; - bz_err = cpu_bz2_uncompress(comp_data, comp_len, dst.data() + dst_ofs, &dst_len, &src_ofs); - if (bz_err == BZ_OUTBUFF_FULL) { - // TBD: We could infer the compression ratio based on produced/consumed byte counts - // in order to minimize realloc events and over-allocation - dst_ofs = dst_len; - dst_len = uncomp_len + (uncomp_len / 2); - dst.resize(dst_len); - uncomp_len = dst_len; - } else if (bz_err == 0) { - uncomp_len = dst_len; - dst.resize(uncomp_len); - } - } while (bz_err == BZ_OUTBUFF_FULL); - CUDF_EXPECTS(bz_err == 0, "Decompression: error in stream"); - return dst; - } - - CUDF_FAIL("Unsupported compressed stream type"); + CUDF_EXPECTS(zerr == Z_STREAM_END, "Error in DEFLATE stream: Z_STREAM_END not encountered"); } /** @@ -538,12 +417,130 @@ size_t decompress_zstd(host_span src, CUDF_EXPECTS(hd_stats[0].status == compression_status::SUCCESS, "ZSTD decompression failed"); // Copy temporary output to `dst` - CUDF_CUDA_TRY(cudaMemcpyAsync( - dst.data(), d_dst.data(), hd_stats[0].bytes_written, cudaMemcpyDefault, stream.value())); + cudf::detail::cuda_memcpy(dst.subspan(0, hd_stats[0].bytes_written), + device_span{d_dst.data(), hd_stats[0].bytes_written}, + stream); return hd_stats[0].bytes_written; } +struct source_properties { + compression_type compression = 
compression_type::NONE; + uint8_t const* comp_data = nullptr; + size_t comp_len = 0; + size_t uncomp_len = 0; +}; + +source_properties get_source_properties(compression_type compression, host_span src) +{ + auto raw = src.data(); + uint8_t const* comp_data = nullptr; + size_t comp_len = 0; + size_t uncomp_len = 0; + + switch (compression) { + case compression_type::AUTO: + case compression_type::GZIP: { + gz_archive_s gz; + auto const parse_succeeded = ParseGZArchive(&gz, src.data(), src.size()); + CUDF_EXPECTS(parse_succeeded, "Failed to parse GZIP header while fetching source properties"); + compression = compression_type::GZIP; + comp_data = gz.comp_data; + comp_len = gz.comp_len; + uncomp_len = gz.isize; + if (compression != compression_type::AUTO) break; + [[fallthrough]]; + } + case compression_type::ZIP: { + zip_archive_s za; + if (OpenZipArchive(&za, raw, src.size())) { + size_t cdfh_ofs = 0; + for (int i = 0; i < za.eocd->num_entries; i++) { + auto const* cdfh = reinterpret_cast( + reinterpret_cast(za.cdfh) + cdfh_ofs); + int cdfh_len = sizeof(zip_cdfh_s) + cdfh->fname_len + cdfh->extra_len + cdfh->comment_len; + if (cdfh_ofs + cdfh_len > za.eocd->cdir_size || cdfh->sig != 0x0201'4b50) { + // Bad cdir + break; + } + // For now, only accept with non-zero file sizes and DEFLATE + if (cdfh->comp_method == 8 && cdfh->comp_size > 0 && cdfh->uncomp_size > 0) { + size_t lfh_ofs = cdfh->hdr_ofs; + auto const* lfh = reinterpret_cast(raw + lfh_ofs); + if (lfh_ofs + sizeof(zip_lfh_s) <= src.size() && lfh->sig == 0x0403'4b50 && + lfh_ofs + sizeof(zip_lfh_s) + lfh->fname_len + lfh->extra_len <= src.size()) { + if (lfh->comp_method == 8 && lfh->comp_size > 0 && lfh->uncomp_size > 0) { + size_t file_start = lfh_ofs + sizeof(zip_lfh_s) + lfh->fname_len + lfh->extra_len; + size_t file_end = file_start + lfh->comp_size; + if (file_end <= src.size()) { + // Pick the first valid file of non-zero size (only 1 file expected in archive) + compression = compression_type::ZIP; + comp_data = raw + file_start; + comp_len = lfh->comp_size; + uncomp_len = lfh->uncomp_size; + break; + } + } + } + } + cdfh_ofs += cdfh_len; + } + } + if (compression != compression_type::AUTO) break; + [[fallthrough]]; + } + case compression_type::BZIP2: { + if (src.size() > 4) { + auto const* fhdr = reinterpret_cast(raw); + // Check for BZIP2 file signature "BZh1" to "BZh9" + if (fhdr->sig[0] == 'B' && fhdr->sig[1] == 'Z' && fhdr->sig[2] == 'h' && + fhdr->blksz >= '1' && fhdr->blksz <= '9') { + compression = compression_type::BZIP2; + comp_data = raw; + comp_len = src.size(); + uncomp_len = 0; + } + } + if (compression != compression_type::AUTO) break; + [[fallthrough]]; + } + case compression_type::SNAPPY: { + uncomp_len = 0; + auto cur = src.begin(); + auto const end = src.end(); + // Read uncompressed length (varint) + { + uint32_t l = 0, c; + do { + c = *cur++; + auto const lo7 = c & 0x7f; + if (l >= 28 && c > 0xf) { + uncomp_len = 0; + break; + } + uncomp_len |= lo7 << l; + l += 7; + } while (c > 0x7f && cur < end); + CUDF_EXPECTS(uncomp_len != 0 and cur < end, "Error in retrieving SNAPPY source properties"); + } + comp_data = raw; + comp_len = src.size(); + if (compression != compression_type::AUTO) break; + [[fallthrough]]; + } + default: CUDF_FAIL("Unsupported compressed stream type"); + } + + return source_properties{compression, comp_data, comp_len, uncomp_len}; +} + +} // namespace + +size_t get_uncompressed_size(compression_type compression, host_span src) +{ + return get_source_properties(compression, 
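The SNAPPY branch above reads the uncompressed length from the stream prefix, which Snappy encodes as a little-endian base-128 varint (7 payload bits per byte, high bit marking continuation). An equivalent standalone parse:

```cpp
#include <cstdint>
#include <cstdio>

// The Snappy stream starts with its uncompressed length as a varint.
uint32_t parse_snappy_uncompressed_len(uint8_t const* cur, uint8_t const* end)
{
  uint32_t len   = 0;
  uint32_t shift = 0;
  while (cur < end) {
    uint8_t const c = *cur++;
    len |= static_cast<uint32_t>(c & 0x7f) << shift;
    if ((c & 0x80) == 0) { return len; }  // high bit clear: last varint byte
    shift += 7;
    if (shift >= 32) { return 0; }  // overlong/corrupt encoding
  }
  return 0;  // ran off the end of the buffer
}

int main()
{
  uint8_t const buf[] = {0xac, 0x02};  // 300: low 7 bits 0x2c | 0x80, then 0x02
  std::printf("%u\n", parse_snappy_uncompressed_len(buf, buf + sizeof(buf)));
}
```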
src).uncomp_len; +} + size_t decompress(compression_type compression, host_span src, host_span dst, @@ -558,5 +555,63 @@ size_t decompress(compression_type compression, } } -} // namespace io -} // namespace cudf +std::vector decompress(compression_type compression, host_span src) +{ + CUDF_EXPECTS(src.data() != nullptr, "Decompression: Source cannot be nullptr"); + CUDF_EXPECTS(not src.empty(), "Decompression: Source size cannot be 0"); + + auto srcprops = get_source_properties(compression, src); + CUDF_EXPECTS(srcprops.comp_data != nullptr and srcprops.comp_len > 0, + "Unsupported compressed stream type"); + + if (srcprops.uncomp_len <= 0) { + srcprops.uncomp_len = + srcprops.comp_len * 4 + 4096; // In case uncompressed size isn't known in advance, assume + // ~4:1 compression for initial size + } + + if (compression == compression_type::GZIP) { + // INFLATE + std::vector dst(srcprops.uncomp_len); + decompress_gzip(src, dst); + return dst; + } + if (compression == compression_type::ZIP) { + std::vector dst(srcprops.uncomp_len); + cpu_inflate_vector(dst, srcprops.comp_data, srcprops.comp_len); + return dst; + } + if (compression == compression_type::BZIP2) { + size_t src_ofs = 0; + size_t dst_ofs = 0; + int bz_err = 0; + std::vector dst(srcprops.uncomp_len); + do { + size_t dst_len = srcprops.uncomp_len - dst_ofs; + bz_err = cpu_bz2_uncompress( + srcprops.comp_data, srcprops.comp_len, dst.data() + dst_ofs, &dst_len, &src_ofs); + if (bz_err == BZ_OUTBUFF_FULL) { + // TBD: We could infer the compression ratio based on produced/consumed byte counts + // in order to minimize realloc events and over-allocation + dst_ofs = dst_len; + dst_len = srcprops.uncomp_len + (srcprops.uncomp_len / 2); + dst.resize(dst_len); + srcprops.uncomp_len = dst_len; + } else if (bz_err == 0) { + srcprops.uncomp_len = dst_len; + dst.resize(srcprops.uncomp_len); + } + } while (bz_err == BZ_OUTBUFF_FULL); + CUDF_EXPECTS(bz_err == 0, "Decompression: error in stream"); + return dst; + } + if (compression == compression_type::SNAPPY) { + std::vector dst(srcprops.uncomp_len); + decompress_snappy(src, dst); + return dst; + } + + CUDF_FAIL("Unsupported compressed stream type"); +} + +} // namespace cudf::io::detail diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu index ebca334a715..6c84b53db46 100644 --- a/cpp/src/io/csv/reader_impl.cu +++ b/cpp/src/io/csv/reader_impl.cu @@ -21,6 +21,7 @@ #include "csv_common.hpp" #include "csv_gpu.hpp" +#include "cudf/detail/utilities/cuda_memcpy.hpp" #include "io/comp/io_uncomp.hpp" #include "io/utilities/column_buffer.hpp" #include "io/utilities/hostdevice_vector.hpp" @@ -46,11 +47,8 @@ #include #include -#include #include -#include #include -#include #include #include #include @@ -88,7 +86,7 @@ class selected_rows_offsets { : all{std::move(data)}, selected{selected_span} { } - selected_rows_offsets(rmm::cuda_stream_view stream) : all{0, stream}, selected{all} {} + explicit selected_rows_offsets(rmm::cuda_stream_view stream) : all{0, stream}, selected{all} {} operator device_span() const { return selected; } void shrink(size_t size) @@ -120,47 +118,41 @@ string removeQuotes(string str, char quotechar) } /** - * @brief Parse the first row to set the column names in the raw_csv parameter. - * The first row can be either the header row, or the first data row + * @brief Parse a row of input to get the column names. The row can either be the header, or the + * first data row. If the header is not used, column names are generated automatically. 
*/ -std::vector get_column_names(std::vector const& header, +std::vector get_column_names(std::vector const& row, parse_options_view const& parse_opts, int header_row, std::string prefix) { - std::vector col_names; - - // If there is only a single character then it would be the terminator - if (header.size() <= 1) { return col_names; } - - std::vector first_row = header; + // Empty row, return empty column names vector + if (row.empty()) { return {}; } + std::vector col_names; bool quotation = false; - for (size_t pos = 0, prev = 0; pos < first_row.size(); ++pos) { + for (size_t pos = 0, prev = 0; pos < row.size(); ++pos) { // Flip the quotation flag if current character is a quotechar - if (first_row[pos] == parse_opts.quotechar) { - quotation = !quotation; - } + if (row[pos] == parse_opts.quotechar) { quotation = !quotation; } // Check if end of a column/row - else if (pos == first_row.size() - 1 || - (!quotation && first_row[pos] == parse_opts.terminator) || - (!quotation && first_row[pos] == parse_opts.delimiter)) { + if (pos == row.size() - 1 || (!quotation && row[pos] == parse_opts.terminator) || + (!quotation && row[pos] == parse_opts.delimiter)) { // This is the header, add the column name if (header_row >= 0) { // Include the current character, in case the line is not terminated int col_name_len = pos - prev + 1; // Exclude the delimiter/terminator is present - if (first_row[pos] == parse_opts.delimiter || first_row[pos] == parse_opts.terminator) { + if (row[pos] == parse_opts.delimiter || row[pos] == parse_opts.terminator) { --col_name_len; } // Also exclude '\r' character at the end of the column name if it's // part of the terminator - if (col_name_len > 0 && parse_opts.terminator == '\n' && first_row[pos] == '\n' && - first_row[pos - 1] == '\r') { + if (col_name_len > 0 && parse_opts.terminator == '\n' && row[pos] == '\n' && + row[pos - 1] == '\r') { --col_name_len; } - string const new_col_name(first_row.data() + prev, col_name_len); + string const new_col_name(row.data() + prev, col_name_len); col_names.push_back(removeQuotes(new_col_name, parse_opts.quotechar)); } else { // This is the first data row, add the automatically generated name @@ -168,14 +160,14 @@ std::vector get_column_names(std::vector const& header, } // Stop parsing when we hit the line terminator; relevant when there is - // a blank line following the header. In this case, first_row includes + // a blank line following the header. 
In this case, row includes // multiple line terminators at the end, as the new recStart belongs to // a line that comes after the blank line(s) - if (!quotation && first_row[pos] == parse_opts.terminator) { break; } + if (!quotation && row[pos] == parse_opts.terminator) { break; } // Skip adjacent delimiters if delim_whitespace is set - while (parse_opts.multi_delimiter && pos < first_row.size() && - first_row[pos] == parse_opts.delimiter && first_row[pos + 1] == parse_opts.delimiter) { + while (parse_opts.multi_delimiter && pos < row.size() && row[pos] == parse_opts.delimiter && + row[pos + 1] == parse_opts.delimiter) { ++pos; } prev = pos + 1; @@ -196,15 +188,11 @@ void erase_except_last(C& container, rmm::cuda_stream_view stream) container.resize(1, stream); } -size_t find_first_row_start(char row_terminator, host_span data) +constexpr std::array UTF8_BOM = {0xEF, 0xBB, 0xBF}; +[[nodiscard]] bool has_utf8_bom(host_span data) { - // For now, look for the first terminator (assume the first terminator isn't within a quote) - // TODO: Attempt to infer this from the data - size_t pos = 0; - while (pos < data.size() && data[pos] != row_terminator) { - ++pos; - } - return std::min(pos + 1, data.size()); + return data.size() >= UTF8_BOM.size() && + memcmp(data.data(), UTF8_BOM.data(), UTF8_BOM.size()) == 0; } /** @@ -213,20 +201,28 @@ size_t find_first_row_start(char row_terminator, host_span data) * This function scans the input data to record the row offsets (relative to the start of the * input data). A row is actually the data/offset between two termination symbols. * - * @param data Uncompressed input data in host memory - * @param range_begin Only include rows starting after this position - * @param range_end Only include rows starting before this position - * @param skip_rows Number of rows to skip from the start - * @param num_rows Number of rows to read; -1: all remaining data - * @param load_whole_file Hint that the entire data will be needed on gpu - * @param stream CUDA stream used for device memory operations and kernel launches - * @return Input data and row offsets in the device memory + * @param[in] source The source data (may be compressed) + * @param[in] reader_opts Settings for controlling reading behavior + * @param[in] parse_opts Settings for controlling parsing behavior + * @param[out] header The header row, if any + * @param[in] data Host buffer containing uncompressed data, if input is compressed + * @param[in] byte_range_offset Offset of the byte range + * @param[in] range_begin Start of the first row, relative to the byte range start + * @param[in] range_end End of the data to read, relative to the byte range start; equal to the + * data size if all data after byte_range_offset needs to be read + * @param[in] skip_rows Number of rows to skip from the start + * @param[in] num_rows Number of rows to read; -1 means all + * @param[in] load_whole_file Indicates if the whole file should be read + * @param[in] stream CUDA stream used for device memory operations and kernel launches + * @return Input data and row offsets in the device memory */ std::pair, selected_rows_offsets> load_data_and_gather_row_offsets( + cudf::io::datasource* source, csv_reader_options const& reader_opts, parse_options const& parse_opts, std::vector& header, - host_span data, + std::optional> data, + size_t byte_range_offset, size_t range_begin, size_t range_end, size_t skip_rows, @@ -235,56 +231,83 @@ std::pair, selected_rows_offsets> load_data_and_gather rmm::cuda_stream_view stream) { constexpr 
size_t max_chunk_bytes = 64 * 1024 * 1024; // 64MB - size_t buffer_size = std::min(max_chunk_bytes, data.size()); - size_t max_blocks = - std::max((buffer_size / cudf::io::csv::gpu::rowofs_block_bytes) + 1, 2); - cudf::detail::hostdevice_vector row_ctx(max_blocks, stream); - size_t buffer_pos = std::min(range_begin - std::min(range_begin, sizeof(char)), data.size()); - size_t pos = std::min(range_begin, data.size()); - size_t header_rows = (reader_opts.get_header() >= 0) ? reader_opts.get_header() + 1 : 0; - uint64_t ctx = 0; + + auto const data_size = data.has_value() ? data->size() : source->size(); + auto const buffer_size = std::min(max_chunk_bytes, data_size); + auto const max_input_size = [&] { + if (range_end == data_size) { + return data_size - byte_range_offset; + } else { + return std::min(reader_opts.get_byte_range_size_with_padding(), + data_size - byte_range_offset); + } + }(); + auto const header_rows = (reader_opts.get_header() >= 0) ? reader_opts.get_header() + 1 : 0; // For compatibility with the previous parser, a row is considered in-range if the // previous row terminator is within the given range - range_end += (range_end < data.size()); + range_end += (range_end < data_size); - // Reserve memory by allocating and then resetting the size - rmm::device_uvector d_data{ - (load_whole_file) ? data.size() : std::min(buffer_size * 2, data.size()), stream}; - d_data.resize(0, stream); + auto pos = range_begin; + // When using byte range, need the line terminator of last line before the range + auto input_pos = byte_range_offset == 0 ? pos : pos - 1; + uint64_t ctx = 0; + + rmm::device_uvector d_data{0, stream}; + d_data.reserve((load_whole_file) ? data_size : std::min(buffer_size * 2, max_input_size), stream); rmm::device_uvector all_row_offsets{0, stream}; + + auto const max_blocks = + std::max((buffer_size / cudf::io::csv::gpu::rowofs_block_bytes) + 1, 2); + cudf::detail::hostdevice_vector row_ctx(max_blocks, stream); do { - size_t target_pos = std::min(pos + max_chunk_bytes, data.size()); - size_t chunk_size = target_pos - pos; + auto const target_pos = std::min(pos + max_chunk_bytes, max_input_size); + auto const chunk_size = target_pos - pos; auto const previous_data_size = d_data.size(); - d_data.resize(target_pos - buffer_pos, stream); - CUDF_CUDA_TRY(cudaMemcpyAsync(d_data.begin() + previous_data_size, - data.begin() + buffer_pos + previous_data_size, - target_pos - buffer_pos - previous_data_size, - cudaMemcpyDefault, - stream.value())); + d_data.resize(target_pos - input_pos, stream); + + auto const read_offset = byte_range_offset + input_pos + previous_data_size; + auto const read_size = target_pos - input_pos - previous_data_size; + if (data.has_value()) { + cudf::detail::cuda_memcpy_async( + device_span{d_data.data() + previous_data_size, read_size}, + data->subspan(read_offset, read_size), + stream); + } else { + if (source->is_device_read_preferred(read_size)) { + source->device_read(read_offset, + read_size, + reinterpret_cast(d_data.data() + previous_data_size), + stream); + } else { + auto const buffer = source->host_read(read_offset, read_size); + // Use sync version to prevent buffer going out of scope before we copy the data. + cudf::detail::cuda_memcpy( + device_span{d_data.data() + previous_data_size, read_size}, + host_span{reinterpret_cast(buffer->data()), buffer->size()}, + stream); + } + } // Pass 1: Count the potential number of rows in each character block for each // possible parser state at the beginning of the block. 
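// The chunk copy above dispatches on the input layout: decompressed inputs are
// already in a host buffer and are copied directly, while uncompressed inputs
// are read straight from the datasource, preferring a device read when the
// source supports it. A minimal standalone sketch of that dispatch;
// `read_chunk_to_device` is an illustrative helper, not a cudf API, and error
// handling is omitted:

#include <cudf/io/datasource.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>

#include <cuda_runtime.h>

#include <cstddef>
#include <cstdint>

void read_chunk_to_device(cudf::io::datasource* source,
                          std::size_t offset,
                          std::size_t size,
                          rmm::device_uvector<char>& out,  // must hold at least `size` bytes
                          rmm::cuda_stream_view stream)
{
  if (source->is_device_read_preferred(size)) {
    // The source can write directly into device memory (e.g. a kvikio-backed file)
    source->device_read(offset, size, reinterpret_cast<uint8_t*>(out.data()), stream);
  } else {
    // Stage through a host buffer; synchronize so the buffer outlives the copy,
    // mirroring the patch's comment about the buffer going out of scope
    auto const buffer = source->host_read(offset, size);
    cudaMemcpyAsync(out.data(), buffer->data(), size, cudaMemcpyDefault, stream.value());
    cudaStreamSynchronize(stream.value());
  }
}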
- uint32_t num_blocks = cudf::io::csv::gpu::gather_row_offsets(parse_opts.view(), - row_ctx.device_ptr(), - device_span(), - d_data, - chunk_size, - pos, - buffer_pos, - data.size(), - range_begin, - range_end, - skip_rows, - stream); - CUDF_CUDA_TRY(cudaMemcpyAsync(row_ctx.host_ptr(), - row_ctx.device_ptr(), - num_blocks * sizeof(uint64_t), - cudaMemcpyDefault, - stream.value())); - stream.synchronize(); + auto const num_blocks = cudf::io::csv::gpu::gather_row_offsets(parse_opts.view(), + row_ctx.device_ptr(), + device_span(), + d_data, + chunk_size, + pos, + input_pos, + max_input_size, + range_begin, + range_end, + skip_rows, + stream); + + cudf::detail::cuda_memcpy(host_span{row_ctx}.subspan(0, num_blocks), + device_span{row_ctx}.subspan(0, num_blocks), + stream); // Sum up the rows in each character block, selecting the row count that // corresponds to the current input context. Also stores the now known input @@ -299,11 +322,9 @@ std::pair, selected_rows_offsets> load_data_and_gather // At least one row in range in this batch all_row_offsets.resize(total_rows - skip_rows, stream); - CUDF_CUDA_TRY(cudaMemcpyAsync(row_ctx.device_ptr(), - row_ctx.host_ptr(), - num_blocks * sizeof(uint64_t), - cudaMemcpyDefault, - stream.value())); + cudf::detail::cuda_memcpy_async(device_span{row_ctx}.subspan(0, num_blocks), + host_span{row_ctx}.subspan(0, num_blocks), + stream); // Pass 2: Output row offsets cudf::io::csv::gpu::gather_row_offsets(parse_opts.view(), @@ -312,20 +333,17 @@ std::pair, selected_rows_offsets> load_data_and_gather d_data, chunk_size, pos, - buffer_pos, - data.size(), + input_pos, + max_input_size, range_begin, range_end, skip_rows, stream); // With byte range, we want to keep only one row out of the specified range - if (range_end < data.size()) { - CUDF_CUDA_TRY(cudaMemcpyAsync(row_ctx.host_ptr(), - row_ctx.device_ptr(), - num_blocks * sizeof(uint64_t), - cudaMemcpyDefault, - stream.value())); - stream.synchronize(); + if (range_end < data_size) { + cudf::detail::cuda_memcpy(host_span{row_ctx}.subspan(0, num_blocks), + device_span{row_ctx}.subspan(0, num_blocks), + stream); size_t rows_out_of_range = 0; for (uint32_t i = 0; i < num_blocks; i++) { @@ -356,31 +374,37 @@ std::pair, selected_rows_offsets> load_data_and_gather size_t discard_bytes = std::max(d_data.size(), sizeof(char)) - sizeof(char); if (discard_bytes != 0) { erase_except_last(d_data, stream); - buffer_pos += discard_bytes; + input_pos += discard_bytes; } } pos = target_pos; - } while (pos < data.size()); + } while (pos < max_input_size); auto const non_blank_row_offsets = io::csv::gpu::remove_blank_rows(parse_opts.view(), d_data, all_row_offsets, stream); auto row_offsets = selected_rows_offsets{std::move(all_row_offsets), non_blank_row_offsets}; // Remove header rows and extract header - size_t const header_row_index = std::max(header_rows, 1) - 1; + auto const header_row_index = std::max(header_rows, 1) - 1; if (header_row_index + 1 < row_offsets.size()) { - CUDF_CUDA_TRY(cudaMemcpyAsync(row_ctx.host_ptr(), - row_offsets.data() + header_row_index, - 2 * sizeof(uint64_t), - cudaMemcpyDefault, - stream.value())); - stream.synchronize(); - - auto const header_start = buffer_pos + row_ctx[0]; - auto const header_end = buffer_pos + row_ctx[1]; - CUDF_EXPECTS(header_start <= header_end && header_end <= data.size(), + cudf::detail::cuda_memcpy(host_span{row_ctx}.subspan(0, 2), + device_span{row_offsets.data() + header_row_index, 2}, + stream); + + auto const header_start = input_pos + row_ctx[0]; + auto const 
header_end = input_pos + row_ctx[1]; + CUDF_EXPECTS(header_start <= header_end && header_end <= max_input_size, "Invalid csv header location"); - header.assign(data.begin() + header_start, data.begin() + header_end); + header.resize(header_end - header_start); + if (data.has_value()) { + std::copy(data->begin() + byte_range_offset + header_start, + data->begin() + byte_range_offset + header_end, + header.begin()); + } else { + source->host_read(header_start + byte_range_offset, + header_end - header_start, + reinterpret_cast(header.data())); + } if (header_rows > 0) { row_offsets.erase_first_n(header_rows); } } // Apply num_rows limit @@ -397,73 +421,89 @@ std::pair, selected_rows_offsets> select_data_and_row_ parse_options const& parse_opts, rmm::cuda_stream_view stream) { - auto range_offset = reader_opts.get_byte_range_offset(); - auto range_size = reader_opts.get_byte_range_size(); - auto range_size_padded = reader_opts.get_byte_range_size_with_padding(); - auto skip_rows = reader_opts.get_skiprows(); - auto skip_end_rows = reader_opts.get_skipfooter(); - auto num_rows = reader_opts.get_nrows(); + auto range_offset = reader_opts.get_byte_range_offset(); + auto range_size = reader_opts.get_byte_range_size(); + auto skip_rows = reader_opts.get_skiprows(); + auto skip_end_rows = reader_opts.get_skipfooter(); + auto num_rows = reader_opts.get_nrows(); if (range_offset > 0 || range_size > 0) { CUDF_EXPECTS(reader_opts.get_compression() == compression_type::NONE, "Reading compressed data using `byte range` is unsupported"); } + // TODO: Allow parsing the header outside the mapped range + CUDF_EXPECTS((range_offset == 0 || reader_opts.get_header() < 0), + "byte_range offset with header not supported"); - // Transfer source data to GPU - if (!source->is_empty()) { - auto buffer = - source->host_read(range_offset, range_size_padded != 0 ? range_size_padded : source->size()); - auto h_data = - host_span(reinterpret_cast(buffer->data()), buffer->size()); - - std::vector h_uncomp_data_owner; - if (reader_opts.get_compression() != compression_type::NONE) { - h_uncomp_data_owner = - decompress(reader_opts.get_compression(), {buffer->data(), buffer->size()}); - h_data = {reinterpret_cast(h_uncomp_data_owner.data()), - h_uncomp_data_owner.size()}; - buffer.reset(); - } + if (source->is_empty()) { + return {rmm::device_uvector{0, stream}, selected_rows_offsets{stream}}; + } - // check for and skip UTF-8 BOM - uint8_t const UTF8_BOM[] = {0xEF, 0xBB, 0xBF}; - if (h_data.size() >= sizeof(UTF8_BOM) && - memcmp(h_data.data(), UTF8_BOM, sizeof(UTF8_BOM)) == 0) { - h_data = h_data.subspan(sizeof(UTF8_BOM), h_data.size() - sizeof(UTF8_BOM)); - } + std::optional> h_data; + std::vector h_uncomp_data_owner; + if (reader_opts.get_compression() != compression_type::NONE) { + auto const h_comp_data = source->host_read(0, source->size()); + h_uncomp_data_owner = + decompress(reader_opts.get_compression(), {h_comp_data->data(), h_comp_data->size()}); + h_data = host_span{reinterpret_cast(h_uncomp_data_owner.data()), + h_uncomp_data_owner.size()}; + } - // None of the parameters for row selection is used, we are parsing the entire file - bool const load_whole_file = range_offset == 0 && range_size == 0 && skip_rows <= 0 && - skip_end_rows <= 0 && num_rows == -1; - - // With byte range, find the start of the first data row - size_t const data_start_offset = - (range_offset != 0) ? 
find_first_row_start(parse_opts.terminator, h_data) : 0; - - // TODO: Allow parsing the header outside the mapped range - CUDF_EXPECTS((range_offset == 0 || reader_opts.get_header() < 0), - "byte_range offset with header not supported"); - - // Gather row offsets - auto data_row_offsets = - load_data_and_gather_row_offsets(reader_opts, - parse_opts, - header, - h_data, - data_start_offset, - (range_size) ? range_size : h_data.size(), - (skip_rows > 0) ? skip_rows : 0, - num_rows, - load_whole_file, - stream); - auto& row_offsets = data_row_offsets.second; - // Exclude the rows that are to be skipped from the end - if (skip_end_rows > 0 && static_cast(skip_end_rows) < row_offsets.size()) { - row_offsets.shrink(row_offsets.size() - skip_end_rows); + size_t data_start_offset = range_offset; + if (h_data.has_value()) { + if (has_utf8_bom(*h_data)) { data_start_offset += sizeof(UTF8_BOM); } + } else { + if (range_offset == 0) { + auto bom_buffer = source->host_read(0, std::min(source->size(), sizeof(UTF8_BOM))); + auto bom_chars = host_span{reinterpret_cast(bom_buffer->data()), + bom_buffer->size()}; + if (has_utf8_bom(bom_chars)) { data_start_offset += sizeof(UTF8_BOM); } + } else { + auto find_data_start_chunk_size = 1024ul; + while (data_start_offset < source->size()) { + auto const read_size = + std::min(find_data_start_chunk_size, source->size() - data_start_offset); + auto buffer = source->host_read(data_start_offset, read_size); + auto buffer_chars = + host_span{reinterpret_cast(buffer->data()), buffer->size()}; + + if (auto first_row_start = + std::find(buffer_chars.begin(), buffer_chars.end(), parse_opts.terminator); + first_row_start != buffer_chars.end()) { + data_start_offset += std::distance(buffer_chars.begin(), first_row_start) + 1; + break; + } + data_start_offset += read_size; + find_data_start_chunk_size *= 2; + } } - return data_row_offsets; } - return {rmm::device_uvector{0, stream}, selected_rows_offsets{stream}}; + + // None of the parameters for row selection is used, we are parsing the entire file + bool const load_whole_file = + range_offset == 0 && range_size == 0 && skip_rows <= 0 && skip_end_rows <= 0 && num_rows == -1; + + // Transfer source data to GPU and gather row offsets + auto const uncomp_size = h_data.has_value() ? h_data->size() : source->size(); + auto data_row_offsets = load_data_and_gather_row_offsets(source, + reader_opts, + parse_opts, + header, + h_data, + range_offset, + data_start_offset - range_offset, + (range_size) ? range_size : uncomp_size, + (skip_rows > 0) ? 
skip_rows : 0, + num_rows, + load_whole_file, + stream); + auto& row_offsets = data_row_offsets.second; + // Exclude the rows that are to be skipped from the end + if (skip_end_rows > 0 && static_cast(skip_end_rows) < row_offsets.size()) { + row_offsets.shrink(row_offsets.size() - skip_end_rows); + } + + return data_row_offsets; } void select_data_types(host_span user_dtypes, diff --git a/cpp/src/io/csv/writer_impl.cu b/cpp/src/io/csv/writer_impl.cu index b84446b5f3e..2bbe05ced84 100644 --- a/cpp/src/io/csv/writer_impl.cu +++ b/cpp/src/io/csv/writer_impl.cu @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -405,13 +406,8 @@ void write_chunked(data_sink* out_sink, out_sink->device_write(ptr_all_bytes, total_num_bytes, stream); } else { // copy the bytes to host to write them out - thrust::host_vector h_bytes(total_num_bytes); - CUDF_CUDA_TRY(cudaMemcpyAsync(h_bytes.data(), - ptr_all_bytes, - total_num_bytes * sizeof(char), - cudaMemcpyDefault, - stream.value())); - stream.synchronize(); + auto const h_bytes = cudf::detail::make_host_vector_sync( + device_span{ptr_all_bytes, total_num_bytes}, stream); out_sink->host_write(h_bytes.data(), total_num_bytes); } diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index 0ca54da5aaf..ceaeb5d8f85 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -32,12 +32,11 @@ #include #include #include -#include #include #include -#include #include +#include namespace cudf::io { @@ -121,14 +120,14 @@ chunked_parquet_writer_options_builder chunked_parquet_writer_options::builder( namespace { std::vector> make_datasources(source_info const& info, - size_t range_offset = 0, - size_t range_size = 0) + size_t offset = 0, + size_t max_size_estimate = 0) { switch (info.type()) { case io_type::FILEPATH: { auto sources = std::vector>(); for (auto const& filepath : info.filepaths()) { - sources.emplace_back(cudf::io::datasource::create(filepath, range_offset, range_size)); + sources.emplace_back(cudf::io::datasource::create(filepath, offset, max_size_estimate)); } return sources; } @@ -852,8 +851,8 @@ void parquet_writer_options_base::set_sorting_columns(std::vector #include #include +#include #include #include #include @@ -134,12 +135,13 @@ std::vector copy_strings_to_host_sync( // build std::string vector from chars and offsets std::vector host_data; host_data.reserve(col.size()); - std::transform( - std::begin(h_offsets), - std::end(h_offsets) - 1, - std::begin(h_offsets) + 1, - std::back_inserter(host_data), - [&](auto start, auto end) { return std::string(h_chars.data() + start, end - start); }); + std::transform(std::begin(h_offsets), + std::end(h_offsets) - 1, + std::begin(h_offsets) + 1, + std::back_inserter(host_data), + [&h_chars](auto start, auto end) { + return std::string(h_chars.data() + start, end - start); + }); return host_data; }; return to_host(d_column_names->view()); @@ -170,643 +172,78 @@ rmm::device_uvector is_all_nulls_each_column(device_span rmm::device_uvector is_all_nulls(num_cols, stream); thrust::fill(rmm::exec_policy_nosync(stream), is_all_nulls.begin(), is_all_nulls.end(), true); - auto parse_opt = parsing_options(options, stream); - thrust::for_each_n( - rmm::exec_policy_nosync(stream), - thrust::counting_iterator(0), - num_nodes, - [options = parse_opt.view(), - data = input.data(), - column_categories = d_column_tree.node_categories.begin(), - col_ids = col_ids.begin(), - range_begin = tree.node_range_begin.begin(), - range_end = tree.node_range_end.begin(), - 
is_all_nulls = is_all_nulls.begin()] __device__(size_type i) { - auto const node_category = column_categories[col_ids[i]]; - if (node_category == NC_STR or node_category == NC_VAL) { - auto const is_null_literal = serialized_trie_contains( - options.trie_na, - {data + range_begin[i], static_cast(range_end[i] - range_begin[i])}); - if (!is_null_literal) is_all_nulls[col_ids[i]] = false; - } - }); - return is_all_nulls; -} - -NodeIndexT get_row_array_parent_col_id(device_span col_ids, - bool is_enabled_lines, - rmm::cuda_stream_view stream) -{ - NodeIndexT value = parent_node_sentinel; - if (!col_ids.empty()) { - auto const list_node_index = is_enabled_lines ? 0 : 1; - CUDF_CUDA_TRY(cudaMemcpyAsync(&value, - col_ids.data() + list_node_index, - sizeof(NodeIndexT), - cudaMemcpyDefault, - stream.value())); - stream.synchronize(); - } - return value; -} -/** - * @brief Holds member data pointers of `d_json_column` - * - */ -struct json_column_data { - using row_offset_t = json_column::row_offset_t; - row_offset_t* string_offsets; - row_offset_t* string_lengths; - row_offset_t* child_offsets; - bitmask_type* validity; -}; - -using hashmap_of_device_columns = - std::unordered_map>; - -std::pair, hashmap_of_device_columns> build_tree( - device_json_column& root, - host_span is_str_column_all_nulls, - tree_meta_t& d_column_tree, - device_span d_unique_col_ids, - device_span d_max_row_offsets, - std::vector const& column_names, - NodeIndexT row_array_parent_col_id, - bool is_array_of_arrays, - cudf::io::json_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr); -void scatter_offsets(tree_meta_t const& tree, - device_span col_ids, - device_span row_offsets, - device_span node_ids, - device_span sorted_col_ids, // Reuse this for parent_col_ids - tree_meta_t const& d_column_tree, - host_span ignore_vals, - hashmap_of_device_columns const& columns, - rmm::cuda_stream_view stream); - -/** - * @brief Constructs `d_json_column` from node tree representation - * Newly constructed columns are inserted into `root`'s children. - * `root` must be a list type. 
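// The removed get_row_array_parent_col_id fetched a single device value with a
// raw cudaMemcpyAsync plus an explicit synchronize; the patch replaces that
// pattern with cudf::detail::make_host_vector_sync, which stages through
// pinned memory. For contrast, a generic sketch of the old pattern using only
// the CUDA runtime API (T must be trivially copyable):

#include <cuda_runtime.h>

template <typename T>
T read_device_scalar(T const* d_ptr, cudaStream_t stream)
{
  T value{};
  cudaMemcpyAsync(&value, d_ptr, sizeof(T), cudaMemcpyDeviceToHost, stream);
  cudaStreamSynchronize(stream);  // the value is only valid once the copy completes
  return value;
}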
- * - * @param input Input JSON string device data - * @param tree Node tree representation of the JSON string - * @param col_ids Column ids of the nodes in the tree - * @param row_offsets Row offsets of the nodes in the tree - * @param root Root node of the `d_json_column` tree - * @param is_array_of_arrays Whether the tree is an array of arrays - * @param options Parsing options specifying the parsing behaviour - * options affecting behaviour are - * is_enabled_lines: Whether the input is a line-delimited JSON - * is_enabled_mixed_types_as_string: Whether to enable reading mixed types as string - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource used to allocate the device memory - * of child_offets and validity members of `d_json_column` - */ -void make_device_json_column(device_span input, - tree_meta_t const& tree, - device_span col_ids, - device_span row_offsets, - device_json_column& root, - bool is_array_of_arrays, - cudf::io::json_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - bool const is_enabled_lines = options.is_enabled_lines(); - bool const is_enabled_mixed_types_as_string = options.is_enabled_mixed_types_as_string(); - // make a copy - auto sorted_col_ids = cudf::detail::make_device_uvector_async( - col_ids, stream, cudf::get_current_device_resource_ref()); - - // sort by {col_id} on {node_ids} stable - rmm::device_uvector node_ids(col_ids.size(), stream); - thrust::sequence(rmm::exec_policy_nosync(stream), node_ids.begin(), node_ids.end()); - thrust::stable_sort_by_key(rmm::exec_policy_nosync(stream), - sorted_col_ids.begin(), - sorted_col_ids.end(), - node_ids.begin()); - - NodeIndexT const row_array_parent_col_id = - get_row_array_parent_col_id(col_ids, is_enabled_lines, stream); - - // 1. gather column information. - auto [d_column_tree, d_unique_col_ids, d_max_row_offsets] = - reduce_to_column_tree(tree, - col_ids, - sorted_col_ids, - node_ids, - row_offsets, - is_array_of_arrays, - row_array_parent_col_id, - stream); - auto num_columns = d_unique_col_ids.size(); - std::vector column_names = copy_strings_to_host_sync( - input, d_column_tree.node_range_begin, d_column_tree.node_range_end, stream); - // array of arrays column names - if (is_array_of_arrays) { - auto const unique_col_ids = cudf::detail::make_host_vector_async(d_unique_col_ids, stream); - auto const column_parent_ids = - cudf::detail::make_host_vector_async(d_column_tree.parent_node_ids, stream); - TreeDepthT const row_array_children_level = is_enabled_lines ? 1 : 2; - auto values_column_indices = - get_values_column_indices(row_array_children_level, tree, col_ids, num_columns, stream); - auto h_values_column_indices = - cudf::detail::make_host_vector_sync(values_column_indices, stream); - std::transform(unique_col_ids.begin(), - unique_col_ids.end(), - column_names.cbegin(), - column_names.begin(), - [&h_values_column_indices, &column_parent_ids, row_array_parent_col_id]( - auto col_id, auto name) mutable { - return column_parent_ids[col_id] == row_array_parent_col_id - ? 
std::to_string(h_values_column_indices[col_id]) - : name; - }); - } - - auto const is_str_column_all_nulls = [&, &column_tree = d_column_tree]() { - if (is_enabled_mixed_types_as_string) { - return cudf::detail::make_std_vector_sync( - is_all_nulls_each_column(input, column_tree, tree, col_ids, options, stream), stream); - } - return std::vector(); - }(); - auto const [ignore_vals, columns] = build_tree(root, - is_str_column_all_nulls, - d_column_tree, - d_unique_col_ids, - d_max_row_offsets, - column_names, - row_array_parent_col_id, - is_array_of_arrays, - options, - stream, - mr); - - scatter_offsets(tree, - col_ids, - row_offsets, - node_ids, - sorted_col_ids, - d_column_tree, - ignore_vals, - columns, - stream); -} - -std::pair, hashmap_of_device_columns> build_tree( - device_json_column& root, - host_span is_str_column_all_nulls, - tree_meta_t& d_column_tree, - device_span d_unique_col_ids, - device_span d_max_row_offsets, - std::vector const& column_names, - NodeIndexT row_array_parent_col_id, - bool is_array_of_arrays, - cudf::io::json_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - bool const is_enabled_mixed_types_as_string = options.is_enabled_mixed_types_as_string(); - auto unique_col_ids = cudf::detail::make_host_vector_async(d_unique_col_ids, stream); - auto column_categories = - cudf::detail::make_host_vector_async(d_column_tree.node_categories, stream); - auto const column_parent_ids = - cudf::detail::make_host_vector_async(d_column_tree.parent_node_ids, stream); - auto column_range_beg = - cudf::detail::make_host_vector_async(d_column_tree.node_range_begin, stream); - auto const max_row_offsets = cudf::detail::make_host_vector_async(d_max_row_offsets, stream); - auto num_columns = d_unique_col_ids.size(); - stream.synchronize(); - - auto to_json_col_type = [](auto category) { - switch (category) { - case NC_STRUCT: return json_col_t::StructColumn; - case NC_LIST: return json_col_t::ListColumn; - case NC_STR: [[fallthrough]]; - case NC_VAL: return json_col_t::StringColumn; - default: return json_col_t::Unknown; - } - }; - auto init_to_zero = [stream](auto& v) { - thrust::uninitialized_fill(rmm::exec_policy_nosync(stream), v.begin(), v.end(), 0); - }; - - auto initialize_json_columns = [&](auto i, auto& col, auto column_category) { - if (column_category == NC_ERR || column_category == NC_FN) { - return; - } else if (column_category == NC_VAL || column_category == NC_STR) { - col.string_offsets.resize(max_row_offsets[i] + 1, stream); - col.string_lengths.resize(max_row_offsets[i] + 1, stream); - init_to_zero(col.string_offsets); - init_to_zero(col.string_lengths); - } else if (column_category == NC_LIST) { - col.child_offsets.resize(max_row_offsets[i] + 2, stream); - init_to_zero(col.child_offsets); - } - col.num_rows = max_row_offsets[i] + 1; - col.validity = - cudf::detail::create_null_mask(col.num_rows, cudf::mask_state::ALL_NULL, stream, mr); - col.type = to_json_col_type(column_category); - }; - - auto reinitialize_as_string = [&](auto i, auto& col) { - col.string_offsets.resize(max_row_offsets[i] + 1, stream); - col.string_lengths.resize(max_row_offsets[i] + 1, stream); - init_to_zero(col.string_offsets); - init_to_zero(col.string_lengths); - col.num_rows = max_row_offsets[i] + 1; - col.validity = - cudf::detail::create_null_mask(col.num_rows, cudf::mask_state::ALL_NULL, stream, mr); - col.type = json_col_t::StringColumn; - // destroy references of all child columns after this step, by calling remove_child_columns 
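// For array-of-arrays input, the transform above renames the root list's
// children to their positional value-column index. A host-side sketch with
// plain vectors standing in for the device copies; all names here are
// illustrative, and `names` must be at least as long as `unique_col_ids`:

#include <algorithm>
#include <string>
#include <vector>

std::vector<std::string> positional_names(std::vector<int> const& unique_col_ids,
                                          std::vector<int> const& parent_ids,
                                          std::vector<int> const& value_indices,
                                          std::vector<std::string> names,  // updated copy returned
                                          int root_parent_id)
{
  // Children of the row array get "0", "1", ...; everything else keeps its name
  std::transform(unique_col_ids.begin(),
                 unique_col_ids.end(),
                 names.cbegin(),
                 names.begin(),
                 [&](int col_id, std::string const& name) {
                   return parent_ids[col_id] == root_parent_id
                            ? std::to_string(value_indices[col_id])
                            : name;
                 });
  return names;
}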
- }; - - path_from_tree tree_path{column_categories, - column_parent_ids, - column_names, - is_array_of_arrays, - row_array_parent_col_id}; - - // 2. generate nested columns tree and its device_memory - // reorder unique_col_ids w.r.t. column_range_begin for order of column to be in field order. - auto h_range_col_id_it = - thrust::make_zip_iterator(column_range_beg.begin(), unique_col_ids.begin()); - std::sort(h_range_col_id_it, h_range_col_id_it + num_columns, [](auto const& a, auto const& b) { - return thrust::get<0>(a) < thrust::get<0>(b); - }); - - // use hash map because we may skip field name's col_ids - hashmap_of_device_columns columns; - // map{parent_col_id, child_col_name}> = child_col_id, used for null value column tracking - std::map, NodeIndexT> mapped_columns; - // find column_ids which are values, but should be ignored in validity - auto ignore_vals = cudf::detail::make_host_vector(num_columns, stream); - std::fill(ignore_vals.begin(), ignore_vals.end(), false); - std::vector is_mixed_type_column(num_columns, 0); - std::vector is_pruned(num_columns, 0); - // for columns that are not mixed type but have been forced as string - std::vector forced_as_string_column(num_columns); - columns.try_emplace(parent_node_sentinel, std::ref(root)); - - std::function remove_child_columns = - [&](NodeIndexT this_col_id, device_json_column& col) { - for (auto const& col_name : col.column_order) { - auto child_id = mapped_columns[{this_col_id, col_name}]; - is_mixed_type_column[child_id] = 1; - remove_child_columns(child_id, col.child_columns.at(col_name)); - mapped_columns.erase({this_col_id, col_name}); - columns.erase(child_id); - } - col.child_columns.clear(); // their references are deleted above. - col.column_order.clear(); - }; - - auto name_and_parent_index = [&is_array_of_arrays, - &row_array_parent_col_id, - &column_parent_ids, - &column_categories, - &column_names](auto this_col_id) { - std::string name = ""; - auto parent_col_id = column_parent_ids[this_col_id]; - if (parent_col_id == parent_node_sentinel || column_categories[parent_col_id] == NC_LIST) { - if (is_array_of_arrays && parent_col_id == row_array_parent_col_id) { - name = column_names[this_col_id]; - } else { - name = list_child_name; - } - } else if (column_categories[parent_col_id] == NC_FN) { - auto field_name_col_id = parent_col_id; - parent_col_id = column_parent_ids[parent_col_id]; - name = column_names[field_name_col_id]; - } else { - CUDF_FAIL("Unexpected parent column category"); - } - return std::pair{name, parent_col_id}; - }; - - // Prune columns that are not required to be parsed. - if (options.is_enabled_prune_columns()) { - for (auto const this_col_id : unique_col_ids) { - if (column_categories[this_col_id] == NC_ERR || column_categories[this_col_id] == NC_FN) { - continue; - } - // Struct, List, String, Value - auto [name, parent_col_id] = name_and_parent_index(this_col_id); - // get path of this column, and get its dtype if present in options - auto const nt = tree_path.get_path(this_col_id); - std::optional const user_dtype = get_path_data_type(nt, options); - if (!user_dtype.has_value() and parent_col_id != parent_node_sentinel) { - is_pruned[this_col_id] = 1; - continue; - } else { - // make sure all its parents are not pruned. - while (parent_col_id != parent_node_sentinel and is_pruned[parent_col_id] == 1) { - is_pruned[parent_col_id] = 0; - parent_col_id = column_parent_ids[parent_col_id]; - } - } - } - } - - // Build the column tree, also, handles mixed types. 
- for (auto const this_col_id : unique_col_ids) { - if (column_categories[this_col_id] == NC_ERR || column_categories[this_col_id] == NC_FN) { - continue; - } - // Struct, List, String, Value - auto [name, parent_col_id] = name_and_parent_index(this_col_id); - - // if parent is mixed type column or this column is pruned or if parent - // has been forced as string, ignore this column. - if (parent_col_id != parent_node_sentinel && - (is_mixed_type_column[parent_col_id] || is_pruned[this_col_id]) || - forced_as_string_column[parent_col_id]) { - ignore_vals[this_col_id] = true; - if (is_mixed_type_column[parent_col_id]) { is_mixed_type_column[this_col_id] = 1; } - if (forced_as_string_column[parent_col_id]) { forced_as_string_column[this_col_id] = true; } - continue; - } - - // If the child is already found, - // replace if this column is a nested column and the existing was a value column - // ignore this column if this column is a value column and the existing was a nested column - auto it = columns.find(parent_col_id); - CUDF_EXPECTS(it != columns.end(), "Parent column not found"); - auto& parent_col = it->second.get(); - bool replaced = false; - if (mapped_columns.count({parent_col_id, name}) > 0) { - auto const old_col_id = mapped_columns[{parent_col_id, name}]; - // If mixed type as string is enabled, make both of them strings and merge them. - // All child columns will be ignored when parsing. - if (is_enabled_mixed_types_as_string) { - bool const is_mixed_type = [&]() { - // If new or old is STR and they are all not null, make it mixed type, else ignore. - if (column_categories[this_col_id] == NC_VAL || - column_categories[this_col_id] == NC_STR) { - if (is_str_column_all_nulls[this_col_id]) return false; - } - if (column_categories[old_col_id] == NC_VAL || column_categories[old_col_id] == NC_STR) { - if (is_str_column_all_nulls[old_col_id]) return false; - } - return true; - }(); - if (is_mixed_type) { - is_mixed_type_column[this_col_id] = 1; - is_mixed_type_column[old_col_id] = 1; - // if old col type (not cat) is list or struct, replace with string. - auto& col = columns.at(old_col_id).get(); - if (col.type == json_col_t::ListColumn or col.type == json_col_t::StructColumn) { - reinitialize_as_string(old_col_id, col); - remove_child_columns(old_col_id, col); - // all its children (which are already inserted) are ignored later. 
- } - col.forced_as_string_column = true; - columns.try_emplace(this_col_id, columns.at(old_col_id)); - continue; - } - } - - if (column_categories[this_col_id] == NC_VAL || column_categories[this_col_id] == NC_STR) { - ignore_vals[this_col_id] = true; - continue; - } - if (column_categories[old_col_id] == NC_VAL || column_categories[old_col_id] == NC_STR) { - // remap - ignore_vals[old_col_id] = true; - mapped_columns.erase({parent_col_id, name}); - columns.erase(old_col_id); - parent_col.child_columns.erase(name); - replaced = true; // to skip duplicate name in column_order - } else { - // If this is a nested column but we're trying to insert either (a) a list node into a - // struct column or (b) a struct node into a list column, we fail - CUDF_EXPECTS(not((column_categories[old_col_id] == NC_LIST and - column_categories[this_col_id] == NC_STRUCT) or - (column_categories[old_col_id] == NC_STRUCT and - column_categories[this_col_id] == NC_LIST)), - "A mix of lists and structs within the same column is not supported"); - } - } - - auto this_column_category = column_categories[this_col_id]; - // get path of this column, check if it is a struct/list forced as string, and enforce it - auto const nt = tree_path.get_path(this_col_id); - std::optional const user_dtype = get_path_data_type(nt, options); - if ((column_categories[this_col_id] == NC_STRUCT or - column_categories[this_col_id] == NC_LIST) and - user_dtype.has_value() and user_dtype.value().id() == type_id::STRING) { - this_column_category = NC_STR; - } - - CUDF_EXPECTS(parent_col.child_columns.count(name) == 0, "duplicate column name: " + name); - // move into parent - device_json_column col(stream, mr); - initialize_json_columns(this_col_id, col, this_column_category); - if ((column_categories[this_col_id] == NC_STRUCT or - column_categories[this_col_id] == NC_LIST) and - user_dtype.has_value() and user_dtype.value().id() == type_id::STRING) { - col.forced_as_string_column = true; - forced_as_string_column[this_col_id] = true; - } - - auto inserted = parent_col.child_columns.try_emplace(name, std::move(col)).second; - CUDF_EXPECTS(inserted, "child column insertion failed, duplicate column name in the parent"); - if (not replaced) parent_col.column_order.push_back(name); - columns.try_emplace(this_col_id, std::ref(parent_col.child_columns.at(name))); - mapped_columns.try_emplace(std::make_pair(parent_col_id, name), this_col_id); - } - - if (is_enabled_mixed_types_as_string) { - // ignore all children of mixed type columns - for (auto const this_col_id : unique_col_ids) { - auto parent_col_id = column_parent_ids[this_col_id]; - if (parent_col_id != parent_node_sentinel and is_mixed_type_column[parent_col_id] == 1) { - is_mixed_type_column[this_col_id] = 1; - ignore_vals[this_col_id] = true; - columns.erase(this_col_id); - } - // Convert only mixed type columns as string (so to copy), but not its children - if (parent_col_id != parent_node_sentinel and is_mixed_type_column[parent_col_id] == 0 and - is_mixed_type_column[this_col_id] == 1) - column_categories[this_col_id] = NC_STR; - } - cudf::detail::cuda_memcpy_async(d_column_tree.node_categories.begin(), - column_categories.data(), - column_categories.size() * sizeof(column_categories[0]), - cudf::detail::host_memory_kind::PAGEABLE, - stream); - } - - // ignore all children of columns forced as string - for (auto const this_col_id : unique_col_ids) { - auto parent_col_id = column_parent_ids[this_col_id]; - if (parent_col_id != parent_node_sentinel and 
forced_as_string_column[parent_col_id]) { - forced_as_string_column[this_col_id] = true; - ignore_vals[this_col_id] = true; - } - // Convert only mixed type columns as string (so to copy), but not its children - if (parent_col_id != parent_node_sentinel and not forced_as_string_column[parent_col_id] and - forced_as_string_column[this_col_id]) - column_categories[this_col_id] = NC_STR; - } - cudf::detail::cuda_memcpy_async(d_column_tree.node_categories.begin(), - column_categories.data(), - column_categories.size() * sizeof(column_categories[0]), - cudf::detail::host_memory_kind::PAGEABLE, - stream); - - // restore unique_col_ids order - std::sort(h_range_col_id_it, h_range_col_id_it + num_columns, [](auto const& a, auto const& b) { - return thrust::get<1>(a) < thrust::get<1>(b); - }); - return {ignore_vals, columns}; -} - -void scatter_offsets(tree_meta_t const& tree, - device_span col_ids, - device_span row_offsets, - device_span node_ids, - device_span sorted_col_ids, // Reuse this for parent_col_ids - tree_meta_t const& d_column_tree, - host_span ignore_vals, - hashmap_of_device_columns const& columns, - rmm::cuda_stream_view stream) -{ - auto const num_nodes = col_ids.size(); - auto const num_columns = d_column_tree.node_categories.size(); - // move columns data to device. - auto columns_data = cudf::detail::make_host_vector(num_columns, stream); - for (auto& [col_id, col_ref] : columns) { - if (col_id == parent_node_sentinel) continue; - auto& col = col_ref.get(); - columns_data[col_id] = json_column_data{col.string_offsets.data(), - col.string_lengths.data(), - col.child_offsets.data(), - static_cast(col.validity.data())}; - } - - auto d_ignore_vals = cudf::detail::make_device_uvector_async( - ignore_vals, stream, cudf::get_current_device_resource_ref()); - auto d_columns_data = cudf::detail::make_device_uvector_async( - columns_data, stream, cudf::get_current_device_resource_ref()); - - // 3. scatter string offsets to respective columns, set validity bits - thrust::for_each_n( - rmm::exec_policy_nosync(stream), - thrust::counting_iterator(0), - num_nodes, - [column_categories = d_column_tree.node_categories.begin(), - col_ids = col_ids.begin(), - row_offsets = row_offsets.begin(), - range_begin = tree.node_range_begin.begin(), - range_end = tree.node_range_end.begin(), - d_ignore_vals = d_ignore_vals.begin(), - d_columns_data = d_columns_data.begin()] __device__(size_type i) { - if (d_ignore_vals[col_ids[i]]) return; - auto const node_category = column_categories[col_ids[i]]; - switch (node_category) { - case NC_STRUCT: set_bit(d_columns_data[col_ids[i]].validity, row_offsets[i]); break; - case NC_LIST: set_bit(d_columns_data[col_ids[i]].validity, row_offsets[i]); break; - case NC_STR: [[fallthrough]]; - case NC_VAL: - if (d_ignore_vals[col_ids[i]]) break; - set_bit(d_columns_data[col_ids[i]].validity, row_offsets[i]); - d_columns_data[col_ids[i]].string_offsets[row_offsets[i]] = range_begin[i]; - d_columns_data[col_ids[i]].string_lengths[row_offsets[i]] = range_end[i] - range_begin[i]; - break; - default: break; - } - }); - - // 4. scatter List offset - // copy_if only node's whose parent is list, (node_id, parent_col_id) - // stable_sort by parent_col_id of {node_id}. - // For all unique parent_node_id of (i==0, i-1!=i), write start offset. - // (i==last, i+1!=i), write end offset. 
- // unique_copy_by_key {parent_node_id} {row_offset} to - // col[parent_col_id].child_offsets[row_offset[parent_node_id]] - - auto& parent_col_ids = sorted_col_ids; // reuse sorted_col_ids - auto parent_col_id = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), - cuda::proclaim_return_type( - [col_ids = col_ids.begin(), - parent_node_ids = tree.parent_node_ids.begin()] __device__(size_type node_id) { - return parent_node_ids[node_id] == parent_node_sentinel ? parent_node_sentinel - : col_ids[parent_node_ids[node_id]]; - })); - auto const list_children_end = thrust::copy_if( - rmm::exec_policy_nosync(stream), - thrust::make_zip_iterator(thrust::make_counting_iterator(0), parent_col_id), - thrust::make_zip_iterator(thrust::make_counting_iterator(0), parent_col_id) + - num_nodes, - thrust::make_counting_iterator(0), - thrust::make_zip_iterator(node_ids.begin(), parent_col_ids.begin()), - [d_ignore_vals = d_ignore_vals.begin(), - parent_node_ids = tree.parent_node_ids.begin(), - column_categories = d_column_tree.node_categories.begin(), - col_ids = col_ids.begin()] __device__(size_type node_id) { - auto parent_node_id = parent_node_ids[node_id]; - return parent_node_id != parent_node_sentinel and - column_categories[col_ids[parent_node_id]] == NC_LIST and - (!d_ignore_vals[col_ids[parent_node_id]]); - }); - - auto const num_list_children = - list_children_end - thrust::make_zip_iterator(node_ids.begin(), parent_col_ids.begin()); - thrust::stable_sort_by_key(rmm::exec_policy_nosync(stream), - parent_col_ids.begin(), - parent_col_ids.begin() + num_list_children, - node_ids.begin()); + auto parse_opt = parsing_options(options, stream); thrust::for_each_n( rmm::exec_policy_nosync(stream), - thrust::make_counting_iterator(0), - num_list_children, - [node_ids = node_ids.begin(), - parent_node_ids = tree.parent_node_ids.begin(), - parent_col_ids = parent_col_ids.begin(), - row_offsets = row_offsets.begin(), - d_columns_data = d_columns_data.begin(), - num_list_children] __device__(size_type i) { - auto const node_id = node_ids[i]; - auto const parent_node_id = parent_node_ids[node_id]; - // scatter to list_offset - if (i == 0 or parent_node_ids[node_ids[i - 1]] != parent_node_id) { - d_columns_data[parent_col_ids[i]].child_offsets[row_offsets[parent_node_id]] = - row_offsets[node_id]; - } - // last value of list child_offset is its size. - if (i == num_list_children - 1 or parent_node_ids[node_ids[i + 1]] != parent_node_id) { - d_columns_data[parent_col_ids[i]].child_offsets[row_offsets[parent_node_id] + 1] = - row_offsets[node_id] + 1; + thrust::counting_iterator(0), + num_nodes, + [options = parse_opt.view(), + data = input.data(), + column_categories = d_column_tree.node_categories.begin(), + col_ids = col_ids.begin(), + range_begin = tree.node_range_begin.begin(), + range_end = tree.node_range_end.begin(), + is_all_nulls = is_all_nulls.begin()] __device__(size_type i) { + auto const node_category = column_categories[col_ids[i]]; + if (node_category == NC_STR or node_category == NC_VAL) { + auto const is_null_literal = serialized_trie_contains( + options.trie_na, + {data + range_begin[i], static_cast(range_end[i] - range_begin[i])}); + if (!is_null_literal) is_all_nulls[col_ids[i]] = false; } }); + return is_all_nulls; +} - // 5. scan on offsets. 
- for (auto& [id, col_ref] : columns) { - auto& col = col_ref.get(); - if (col.type == json_col_t::StringColumn) { - thrust::inclusive_scan(rmm::exec_policy_nosync(stream), - col.string_offsets.begin(), - col.string_offsets.end(), - col.string_offsets.begin(), - thrust::maximum{}); - } else if (col.type == json_col_t::ListColumn) { - thrust::inclusive_scan(rmm::exec_policy_nosync(stream), - col.child_offsets.begin(), - col.child_offsets.end(), - col.child_offsets.begin(), - thrust::maximum{}); - } - } - stream.synchronize(); +NodeIndexT get_row_array_parent_col_id(device_span col_ids, + bool is_enabled_lines, + rmm::cuda_stream_view stream) +{ + if (col_ids.empty()) { return parent_node_sentinel; } + + auto const list_node_index = is_enabled_lines ? 0 : 1; + auto const value = cudf::detail::make_host_vector_sync( + device_span{col_ids.data() + list_node_index, 1}, stream); + + return value[0]; } +/** + * @brief Holds member data pointers of `d_json_column` + * + */ +struct json_column_data { + using row_offset_t = json_column::row_offset_t; + row_offset_t* string_offsets; + row_offset_t* string_lengths; + row_offset_t* child_offsets; + bitmask_type* validity; +}; + +using hashmap_of_device_columns = + std::unordered_map>; + +std::pair, hashmap_of_device_columns> build_tree( + device_json_column& root, + host_span is_str_column_all_nulls, + tree_meta_t& d_column_tree, + device_span d_unique_col_ids, + device_span d_max_row_offsets, + std::vector const& column_names, + NodeIndexT row_array_parent_col_id, + bool is_array_of_arrays, + cudf::io::json_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); -namespace experimental { +void scatter_offsets(tree_meta_t const& tree, + device_span col_ids, + device_span row_offsets, + device_span node_ids, + device_span sorted_col_ids, // Reuse this for parent_col_ids + tree_meta_t const& d_column_tree, + host_span ignore_vals, + hashmap_of_device_columns const& columns, + rmm::cuda_stream_view stream); std::map unified_schema(cudf::io::json_reader_options const& options) { @@ -832,23 +269,11 @@ std::map unified_schema(cudf::io::json_reader_optio }); return dnew; }, - [](std::map const& user_dtypes) { return user_dtypes; }}, + [](std::map const& user_dtypes) { return user_dtypes; }, + [](schema_element const& user_dtypes) { return user_dtypes.child_types; }}, options.get_dtypes()); } -std::pair, hashmap_of_device_columns> build_tree( - device_json_column& root, - host_span is_str_column_all_nulls, - tree_meta_t& d_column_tree, - device_span d_unique_col_ids, - device_span d_max_row_offsets, - std::vector const& column_names, - NodeIndexT row_array_parent_col_id, - bool is_array_of_arrays, - cudf::io::json_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr); - /** * @brief Constructs `d_json_column` from node tree representation * Newly constructed columns are inserted into `root`'s children. @@ -1040,7 +465,7 @@ std::pair, hashmap_of_device_columns> build_tree std::fill_n(is_pruned.begin(), num_columns, options.is_enabled_prune_columns()); // prune all children of a column, but not self. 
- auto ignore_all_children = [&](auto parent_col_id) { + auto ignore_all_children = [&adj, &is_pruned](auto parent_col_id) { std::deque offspring; if (adj.count(parent_col_id)) { for (auto const& child : adj[parent_col_id]) { @@ -1068,7 +493,7 @@ std::pair, hashmap_of_device_columns> build_tree auto expected_types = cudf::detail::make_host_vector(num_columns, stream); std::fill_n(expected_types.begin(), num_columns, NUM_NODE_CLASSES); - auto lookup_names = [&column_names](auto child_ids, auto name) { + auto lookup_names = [&column_names](auto const& child_ids, auto const& name) { for (auto const& child_id : child_ids) { if (column_names[child_id] == name) return child_id; } @@ -1145,7 +570,7 @@ std::pair, hashmap_of_device_columns> build_tree for (size_t i = 0; i < adj[root_list_col_id].size() && i < user_dtypes.size(); i++) { NodeIndexT const first_child_id = adj[root_list_col_id][i]; - auto name = column_names[first_child_id]; + auto const& name = column_names[first_child_id]; auto value_id = std::stol(name); if (value_id >= 0 and value_id < static_cast(user_dtypes.size())) mark_is_pruned(first_child_id, schema_element{user_dtypes[value_id]}); @@ -1156,7 +581,7 @@ std::pair, hashmap_of_device_columns> build_tree std::map const& user_dtypes) -> void { for (size_t i = 0; i < adj[root_list_col_id].size(); i++) { auto const first_child_id = adj[root_list_col_id][i]; - auto name = column_names[first_child_id]; + auto const& name = column_names[first_child_id]; if (user_dtypes.count(name)) mark_is_pruned(first_child_id, schema_element{user_dtypes.at(name)}); } @@ -1165,10 +590,19 @@ std::pair, hashmap_of_device_columns> build_tree std::map const& user_dtypes) -> void { for (size_t i = 0; i < adj[root_list_col_id].size(); i++) { auto const first_child_id = adj[root_list_col_id][i]; - auto name = column_names[first_child_id]; + auto const& name = column_names[first_child_id]; if (user_dtypes.count(name)) mark_is_pruned(first_child_id, user_dtypes.at(name)); } + }, + [&root_list_col_id, &adj, &mark_is_pruned, &column_names]( + schema_element const& user_dtypes) -> void { + for (size_t i = 0; i < adj[root_list_col_id].size(); i++) { + auto const first_child_id = adj[root_list_col_id][i]; + auto const& name = column_names[first_child_id]; + if (user_dtypes.child_types.count(name) != 0) + mark_is_pruned(first_child_id, user_dtypes.child_types.at(name)); + } }}, options.get_dtypes()); } else { @@ -1202,7 +636,9 @@ std::pair, hashmap_of_device_columns> build_tree [&root_struct_col_id, &adj, &mark_is_pruned, &u_schema]( std::map const& user_dtypes) -> void { mark_is_pruned(root_struct_col_id, u_schema); - }}, + }, + [&root_struct_col_id, &adj, &mark_is_pruned, &u_schema](schema_element const& user_dtypes) + -> void { mark_is_pruned(root_struct_col_id, u_schema); }}, options.get_dtypes()); } // Useful for array of arrays @@ -1290,7 +726,7 @@ std::pair, hashmap_of_device_columns> build_tree if (expected_category == NC_STRUCT) { // find field column ids, and its children and create columns. 
for (auto const& field_id : child_ids) { - auto name = column_names[field_id]; + auto const& name = column_names[field_id]; if (is_pruned[field_id]) continue; auto inserted = ref.get().child_columns.try_emplace(name, device_json_column(stream, mr)).second; @@ -1321,7 +757,7 @@ std::pair, hashmap_of_device_columns> build_tree std::map> array_values; for (auto const& child_id : child_ids) { if (is_pruned[child_id]) continue; - auto name = column_names[child_id]; + auto const& name = column_names[child_id]; array_values[std::stoi(name)].push_back(child_id); } // @@ -1391,14 +827,149 @@ std::pair, hashmap_of_device_columns> build_tree column_categories.cbegin(), expected_types.begin(), [](auto exp, auto cat) { return exp == NUM_NODE_CLASSES ? cat : exp; }); - cudaMemcpyAsync(d_column_tree.node_categories.begin(), - expected_types.data(), - expected_types.size() * sizeof(column_categories[0]), - cudaMemcpyDefault, - stream.value()); + cudf::detail::cuda_memcpy_async(d_column_tree.node_categories, expected_types, stream); return {is_pruned, columns}; } -} // namespace experimental + +void scatter_offsets(tree_meta_t const& tree, + device_span col_ids, + device_span row_offsets, + device_span node_ids, + device_span sorted_col_ids, // Reuse this for parent_col_ids + tree_meta_t const& d_column_tree, + host_span ignore_vals, + hashmap_of_device_columns const& columns, + rmm::cuda_stream_view stream) +{ + auto const num_nodes = col_ids.size(); + auto const num_columns = d_column_tree.node_categories.size(); + // move columns data to device. + auto columns_data = cudf::detail::make_host_vector(num_columns, stream); + for (auto& [col_id, col_ref] : columns) { + if (col_id == parent_node_sentinel) continue; + auto& col = col_ref.get(); + columns_data[col_id] = json_column_data{col.string_offsets.data(), + col.string_lengths.data(), + col.child_offsets.data(), + static_cast(col.validity.data())}; + } + + auto d_ignore_vals = cudf::detail::make_device_uvector_async( + ignore_vals, stream, cudf::get_current_device_resource_ref()); + auto d_columns_data = cudf::detail::make_device_uvector_async( + columns_data, stream, cudf::get_current_device_resource_ref()); + + // 3. scatter string offsets to respective columns, set validity bits + thrust::for_each_n( + rmm::exec_policy_nosync(stream), + thrust::counting_iterator(0), + num_nodes, + [column_categories = d_column_tree.node_categories.begin(), + col_ids = col_ids.begin(), + row_offsets = row_offsets.begin(), + range_begin = tree.node_range_begin.begin(), + range_end = tree.node_range_end.begin(), + d_ignore_vals = d_ignore_vals.begin(), + d_columns_data = d_columns_data.begin()] __device__(size_type i) { + if (d_ignore_vals[col_ids[i]]) return; + auto const node_category = column_categories[col_ids[i]]; + switch (node_category) { + case NC_STRUCT: set_bit(d_columns_data[col_ids[i]].validity, row_offsets[i]); break; + case NC_LIST: set_bit(d_columns_data[col_ids[i]].validity, row_offsets[i]); break; + case NC_STR: [[fallthrough]]; + case NC_VAL: + if (d_ignore_vals[col_ids[i]]) break; + set_bit(d_columns_data[col_ids[i]].validity, row_offsets[i]); + d_columns_data[col_ids[i]].string_offsets[row_offsets[i]] = range_begin[i]; + d_columns_data[col_ids[i]].string_lengths[row_offsets[i]] = range_end[i] - range_begin[i]; + break; + default: break; + } + }); + + // 4. scatter List offset + // copy_if only node's whose parent is list, (node_id, parent_col_id) + // stable_sort by parent_col_id of {node_id}. 
+ // For all unique parent_node_id of (i==0, i-1!=i), write start offset. + // (i==last, i+1!=i), write end offset. + // unique_copy_by_key {parent_node_id} {row_offset} to + // col[parent_col_id].child_offsets[row_offset[parent_node_id]] + + auto& parent_col_ids = sorted_col_ids; // reuse sorted_col_ids + auto parent_col_id = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + cuda::proclaim_return_type( + [col_ids = col_ids.begin(), + parent_node_ids = tree.parent_node_ids.begin()] __device__(size_type node_id) { + return parent_node_ids[node_id] == parent_node_sentinel ? parent_node_sentinel + : col_ids[parent_node_ids[node_id]]; + })); + auto const list_children_end = thrust::copy_if( + rmm::exec_policy_nosync(stream), + thrust::make_zip_iterator(thrust::make_counting_iterator(0), parent_col_id), + thrust::make_zip_iterator(thrust::make_counting_iterator(0), parent_col_id) + + num_nodes, + thrust::make_counting_iterator(0), + thrust::make_zip_iterator(node_ids.begin(), parent_col_ids.begin()), + [d_ignore_vals = d_ignore_vals.begin(), + parent_node_ids = tree.parent_node_ids.begin(), + column_categories = d_column_tree.node_categories.begin(), + col_ids = col_ids.begin()] __device__(size_type node_id) { + auto parent_node_id = parent_node_ids[node_id]; + return parent_node_id != parent_node_sentinel and + column_categories[col_ids[parent_node_id]] == NC_LIST and + (!d_ignore_vals[col_ids[parent_node_id]]); + }); + + auto const num_list_children = + list_children_end - thrust::make_zip_iterator(node_ids.begin(), parent_col_ids.begin()); + thrust::stable_sort_by_key(rmm::exec_policy_nosync(stream), + parent_col_ids.begin(), + parent_col_ids.begin() + num_list_children, + node_ids.begin()); + thrust::for_each_n( + rmm::exec_policy_nosync(stream), + thrust::make_counting_iterator(0), + num_list_children, + [node_ids = node_ids.begin(), + parent_node_ids = tree.parent_node_ids.begin(), + parent_col_ids = parent_col_ids.begin(), + row_offsets = row_offsets.begin(), + d_columns_data = d_columns_data.begin(), + num_list_children] __device__(size_type i) { + auto const node_id = node_ids[i]; + auto const parent_node_id = parent_node_ids[node_id]; + // scatter to list_offset + if (i == 0 or parent_node_ids[node_ids[i - 1]] != parent_node_id) { + d_columns_data[parent_col_ids[i]].child_offsets[row_offsets[parent_node_id]] = + row_offsets[node_id]; + } + // last value of list child_offset is its size. + if (i == num_list_children - 1 or parent_node_ids[node_ids[i + 1]] != parent_node_id) { + d_columns_data[parent_col_ids[i]].child_offsets[row_offsets[parent_node_id] + 1] = + row_offsets[node_id] + 1; + } + }); + + // 5. scan on offsets. 
+ for (auto& [id, col_ref] : columns) { + auto& col = col_ref.get(); + if (col.type == json_col_t::StringColumn) { + thrust::inclusive_scan(rmm::exec_policy_nosync(stream), + col.string_offsets.begin(), + col.string_offsets.end(), + col.string_offsets.begin(), + thrust::maximum{}); + } else if (col.type == json_col_t::ListColumn) { + thrust::inclusive_scan(rmm::exec_policy_nosync(stream), + col.child_offsets.begin(), + col.child_offsets.end(), + col.child_offsets.begin(), + thrust::maximum{}); + } + } + stream.synchronize(); +} } // namespace cudf::io::json::detail diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index 912e93d52ae..30a154fdda2 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -399,9 +399,9 @@ std::pair, std::vector> device_json_co // - String columns will be returned as nullable, iff there's at least one null entry if (col->null_count() == 0) { col->set_null_mask(rmm::device_buffer{0, stream, mr}, 0); } - // For string columns return ["offsets", "char"] schema + // For string columns return ["offsets"] schema if (target_type.id() == type_id::STRING) { - return {std::move(col), std::vector{{"offsets"}, {"chars"}}}; + return {std::move(col), std::vector{{"offsets"}}}; } // Non-string leaf-columns (e.g., numeric) do not have child columns in the schema return {std::move(col), std::vector{}}; @@ -410,12 +410,37 @@ std::pair, std::vector> device_json_co std::vector> child_columns; std::vector column_names{}; size_type num_rows{json_col.num_rows}; + + bool const has_column_order = + prune_columns and not schema.value_or(schema_element{}) + .column_order.value_or(std::vector{}) + .empty(); + + auto const& col_order = + has_column_order ? schema.value().column_order.value() : json_col.column_order; + // Create children columns - for (auto const& col_name : json_col.column_order) { - auto const& col = json_col.child_columns.find(col_name); - column_names.emplace_back(col->first); - auto& child_col = col->second; + for (auto const& col_name : col_order) { auto child_schema_element = get_child_schema(col_name); + auto const found_it = json_col.child_columns.find(col_name); + + if (prune_columns and found_it == std::end(json_col.child_columns)) { + CUDF_EXPECTS(child_schema_element.has_value(), + "Column name not found in input schema map, but present in column order and " + "prune_columns is enabled"); + column_names.emplace_back(make_column_name_info( + child_schema_element.value_or(schema_element{data_type{type_id::EMPTY}}), col_name)); + auto all_null_column = make_all_nulls_column( + child_schema_element.value_or(schema_element{data_type{type_id::EMPTY}}), + num_rows, + stream, + mr); + child_columns.emplace_back(std::move(all_null_column)); + continue; + } + column_names.emplace_back(found_it->first); + + auto& child_col = found_it->second; if (!prune_columns or child_schema_element.has_value()) { auto [child_column, names] = device_json_column_to_cudf_column( child_col, d_input, options, prune_columns, child_schema_element, stream, mr); @@ -485,16 +510,6 @@ std::pair, std::vector> device_json_co } } -template -auto make_device_json_column_dispatch(bool experimental, Args&&... 
args) -{ - if (experimental) { - return experimental::make_device_json_column(std::forward(args)...); - } else { - return make_device_json_column(std::forward(args)...); - } -} - table_with_metadata device_parse_nested_json(device_span d_input, cudf::io::json_reader_options const& options, rmm::cuda_stream_view stream, @@ -523,16 +538,14 @@ table_with_metadata device_parse_nested_json(device_span d_input, #endif bool const is_array_of_arrays = [&]() { - std::array h_node_categories = {NC_ERR, NC_ERR}; - auto const size_to_copy = std::min(size_t{2}, gpu_tree.node_categories.size()); - CUDF_CUDA_TRY(cudaMemcpyAsync(h_node_categories.data(), - gpu_tree.node_categories.data(), - sizeof(node_t) * size_to_copy, - cudaMemcpyDefault, - stream.value())); - stream.synchronize(); + auto const size_to_copy = std::min(size_t{2}, gpu_tree.node_categories.size()); + if (size_to_copy == 0) return false; + auto const h_node_categories = cudf::detail::make_host_vector_sync( + device_span{gpu_tree.node_categories.data(), size_to_copy}, stream); + if (options.is_enabled_lines()) return h_node_categories[0] == NC_LIST; - return h_node_categories[0] == NC_LIST and h_node_categories[1] == NC_LIST; + return h_node_categories.size() >= 2 and h_node_categories[0] == NC_LIST and + h_node_categories[1] == NC_LIST; }(); auto [gpu_col_id, gpu_row_offsets] = @@ -553,16 +566,15 @@ table_with_metadata device_parse_nested_json(device_span d_input, 0); // Get internal JSON column - make_device_json_column_dispatch(options.is_enabled_experimental(), - d_input, - gpu_tree, - gpu_col_id, - gpu_row_offsets, - root_column, - is_array_of_arrays, - options, - stream, - mr); + make_device_json_column(d_input, + gpu_tree, + gpu_col_id, + gpu_row_offsets, + root_column, + is_array_of_arrays, + options, + stream, + mr); // data_root refers to the root column of the data represented by the given JSON string auto& data_root = @@ -589,11 +601,21 @@ table_with_metadata device_parse_nested_json(device_span d_input, std::vector out_column_names; auto parse_opt = parsing_options(options, stream); - // Iterate over the struct's child columns and convert to cudf column - size_type column_index = 0; - for (auto const& col_name : root_struct_col.column_order) { - auto& json_col = root_struct_col.child_columns.find(col_name)->second; + schema_element const* prune_schema = std::get_if(&options.get_dtypes()); + bool const has_column_order = options.is_enabled_prune_columns() and prune_schema != nullptr and + prune_schema->column_order.has_value() and + not prune_schema->column_order->empty(); + auto const& col_order = + has_column_order ? prune_schema->column_order.value() : root_struct_col.column_order; + if (has_column_order) { + CUDF_EXPECTS(prune_schema->child_types.size() == col_order.size(), + "Input schema column order size mismatch with input schema child types"); + } + auto root_col_size = root_struct_col.num_rows; + // Iterate over the struct's child columns/column_order and convert to cudf column + size_type column_index = 0; + for (auto const& col_name : col_order) { std::optional child_schema_element = std::visit( cudf::detail::visitor_overload{ [column_index](std::vector const& user_dtypes) -> std::optional { @@ -603,17 +625,23 @@ table_with_metadata device_parse_nested_json(device_span d_input, }, [col_name]( std::map const& user_dtypes) -> std::optional { - return (user_dtypes.find(col_name) != std::end(user_dtypes)) - ? 
std::optional{{user_dtypes.find(col_name)->second}} - : std::optional{}; + if (auto it = user_dtypes.find(col_name); it != std::end(user_dtypes)) + return std::optional{{it->second}}; + return std::nullopt; }, [col_name](std::map const& user_dtypes) -> std::optional { - return (user_dtypes.find(col_name) != std::end(user_dtypes)) - ? user_dtypes.find(col_name)->second - : std::optional{}; + if (auto it = user_dtypes.find(col_name); it != std::end(user_dtypes)) return it->second; + return std::nullopt; + }, + [col_name](schema_element const& user_dtypes) -> std::optional { + if (auto it = user_dtypes.child_types.find(col_name); + it != std::end(user_dtypes.child_types)) + return it->second; + return std::nullopt; }}, options.get_dtypes()); + #ifdef NJP_DEBUG_PRINT auto debug_schema_print = [](auto ret) { std::cout << ", type id: " @@ -621,20 +649,39 @@ table_with_metadata device_parse_nested_json(device_span d_input, << ", with " << (ret.has_value() ? ret->child_types.size() : 0) << " children" << "\n"; }; - std::visit( - cudf::detail::visitor_overload{[column_index](std::vector const&) { - std::cout << "Column by index: #" << column_index; - }, - [col_name](std::map const&) { - std::cout << "Column by flat name: '" << col_name; - }, - [col_name](std::map const&) { - std::cout << "Column by nested name: #" << col_name; - }}, - options.get_dtypes()); + std::visit(cudf::detail::visitor_overload{ + [column_index](std::vector const&) { + std::cout << "Column by index: #" << column_index; + }, + [col_name](std::map const&) { + std::cout << "Column by flat name: '" << col_name; + }, + [col_name](std::map const&) { + std::cout << "Column by nested name: #" << col_name; + }, + [col_name](schema_element const&) { + std::cout << "Column by nested schema with column order: #" << col_name; + }}, + options.get_dtypes()); debug_schema_print(child_schema_element); #endif + auto const found_it = root_struct_col.child_columns.find(col_name); + if (options.is_enabled_prune_columns() and + found_it == std::end(root_struct_col.child_columns)) { + CUDF_EXPECTS(child_schema_element.has_value(), + "Column name not found in input schema map, but present in column order and " + "prune_columns is enabled"); + // inserts all null column + out_column_names.emplace_back(make_column_name_info(child_schema_element.value(), col_name)); + auto all_null_column = + make_all_nulls_column(child_schema_element.value(), root_col_size, stream, mr); + out_columns.emplace_back(std::move(all_null_column)); + column_index++; + continue; + } + auto& json_col = found_it->second; + if (!options.is_enabled_prune_columns() or child_schema_element.has_value()) { // Get this JSON column's cudf column and schema info, (modifies json_col) auto [cudf_col, col_name_info] = diff --git a/cpp/src/io/json/json_normalization.cu b/cpp/src/io/json/json_normalization.cu index 2d435dc8e1a..1b61be20202 100644 --- a/cpp/src/io/json/json_normalization.cu +++ b/cpp/src/io/json/json_normalization.cu @@ -16,6 +16,7 @@ #include "io/fst/lookup_tables.cuh" +#include #include #include #include @@ -24,7 +25,6 @@ #include #include -#include #include #include @@ -58,7 +58,7 @@ enum class dfa_symbol_group_id : uint32_t { DOUBLE_QUOTE_CHAR, ///< Quote character SG: " SINGLE_QUOTE_CHAR, ///< Quote character SG: ' ESCAPE_CHAR, ///< Escape character SG: '\' - NEWLINE_CHAR, ///< Newline character SG: '\n' + DELIM_CHAR, ///< Delimiter character SG OTHER_SYMBOLS, ///< SG implicitly matching all other characters NUM_SYMBOL_GROUPS ///< Total number of symbol groups }; @@ 
-72,13 +72,17 @@ constexpr auto TT_SEC = dfa_states::TT_SEC;
 constexpr auto TT_NUM_STATES     = static_cast<StateT>(dfa_states::TT_NUM_STATES);
 constexpr auto NUM_SYMBOL_GROUPS = static_cast<uint32_t>(dfa_symbol_group_id::NUM_SYMBOL_GROUPS);
 
-// The i-th string representing all the characters of a symbol group
-std::array<std::vector<SymbolT>, NUM_SYMBOL_GROUPS - 1> const qna_sgs{
-  {{'\"'}, {'\''}, {'\\'}, {'\n'}}};
+auto get_sgid_lut(SymbolT delim)
+{
+  // The i-th string representing all the characters of a symbol group
+  std::array<std::vector<SymbolT>, NUM_SYMBOL_GROUPS - 1> symbol_groups{
+    {{'\"'}, {'\''}, {'\\'}, {delim}}};
+  return symbol_groups;
+}
 
 // Transition table
 std::array<std::array<dfa_states, NUM_SYMBOL_GROUPS>, TT_NUM_STATES> const qna_state_tt{{
-  /* IN_STATE       "       '       \       \n     OTHER */
+  /* IN_STATE       "       '       \     <delim>  OTHER */
   /* TT_OOS */ {{TT_DQS, TT_SQS, TT_OOS, TT_OOS, TT_OOS}},
   /* TT_DQS */ {{TT_OOS, TT_DQS, TT_DEC, TT_OOS, TT_DQS}},
   /* TT_SQS */ {{TT_SQS, TT_OOS, TT_SEC, TT_OOS, TT_SQS}},
@@ -199,28 +203,26 @@ struct TransduceToNormalizedQuotes {
 
 namespace normalize_whitespace {
 
+// We do not need a symbol group for the delimiter character since whitespace normalization
+// now occurs after tokenization.
 enum class dfa_symbol_group_id : uint32_t {
   DOUBLE_QUOTE_CHAR,   ///< Quote character SG: "
   ESCAPE_CHAR,         ///< Escape character SG: '\\'
-  NEWLINE_CHAR,        ///< Newline character SG: '\n'
   WHITESPACE_SYMBOLS,  ///< Whitespace characters SG: '\t' or ' '
   OTHER_SYMBOLS,       ///< SG implicitly matching all other characters
   NUM_SYMBOL_GROUPS    ///< Total number of symbol groups
 };
 // Alias for readability of symbol group ids
 constexpr auto NUM_SYMBOL_GROUPS = static_cast<uint32_t>(dfa_symbol_group_id::NUM_SYMBOL_GROUPS);
-// The i-th string representing all the characters of a symbol group
-std::array<std::vector<SymbolT>, NUM_SYMBOL_GROUPS - 1> const wna_sgs{
-  {{'"'}, {'\\'}, {'\n'}, {' ', '\t'}}};
+
+std::array<std::vector<SymbolT>, NUM_SYMBOL_GROUPS - 1> const wna_sgs{{{'"'}, {'\\'}, {' ', '\t'}}};
 
 /**
  * -------- FST states ---------
  * -----------------------------
  * TT_OOS | Out-of-string state handling whitespace and non-whitespace chars outside double
- *        | quotes as well as any other character not enclosed by a string. Also handles
- *        | newline character present within a string
- * TT_DQS | Double-quoted string state handling all characters within double quotes except
- *        | newline character
+ *        | quotes as well as any other character not enclosed by a string.
+ * TT_DQS | Double-quoted string state handling all characters within double quotes
 * TT_DEC | State handling escaped characters inside double-quoted string. Note that this
 *        | state is necessary to process escaped double-quote characters. Without this
 *        | state, whitespaces following escaped double quotes inside strings may be removed.
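To make the table-driven normalization concrete, here is a small host-side walk of the quote-normalization DFA above (a sketch only: the DEC/SEC rows are assumed to fall back unconditionally to DQS/SQS, and the real implementation runs as a parallel FST, not a sequential loop):

#include <cstdio>
#include <string>

int main()
{
  enum state { OOS, DQS, SQS, DEC, SEC };  // mirrors the TT_* states
  // Columns:          "    '    \   <delim> OTHER   (mirrors qna_state_tt)
  int const tt[5][5] = {{DQS, SQS, OOS, OOS, OOS},   // OOS
                        {OOS, DQS, DEC, OOS, DQS},   // DQS
                        {SQS, OOS, SEC, OOS, SQS},   // SQS
                        {DQS, DQS, DQS, DQS, DQS},   // DEC (assumed)
                        {SQS, SQS, SQS, SQS, SQS}};  // SEC (assumed)
  char const delim = '\n';
  auto group = [&](char c) {
    if (c == '"') return 0;
    if (c == '\'') return 1;
    if (c == '\\') return 2;
    if (c == delim) return 3;
    return 4;  // OTHER
  };
  int s = OOS;
  for (char c : std::string{R"({'a': "b\"c"})"}) {
    s = tt[s][group(c)];
    std::printf("'%c' -> state %d\n", c, s);
  }
}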
@@ -235,10 +237,10 @@ constexpr auto TT_NUM_STATES = static_cast<StateT>(dfa_states::TT_NUM_STATES);
 
 // Transition table
 std::array<std::array<dfa_states, NUM_SYMBOL_GROUPS>, TT_NUM_STATES> const wna_state_tt{
-  {/* IN_STATE      "       \      \n    <SPC>  OTHER */
-   /* TT_OOS */ {{TT_DQS, TT_OOS, TT_OOS, TT_OOS, TT_OOS}},
-   /* TT_DQS */ {{TT_OOS, TT_DEC, TT_OOS, TT_DQS, TT_DQS}},
-   /* TT_DEC */ {{TT_DQS, TT_DQS, TT_DQS, TT_DQS, TT_DQS}}}};
+  {/* IN_STATE      "       \     <SPC>  OTHER */
+   /* TT_OOS */ {{TT_DQS, TT_OOS, TT_OOS, TT_OOS}},
+   /* TT_DQS */ {{TT_OOS, TT_DEC, TT_DQS, TT_DQS}},
+   /* TT_DEC */ {{TT_DQS, TT_DQS, TT_DQS, TT_DQS}}}};
 
 // The DFA's starting state
 constexpr StateT start_state = static_cast<StateT>(TT_OOS);
@@ -302,21 +304,22 @@ struct TransduceToNormalizedWS {
 
 namespace detail {
 
 void normalize_single_quotes(datasource::owning_buffer<rmm::device_buffer>& indata,
+                             char delimiter,
                              rmm::cuda_stream_view stream,
                              rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   static constexpr std::int32_t min_out = 0;
   static constexpr std::int32_t max_out = 2;
-  auto parser =
-    fst::detail::make_fst(fst::detail::make_symbol_group_lut(normalize_quotes::qna_sgs),
-                          fst::detail::make_transition_table(normalize_quotes::qna_state_tt),
-                          fst::detail::make_translation_functor<SymbolT, min_out, max_out>(
-                            normalize_quotes::TransduceToNormalizedQuotes{}),
-                          stream);
+  auto parser = fst::detail::make_fst(
+    fst::detail::make_symbol_group_lut(normalize_quotes::get_sgid_lut(delimiter)),
+    fst::detail::make_transition_table(normalize_quotes::qna_state_tt),
+    fst::detail::make_translation_functor<SymbolT, min_out, max_out>(
+      normalize_quotes::TransduceToNormalizedQuotes{}),
+    stream);
 
   rmm::device_buffer outbuf(indata.size() * 2, stream, mr);
-  rmm::device_scalar<SymbolOffsetT> outbuf_size(stream, mr);
+  cudf::detail::device_scalar<SymbolOffsetT> outbuf_size(stream, mr);
   parser.Transduce(reinterpret_cast<SymbolT*>(indata.data()),
                    static_cast<SymbolOffsetT>(indata.size()),
                    static_cast<SymbolT*>(outbuf.data()),
@@ -401,7 +404,7 @@ std::
     stream);
 
   rmm::device_uvector outbuf_indices(inbuf.size(), stream, mr);
-  rmm::device_scalar<SymbolOffsetT> outbuf_indices_size(stream, mr);
+  cudf::detail::device_scalar<SymbolOffsetT> outbuf_indices_size(stream, mr);
   parser.Transduce(inbuf.data(),
                    static_cast<SymbolOffsetT>(inbuf.size()),
                    thrust::make_discard_iterator(),
diff --git a/cpp/src/io/json/json_tree.cu b/cpp/src/io/json/json_tree.cu
index d949635c1cc..e2fe926ea19 100644
--- a/cpp/src/io/json/json_tree.cu
+++ b/cpp/src/io/json/json_tree.cu
@@ -264,16 +264,13 @@ tree_meta_t get_tree_representation(device_span<PdaTokenT const> tokens,
       error_count > 0) {
     auto const error_location =
       thrust::find(rmm::exec_policy(stream), tokens.begin(), tokens.end(), token_t::ErrorBegin);
-    SymbolOffsetT error_index;
-    CUDF_CUDA_TRY(
-      cudaMemcpyAsync(&error_index,
-                      token_indices.data() + thrust::distance(tokens.begin(), error_location),
-                      sizeof(SymbolOffsetT),
-                      cudaMemcpyDefault,
-                      stream.value()));
-    stream.synchronize();
+    auto error_index = cudf::detail::make_host_vector_sync(
+      device_span<SymbolOffsetT const>{
+        token_indices.data() + thrust::distance(tokens.begin(), error_location), 1},
+      stream);
     CUDF_FAIL("JSON Parser encountered an invalid format at location " +
-              std::to_string(error_index));
+              std::to_string(error_index[0]));
   }
 
   auto const num_tokens = tokens.size();
diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp
index 3d9a51833e0..4989fff4b30 100644
--- a/cpp/src/io/json/nested_json.hpp
+++ b/cpp/src/io/json/nested_json.hpp
@@ -19,10 +19,7 @@
 #include
 #include
 #include
-#include
-#include
 #include
-#include
 #include
 #include
@@ -405,21 +402,6 @@ void make_device_json_column(device_span<SymbolT const> input,
                              rmm::cuda_stream_view stream,
                              rmm::device_async_resource_ref mr);
 
-namespace experimental {
-/**
- *
@copydoc cudf::io::json::detail::make_device_json_column
- */
-void make_device_json_column(device_span<SymbolT const> input,
-                             tree_meta_t const& tree,
-                             device_span<NodeIndexT const> col_ids,
-                             device_span<size_type const> row_offsets,
-                             device_json_column& root,
-                             bool is_array_of_arrays,
-                             cudf::io::json_reader_options const& options,
-                             rmm::cuda_stream_view stream,
-                             rmm::device_async_resource_ref mr);
-}  // namespace experimental
-
 /**
  * @brief Retrieves the parse_options to be used for type inference and type casting
  *
@@ -447,6 +429,29 @@ table_with_metadata device_parse_nested_json(device_span<SymbolT const> input,
                                              rmm::cuda_stream_view stream,
                                              rmm::device_async_resource_ref mr);
 
+/**
+ * @brief Create an all-null column of a given nested schema
+ *
+ * @param schema The schema of the column to create
+ * @param num_rows The number of rows in the column
+ * @param stream The CUDA stream to which kernels are dispatched
+ * @param mr Device memory resource used to allocate the returned column
+ * @return The all-null column
+ */
+std::unique_ptr<column> make_all_nulls_column(schema_element const& schema,
+                                              size_type num_rows,
+                                              rmm::cuda_stream_view stream,
+                                              rmm::device_async_resource_ref mr);
+
+/**
+ * @brief Create metadata for a column of a given schema
+ *
+ * @param schema The schema of the column
+ * @param col_name The name of the column
+ * @return Column metadata for the given schema
+ */
+column_name_info make_column_name_info(schema_element const& schema, std::string const& col_name);
+
 /**
  * @brief Get the path data type of a column by path if present in input schema
 *
diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu
index 76816071d8c..f1c2826c62a 100644
--- a/cpp/src/io/json/nested_json_gpu.cu
+++ b/cpp/src/io/json/nested_json_gpu.cu
@@ -21,6 +21,7 @@
 #include "nested_json.hpp"
 
 #include
+#include
 #include
 #include
 #include
@@ -34,7 +35,6 @@
 #include
 #include
-#include
 #include
 #include
@@ -83,8 +83,7 @@ struct tree_node {
 void check_input_size(std::size_t input_size)
 {
   // Transduce() writes symbol offsets that may be as large as input_size - 1
-  CUDF_EXPECTS(input_size == 0 ||
-                 (input_size - 1) <= std::numeric_limits<SymbolOffsetT>::max(),
+  CUDF_EXPECTS(input_size == 0 || (input_size - 1) <= std::numeric_limits<SymbolOffsetT>::max(),
                "Given JSON input is too large");
 }
 }  // namespace
@@ -1447,11 +1446,7 @@ void get_stack_context(device_span<SymbolT const> json_in,
   constexpr StackSymbolT read_symbol = 'x';
 
   // Number of stack operations in the input (i.e., number of '{', '}', '[', ']' outside of quotes)
-  rmm::device_scalar<SymbolOffsetT> d_num_stack_ops(stream);
-
-  // Sequence of stack symbols and their position in the original input (sparse representation)
-  rmm::device_uvector<StackSymbolT> stack_ops{json_in.size(), stream};
-  rmm::device_uvector<SymbolOffsetT> stack_op_indices{json_in.size(), stream};
+  cudf::detail::device_scalar<SymbolOffsetT> d_num_stack_ops(stream);
 
   // Prepare finite-state transducer that only selects '{', '}', '[', ']' outside of quotes
   constexpr auto max_translation_table_size =
@@ -1469,11 +1464,26 @@ void get_stack_context(device_span<SymbolT const> json_in,
 
   // "Search" for relevant occurrence of brackets and braces that indicate the beginning/end
   // of structs/lists
+  // Run FST to estimate the sizes of translated buffers
+  json_to_stack_ops_fst.Transduce(json_in.begin(),
+                                  static_cast<SymbolOffsetT>(json_in.size()),
+                                  thrust::make_discard_iterator(),
+                                  thrust::make_discard_iterator(),
+                                  d_num_stack_ops.data(),
+                                  to_stack_op::start_state,
+                                  stream);
+
+  auto stack_ops_bufsize = d_num_stack_ops.value(stream);
+  // Sequence of stack symbols and their position in the original input (sparse representation)
+  rmm::device_uvector<StackSymbolT>
stack_ops{stack_ops_bufsize, stream}; + rmm::device_uvector stack_op_indices{stack_ops_bufsize, stream}; + + // Run bracket-brace FST to retrieve starting positions of structs and lists json_to_stack_ops_fst.Transduce(json_in.begin(), static_cast(json_in.size()), stack_ops.data(), stack_op_indices.data(), - d_num_stack_ops.data(), + thrust::make_discard_iterator(), to_stack_op::start_state, stream); @@ -1509,6 +1519,7 @@ std::pair, rmm::device_uvector> pr device_span token_indices, rmm::cuda_stream_view stream) { + CUDF_FUNC_RANGE(); // Instantiate FST for post-processing the token stream to remove all tokens that belong to an // invalid JSON line token_filter::UnwrapTokenFromSymbolOp sgid_op{}; @@ -1520,7 +1531,7 @@ std::pair, rmm::device_uvector> pr stream); auto const mr = cudf::get_current_device_resource_ref(); - rmm::device_scalar d_num_selected_tokens(stream, mr); + cudf::detail::device_scalar d_num_selected_tokens(stream, mr); rmm::device_uvector filtered_tokens_out{tokens.size(), stream, mr}; rmm::device_uvector filtered_token_indices_out{tokens.size(), stream, mr}; @@ -1639,26 +1650,33 @@ std::pair, rmm::device_uvector> ge std::size_t constexpr max_tokens_per_struct = 6; auto const max_token_out_count = cudf::util::div_rounding_up_safe(json_in.size(), min_chars_per_struct) * max_tokens_per_struct; - rmm::device_scalar num_written_tokens{stream}; + cudf::detail::device_scalar num_written_tokens{stream}; // In case we're recovering on invalid JSON lines, post-processing the token stream requires to // see a JSON-line delimiter as the very first item SymbolOffsetT const delimiter_offset = (format == tokenizer_pda::json_format_cfg_t::JSON_LINES_RECOVER ? 1 : 0); - rmm::device_uvector tokens{max_token_out_count + delimiter_offset, stream, mr}; - rmm::device_uvector tokens_indices{ - max_token_out_count + delimiter_offset, stream, mr}; + // Run FST to estimate the size of output buffers json_to_tokens_fst.Transduce(zip_in, static_cast(json_in.size()), - tokens.data() + delimiter_offset, - tokens_indices.data() + delimiter_offset, + thrust::make_discard_iterator(), + thrust::make_discard_iterator(), num_written_tokens.data(), tokenizer_pda::start_state, stream); auto const num_total_tokens = num_written_tokens.value(stream) + delimiter_offset; - tokens.resize(num_total_tokens, stream); - tokens_indices.resize(num_total_tokens, stream); + rmm::device_uvector tokens{num_total_tokens, stream, mr}; + rmm::device_uvector tokens_indices{num_total_tokens, stream, mr}; + + // Run FST to translate the input JSON string into tokens and indices at which they occur + json_to_tokens_fst.Transduce(zip_in, + static_cast(json_in.size()), + tokens.data() + delimiter_offset, + tokens_indices.data() + delimiter_offset, + thrust::make_discard_iterator(), + tokenizer_pda::start_state, + stream); if (delimiter_offset == 1) { tokens.set_element(0, token_t::LineEnd, stream); @@ -2180,9 +2198,9 @@ std::pair, std::vector> json_column_to // - String columns will be returned as nullable, iff there's at least one null entry if (col->null_count() == 0) { col->set_null_mask(rmm::device_buffer{0, stream, mr}, 0); } - // For string columns return ["offsets", "char"] schema + // For string columns return ["offsets"] schema if (target_type.id() == type_id::STRING) { - return {std::move(col), std::vector{{"offsets"}, {"chars"}}}; + return {std::move(col), std::vector{{"offsets"}}}; } // Non-string leaf-columns (e.g., numeric) do not have child columns in the schema else { diff --git a/cpp/src/io/json/parser_features.cpp 
b/cpp/src/io/json/parser_features.cpp index 4caa5cd9e24..401a6e992de 100644 --- a/cpp/src/io/json/parser_features.cpp +++ b/cpp/src/io/json/parser_features.cpp @@ -16,14 +16,201 @@ #include "nested_json.hpp" +#include +#include +#include #include +#include +#include +#include +#include +#include #include #include #include +namespace cudf::io { +namespace { +bool validate_column_order(schema_element const& types) +{ + // For struct types, check if column_order size matches child_types size and all elements in + // column_order are in child_types, in child_types, call this function recursively. + // For list types, check if child_types size is 1 and call this function recursively. + if (types.type.id() == type_id::STRUCT) { + if (types.column_order.has_value()) { + if (types.column_order.value().size() != types.child_types.size()) { return false; } + for (auto const& column_name : types.column_order.value()) { + auto it = types.child_types.find(column_name); + if (it == types.child_types.end()) { return false; } + if (it->second.type.id() == type_id::STRUCT or it->second.type.id() == type_id::LIST) { + if (!validate_column_order(it->second)) { return false; } + } + } + } + } else if (types.type.id() == type_id::LIST) { + if (types.child_types.size() != 1) { return false; } + auto it = types.child_types.begin(); + if (it->second.type.id() == type_id::STRUCT or it->second.type.id() == type_id::LIST) { + if (!validate_column_order(it->second)) { return false; } + } + } + return true; +} +} // namespace + +void json_reader_options::set_dtypes(schema_element types) +{ + CUDF_EXPECTS( + validate_column_order(types), "Column order does not match child types", std::invalid_argument); + _dtypes = std::move(types); +} +} // namespace cudf::io + namespace cudf::io::json::detail { +/// Created an empty column of the specified schema +struct empty_column_functor { + rmm::cuda_stream_view stream; + rmm::device_async_resource_ref mr; + + template ())> + std::unique_ptr operator()(schema_element const& schema) const + { + return make_empty_column(schema.type); + } + + template )> + std::unique_ptr operator()(schema_element const& schema) const + { + CUDF_EXPECTS(schema.child_types.size() == 1, "List column should have only one child"); + auto const& child_name = schema.child_types.begin()->first; + std::unique_ptr child = cudf::type_dispatcher( + schema.child_types.at(child_name).type, *this, schema.child_types.at(child_name)); + auto offsets = make_empty_column(data_type(type_to_id())); + return make_lists_column(0, std::move(offsets), std::move(child), 0, {}, stream, mr); + } + + template )> + std::unique_ptr operator()(schema_element const& schema) const + { + std::vector> child_columns; + for (auto const& child_name : schema.column_order.value_or(std::vector{})) { + child_columns.push_back(cudf::type_dispatcher( + schema.child_types.at(child_name).type, *this, schema.child_types.at(child_name))); + } + return make_structs_column(0, std::move(child_columns), 0, {}, stream, mr); + } +}; + +/// Created all null column of the specified schema +struct allnull_column_functor { + rmm::cuda_stream_view stream; + rmm::device_async_resource_ref mr; + + private: + auto make_zeroed_offsets(size_type size) const + { + auto offsets_buff = + cudf::detail::make_zeroed_device_uvector_async(size + 1, stream, mr); + return std::make_unique(std::move(offsets_buff), rmm::device_buffer{}, 0); + } + + public: + template ())> + std::unique_ptr operator()(schema_element const& schema, size_type size) const + { + return 
make_fixed_width_column(schema.type, size, mask_state::ALL_NULL, stream, mr);
+  }
+
+  template <typename T, CUDF_ENABLE_IF(cudf::is_dictionary<T>())>
+  std::unique_ptr<column> operator()(schema_element const& schema, size_type size) const
+  {
+    CUDF_EXPECTS(schema.child_types.size() == 1, "Dictionary column should have only one child");
+    auto const& child_name        = schema.child_types.begin()->first;
+    std::unique_ptr<column> child = cudf::type_dispatcher(schema.child_types.at(child_name).type,
+                                                          empty_column_functor{stream, mr},
+                                                          schema.child_types.at(child_name));
+    auto indices   = make_zeroed_offsets(size - 1);
+    auto null_mask = cudf::detail::create_null_mask(size, mask_state::ALL_NULL, stream, mr);
+    return make_dictionary_column(
+      std::move(child), std::move(indices), std::move(null_mask), size, stream, mr);
+  }
+
+  template <typename T, CUDF_ENABLE_IF(std::is_same_v<T, cudf::string_view>)>
+  std::unique_ptr<column> operator()(schema_element const& schema, size_type size) const
+  {
+    auto offsets   = make_zeroed_offsets(size);
+    auto null_mask = cudf::detail::create_null_mask(size, mask_state::ALL_NULL, stream, mr);
+    return make_strings_column(
+      size, std::move(offsets), rmm::device_buffer{}, size, std::move(null_mask));
+  }
+  template <typename T, CUDF_ENABLE_IF(std::is_same_v<T, cudf::list_view>)>
+  std::unique_ptr<column> operator()(schema_element const& schema, size_type size) const
+  {
+    CUDF_EXPECTS(schema.child_types.size() == 1, "List column should have only one child");
+    auto const& child_name        = schema.child_types.begin()->first;
+    std::unique_ptr<column> child = cudf::type_dispatcher(schema.child_types.at(child_name).type,
+                                                          empty_column_functor{stream, mr},
+                                                          schema.child_types.at(child_name));
+    auto offsets   = make_zeroed_offsets(size);
+    auto null_mask = cudf::detail::create_null_mask(size, mask_state::ALL_NULL, stream, mr);
+    return make_lists_column(
+      size, std::move(offsets), std::move(child), size, std::move(null_mask), stream, mr);
+  }
+
+  template <typename T, CUDF_ENABLE_IF(std::is_same_v<T, cudf::struct_view>)>
+  std::unique_ptr<column> operator()(schema_element const& schema, size_type size) const
+  {
+    std::vector<std::unique_ptr<column>> child_columns;
+    for (auto const& child_name : schema.column_order.value_or(std::vector<std::string>{})) {
+      child_columns.push_back(cudf::type_dispatcher(
+        schema.child_types.at(child_name).type, *this, schema.child_types.at(child_name), size));
+    }
+    auto null_mask = cudf::detail::create_null_mask(size, mask_state::ALL_NULL, stream, mr);
+    return make_structs_column(
+      size, std::move(child_columns), size, std::move(null_mask), stream, mr);
+  }
+};
+
+std::unique_ptr<column> make_all_nulls_column(schema_element const& schema,
+                                              size_type num_rows,
+                                              rmm::cuda_stream_view stream,
+                                              rmm::device_async_resource_ref mr)
+{
+  return cudf::type_dispatcher(schema.type, allnull_column_functor{stream, mr}, schema, num_rows);
+}
+
+column_name_info make_column_name_info(schema_element const& schema, std::string const& col_name)
+{
+  column_name_info info;
+  info.name = col_name;
+  switch (schema.type.id()) {
+    case type_id::STRUCT:
+      for (auto const& child_name : schema.column_order.value_or(std::vector<std::string>{})) {
+        info.children.push_back(
+          make_column_name_info(schema.child_types.at(child_name), child_name));
+      }
+      break;
+    case type_id::LIST:
+      info.children.emplace_back("offsets");
+      for (auto const& [child_name, child_schema] : schema.child_types) {
+        info.children.push_back(make_column_name_info(child_schema, child_name));
+      }
+      break;
+    case type_id::DICTIONARY32:
+      info.children.emplace_back("indices");
+      for (auto const& [child_name, child_schema] : schema.child_types) {
+        info.children.push_back(make_column_name_info(child_schema, child_name));
+      }
+      break;
+    case
type_id::STRING: info.children.emplace_back("offsets"); break; + default: break; + } + return info; +} + std::optional child_schema_element(std::string const& col_name, cudf::io::json_reader_options const& options) { @@ -46,6 +233,11 @@ std::optional child_schema_element(std::string const& col_name, return (user_dtypes.find(col_name) != std::end(user_dtypes)) ? user_dtypes.find(col_name)->second : std::optional{}; + }, + [col_name](schema_element const& user_dtypes) -> std::optional { + return (user_dtypes.child_types.find(col_name) != std::end(user_dtypes.child_types)) + ? user_dtypes.child_types.find(col_name)->second + : std::optional{}; }}, options.get_dtypes()); } diff --git a/cpp/src/io/json/process_tokens.cu b/cpp/src/io/json/process_tokens.cu index 83c7b663980..d41d137a2c9 100644 --- a/cpp/src/io/json/process_tokens.cu +++ b/cpp/src/io/json/process_tokens.cu @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -87,13 +88,25 @@ void validate_token_stream(device_span d_input, { CUDF_FUNC_RANGE(); if (!options.is_strict_validation()) { return; } + + rmm::device_uvector d_invalid = cudf::detail::make_zeroed_device_uvector_async( + tokens.size(), stream, cudf::get_current_device_resource_ref()); + using token_t = cudf::io::json::token_t; - cudf::detail::optional_trie trie_na = - cudf::detail::create_serialized_trie(options.get_na_values(), stream); - auto trie_na_view = cudf::detail::make_trie_view(trie_na); + auto literals = options.get_na_values(); + literals.emplace_back("null"); // added these too to single trie + literals.emplace_back("true"); + literals.emplace_back("false"); + + cudf::detail::optional_trie trie_literals = + cudf::detail::create_serialized_trie(literals, stream); + cudf::detail::optional_trie trie_nonnumeric = cudf::detail::create_serialized_trie( + {"NaN", "Infinity", "+INF", "+Infinity", "-INF", "-Infinity"}, stream); + auto validate_values = cuda::proclaim_return_type( [data = d_input.data(), - trie_na = trie_na_view, + trie_literals = cudf::detail::make_trie_view(trie_literals), + trie_nonnumeric = cudf::detail::make_trie_view(trie_nonnumeric), allow_numeric_leading_zeros = options.is_allowed_numeric_leading_zeros(), allow_nonnumeric = options.is_allowed_nonnumeric_numbers()] __device__(SymbolOffsetT start, @@ -101,24 +114,15 @@ void validate_token_stream(device_span d_input, // This validates an unquoted value. 
A value must match https://www.json.org/json-en.html
+        // but the leading and trailing whitespace should already have been removed, and is not
+        // a string
-        auto c = data[start];
-        auto is_null_literal = serialized_trie_contains(trie_na, {data + start, end - start});
-        if (is_null_literal) {
-          return true;
-        } else if ('n' == c) {
-          return substr_eq(data, start, end, 4, "null");
-        } else if ('t' == c) {
-          return substr_eq(data, start, end, 4, "true");
-        } else if ('f' == c) {
-          return substr_eq(data, start, end, 5, "false");
-        } else if (allow_nonnumeric && c == 'N') {
-          return substr_eq(data, start, end, 3, "NaN");
-        } else if (allow_nonnumeric && c == 'I') {
-          return substr_eq(data, start, end, 8, "Infinity");
-        } else if (allow_nonnumeric && c == '+') {
-          return substr_eq(data, start, end, 4, "+INF") ||
-                 substr_eq(data, start, end, 9, "+Infinity");
-        } else if ('-' == c || c <= '9' && 'c' >= '0') {
+        auto const is_literal = serialized_trie_contains(trie_literals, {data + start, end - start});
+        if (is_literal) { return true; }
+        if (allow_nonnumeric) {
+          auto const is_nonnumeric =
+            serialized_trie_contains(trie_nonnumeric, {data + start, end - start});
+          if (is_nonnumeric) { return true; }
+        }
+        auto c = data[start];
+        if ('-' == c || (c >= '0' && c <= '9')) {
           // number
           auto num_state = number_state::START;
           for (auto at = start; at < end; at++) {
@@ -140,9 +144,6 @@ void validate_token_stream(device_span<char const> d_input,
                 num_state = number_state::LEADING_ZERO;
               } else if (c >= '1' && c <= '9') {
                 num_state = number_state::WHOLE;
-              } else if (allow_nonnumeric && 'I' == c) {
-                return substr_eq(data, start, end, 4, "-INF") ||
-                       substr_eq(data, start, end, 9, "-Infinity");
               } else {
                 return false;
               }
@@ -273,33 +274,44 @@ void validate_token_stream(device_span<char const> d_input,
   auto num_tokens = tokens.size();
   auto count_it   = thrust::make_counting_iterator(0);
-  auto predicate  = [tokens        = tokens.begin(),
-                    token_indices = token_indices.begin(),
-                    validate_values,
-                    validate_strings] __device__(auto i) -> bool {
+  auto predicate  = cuda::proclaim_return_type<bool>([tokens        = tokens.begin(),
+                                                      token_indices = token_indices.begin(),
+                                                      validate_values,
+                                                      validate_strings] __device__(auto i) -> bool {
     if (tokens[i] == token_t::ValueEnd) {
       return !validate_values(token_indices[i - 1], token_indices[i]);
     } else if (tokens[i] == token_t::FieldNameEnd || tokens[i] == token_t::StringEnd) {
       return !validate_strings(token_indices[i - 1], token_indices[i]);
    }
    return false;
-  };
+  });
+
+  auto conditional_invalidout_it =
+    cudf::detail::make_tabulate_output_iterator(cuda::proclaim_return_type<void>(
+      [d_invalid = d_invalid.begin()] __device__(size_type i, bool x) -> void {
+        if (x) { d_invalid[i] = true; }
+      }));
+  thrust::transform(rmm::exec_policy_nosync(stream),
+                    count_it,
+                    count_it + num_tokens,
+                    conditional_invalidout_it,
+                    predicate);
 
   using scan_type        = write_if::scan_type;
   auto conditional_write = write_if{tokens.begin(), num_tokens};
   auto conditional_output_it = cudf::detail::make_tabulate_output_iterator(conditional_write);
-  auto transform_op = cuda::proclaim_return_type<scan_type>(
-    [predicate, tokens = tokens.begin()] __device__(auto i) -> scan_type {
-      if (predicate(i)) return {token_t::ErrorBegin, tokens[i] == token_t::LineEnd};
-      return {static_cast<token_t>(tokens[i]), tokens[i] == token_t::LineEnd};
-    });
-  auto binary_op = cuda::proclaim_return_type<scan_type>(
+  auto binary_op = cuda::proclaim_return_type<scan_type>(
     [] __device__(scan_type prev, scan_type curr) -> scan_type {
      auto op_result = (prev.first == token_t::ErrorBegin ?
prev.first : curr.first); - return scan_type((curr.second ? curr.first : op_result), prev.second | curr.second); + return {(curr.second ? curr.first : op_result), prev.second | curr.second}; + }); + auto transform_op = cuda::proclaim_return_type( + [d_invalid = d_invalid.begin(), tokens = tokens.begin()] __device__(auto i) -> scan_type { + if (d_invalid[i]) return {token_t::ErrorBegin, tokens[i] == token_t::LineEnd}; + return {static_cast(tokens[i]), tokens[i] == token_t::LineEnd}; }); - thrust::transform_inclusive_scan(rmm::exec_policy(stream), + thrust::transform_inclusive_scan(rmm::exec_policy_nosync(stream), count_it, count_it + num_tokens, conditional_output_it, diff --git a/cpp/src/io/json/read_json.cu b/cpp/src/io/json/read_json.cu index 99a5b17bce8..82d8152ca1c 100644 --- a/cpp/src/io/json/read_json.cu +++ b/cpp/src/io/json/read_json.cu @@ -20,9 +20,11 @@ #include #include +#include #include #include #include +#include #include #include #include @@ -41,6 +43,70 @@ namespace cudf::io::json::detail { namespace { +class compressed_host_buffer_source final : public datasource { + public: + explicit compressed_host_buffer_source(std::unique_ptr const& src, + compression_type comptype) + : _comptype{comptype}, _dbuf_ptr{src->host_read(0, src->size())} + { + auto ch_buffer = host_span(reinterpret_cast(_dbuf_ptr->data()), + _dbuf_ptr->size()); + if (comptype == compression_type::GZIP || comptype == compression_type::ZIP || + comptype == compression_type::SNAPPY) { + _decompressed_ch_buffer_size = cudf::io::detail::get_uncompressed_size(_comptype, ch_buffer); + } else { + _decompressed_buffer = cudf::io::detail::decompress(_comptype, ch_buffer); + _decompressed_ch_buffer_size = _decompressed_buffer.size(); + } + } + + size_t host_read(size_t offset, size_t size, uint8_t* dst) override + { + auto ch_buffer = host_span(reinterpret_cast(_dbuf_ptr->data()), + _dbuf_ptr->size()); + if (_decompressed_buffer.empty()) { + auto decompressed_hbuf = cudf::io::detail::decompress(_comptype, ch_buffer); + auto const count = std::min(size, decompressed_hbuf.size() - offset); + bool partial_read = offset + count < decompressed_hbuf.size(); + if (!partial_read) { + std::memcpy(dst, decompressed_hbuf.data() + offset, count); + return count; + } + _decompressed_buffer = std::move(decompressed_hbuf); + } + auto const count = std::min(size, _decompressed_buffer.size() - offset); + std::memcpy(dst, _decompressed_buffer.data() + offset, count); + return count; + } + + std::unique_ptr host_read(size_t offset, size_t size) override + { + auto ch_buffer = host_span(reinterpret_cast(_dbuf_ptr->data()), + _dbuf_ptr->size()); + if (_decompressed_buffer.empty()) { + auto decompressed_hbuf = cudf::io::detail::decompress(_comptype, ch_buffer); + auto const count = std::min(size, decompressed_hbuf.size() - offset); + bool partial_read = offset + count < decompressed_hbuf.size(); + if (!partial_read) + return std::make_unique>>( + std::move(decompressed_hbuf), decompressed_hbuf.data() + offset, count); + _decompressed_buffer = std::move(decompressed_hbuf); + } + auto const count = std::min(size, _decompressed_buffer.size() - offset); + return std::make_unique(_decompressed_buffer.data() + offset, count); + } + + [[nodiscard]] bool supports_device_read() const override { return false; } + + [[nodiscard]] size_t size() const override { return _decompressed_ch_buffer_size; } + + private: + std::unique_ptr _dbuf_ptr; + compression_type _comptype; + size_t _decompressed_ch_buffer_size; + std::vector _decompressed_buffer; +}; + 
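Because the wrapper above reports size() as the uncompressed size and serves host_read from the decompressed buffer, all downstream byte-range math and batching operate on uncompressed offsets. A sketch of exercising this path through the public reader (the file name is a placeholder):

#include <cudf/io/json.hpp>

cudf::io::table_with_metadata read_compressed_jsonl()
{
  // "data.jsonl.gz" is hypothetical; compression can also be left as AUTO to
  // infer the codec from the file extension.
  auto opts = cudf::io::json_reader_options::builder(cudf::io::source_info{"data.jsonl.gz"})
                .lines(true)
                .compression(cudf::io::compression_type::GZIP)
                .build();
  return cudf::io::read_json(opts);
}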
// Return total size of sources enclosing the passed range std::size_t sources_size(host_span> const sources, std::size_t range_offset, @@ -125,47 +191,41 @@ datasource::owning_buffer get_record_range_raw_input( { CUDF_FUNC_RANGE(); - std::size_t const total_source_size = sources_size(sources, 0, 0); - auto constexpr num_delimiter_chars = 1; - auto const num_extra_delimiters = num_delimiter_chars * (sources.size() - 1); - compression_type const reader_compression = reader_opts.get_compression(); - std::size_t const chunk_offset = reader_opts.get_byte_range_offset(); - std::size_t chunk_size = reader_opts.get_byte_range_size(); + std::size_t const total_source_size = sources_size(sources, 0, 0); + auto constexpr num_delimiter_chars = 1; + auto const delimiter = reader_opts.get_delimiter(); + auto const num_extra_delimiters = num_delimiter_chars * sources.size(); + std::size_t const chunk_offset = reader_opts.get_byte_range_offset(); + std::size_t chunk_size = reader_opts.get_byte_range_size(); CUDF_EXPECTS(total_source_size ? chunk_offset < total_source_size : !chunk_offset, "Invalid offsetting", std::invalid_argument); - auto should_load_all_sources = !chunk_size || chunk_size >= total_source_size - chunk_offset; - chunk_size = should_load_all_sources ? total_source_size - chunk_offset : chunk_size; + auto should_load_till_last_source = !chunk_size || chunk_size >= total_source_size - chunk_offset; + chunk_size = should_load_till_last_source ? total_source_size - chunk_offset : chunk_size; - int num_subchunks_prealloced = should_load_all_sources ? 0 : max_subchunks_prealloced; + int num_subchunks_prealloced = should_load_till_last_source ? 0 : max_subchunks_prealloced; std::size_t const size_per_subchunk = estimate_size_per_subchunk(chunk_size); - // The allocation for single source compressed input is estimated by assuming a ~4:1 - // compression ratio. For uncompressed inputs, we can getter a better estimate using the idea - // of subchunks. - auto constexpr header_size = 4096; std::size_t buffer_size = - reader_compression != compression_type::NONE - ? total_source_size * estimated_compression_ratio + header_size - : std::min(total_source_size, chunk_size + num_subchunks_prealloced * size_per_subchunk) + - num_extra_delimiters; + std::min(total_source_size, chunk_size + num_subchunks_prealloced * size_per_subchunk) + + num_extra_delimiters; rmm::device_buffer buffer(buffer_size, stream); device_span bufspan(reinterpret_cast(buffer.data()), buffer.size()); // Offset within buffer indicating first read position std::int64_t buffer_offset = 0; auto readbufspan = - ingest_raw_input(bufspan, sources, reader_compression, chunk_offset, chunk_size, stream); + ingest_raw_input(bufspan, sources, chunk_offset, chunk_size, delimiter, stream); auto const shift_for_nonzero_offset = std::min(chunk_offset, 1); auto const first_delim_pos = - chunk_offset == 0 ? 0 : find_first_delimiter(readbufspan, '\n', stream); + chunk_offset == 0 ? 
0 : find_first_delimiter(readbufspan, delimiter, stream);
   if (first_delim_pos == -1) {
     // return empty owning datasource buffer
     auto empty_buf = rmm::device_buffer(0, stream);
     return datasource::owning_buffer<rmm::device_buffer>(std::move(empty_buf));
-  } else if (!should_load_all_sources) {
+  } else if (!should_load_till_last_source) {
     // Find next delimiter
     std::int64_t next_delim_pos     = -1;
     std::size_t next_subchunk_start = chunk_offset + chunk_size;
@@ -177,27 +237,25 @@ datasource::owning_buffer<rmm::device_buffer> get_record_range_raw_input(
         buffer_offset += readbufspan.size();
         readbufspan = ingest_raw_input(bufspan.last(buffer_size - buffer_offset),
                                        sources,
-                                       reader_compression,
                                        next_subchunk_start,
                                        size_per_subchunk,
+                                       delimiter,
                                        stream);
-        next_delim_pos = find_first_delimiter(readbufspan, '\n', stream) + buffer_offset;
+        next_delim_pos = find_first_delimiter(readbufspan, delimiter, stream) + buffer_offset;
         next_subchunk_start += size_per_subchunk;
       }
     if (next_delim_pos < buffer_offset) {
       if (next_subchunk_start >= total_source_size) {
         // If we have reached the end of source list but the source does not terminate with a
-        // newline character
+        // delimiter character
         next_delim_pos = buffer_offset + readbufspan.size();
       } else {
         // Our buffer_size estimate is insufficient to read until the end of the line! We need to
         // allocate more memory and try again!
         num_subchunks_prealloced *= 2;
-        buffer_size = reader_compression != compression_type::NONE
-                        ? 2 * buffer_size
-                        : std::min(total_source_size,
-                                   buffer_size + num_subchunks_prealloced * size_per_subchunk) +
-                            num_extra_delimiters;
+        buffer_size = std::min(total_source_size,
+                               buffer_size + num_subchunks_prealloced * size_per_subchunk) +
+                      num_extra_delimiters;
         buffer.resize(buffer_size, stream);
         bufspan = device_span<char>(reinterpret_cast<char*>(buffer.data()), buffer.size());
       }
@@ -209,10 +267,26 @@ datasource::owning_buffer<rmm::device_buffer> get_record_range_raw_input(
       reinterpret_cast<uint8_t*>(buffer.data()) + first_delim_pos + shift_for_nonzero_offset,
       next_delim_pos - first_delim_pos - shift_for_nonzero_offset);
   }
+
+  // Add a delimiter to the end of the buffer (possibly appending an empty line to the input) iff
+  // we are reading till the end of the last source, i.e., should_load_till_last_source is true.
+  // Note that the table generated from the JSONL input remains unchanged, since empty lines are
+  // ignored by the parser.
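Before the delimiter append below, it helps to restate the byte-range contract the delimiter searches above implement: a chunk that does not start at offset 0 begins just past the first delimiter at or after its offset, and every chunk reads on until the record in flight is finished, so each record is owned by exactly one chunk. A simplified host-side analogue (hypothetical helper over a single in-memory buffer):

#include <cstddef>
#include <string>

// Which records a byte-range chunk [offset, offset + size) owns.
std::string chunk_records(std::string const& buf, std::size_t offset, std::size_t size, char delim)
{
  std::size_t begin = 0;
  if (offset != 0) {
    auto const first_delim = buf.find(delim, offset);
    if (first_delim == std::string::npos) return {};  // no record starts in this chunk
    begin = first_delim + 1;
  }
  auto end = buf.find(delim, offset + size);  // finish the record in flight
  if (end == std::string::npos) end = buf.size();
  if (begin >= end) return {};
  return buf.substr(begin, end - begin);
}
// chunk_records("a\nbb\nccc\n", 0, 3, '\n') == "a\nbb"
// chunk_records("a\nbb\nccc\n", 3, 3, '\n') == "ccc"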
+ size_t num_chars = readbufspan.size() - first_delim_pos - shift_for_nonzero_offset; + if (num_chars) { + auto last_char = delimiter; + cudf::detail::cuda_memcpy_async( + device_span(reinterpret_cast(buffer.data()), buffer.size()) + .subspan(readbufspan.size(), 1), + host_span(&last_char, 1, false), + stream); + num_chars++; + } + return datasource::owning_buffer( std::move(buffer), reinterpret_cast(buffer.data()) + first_delim_pos + shift_for_nonzero_offset, - readbufspan.size() - first_delim_pos - shift_for_nonzero_offset); + num_chars); } // Helper function to read the current batch using byte range offsets and size @@ -229,7 +303,8 @@ table_with_metadata read_batch(host_span> sources, // If input JSON buffer has single quotes and option to normalize single quotes is enabled, // invoke pre-processing FST if (reader_opts.is_enabled_normalize_single_quotes()) { - normalize_single_quotes(bufview, stream, cudf::get_current_device_resource_ref()); + normalize_single_quotes( + bufview, reader_opts.get_delimiter(), stream, cudf::get_current_device_resource_ref()); } auto buffer = @@ -238,111 +313,11 @@ table_with_metadata read_batch(host_span> sources, return device_parse_nested_json(buffer, reader_opts, stream, mr); } -} // anonymous namespace - -device_span ingest_raw_input(device_span buffer, - host_span> sources, - compression_type compression, - std::size_t range_offset, - std::size_t range_size, - rmm::cuda_stream_view stream) -{ - CUDF_FUNC_RANGE(); - // We append a line delimiter between two files to make sure the last line of file i and the first - // line of file i+1 don't end up on the same JSON line, if file i does not already end with a line - // delimiter. - auto constexpr num_delimiter_chars = 1; - - if (compression == compression_type::NONE) { - auto delimiter_map = cudf::detail::make_empty_host_vector(sources.size(), stream); - std::vector prefsum_source_sizes(sources.size()); - std::vector> h_buffers; - std::size_t bytes_read = 0; - std::transform_inclusive_scan(sources.begin(), - sources.end(), - prefsum_source_sizes.begin(), - std::plus{}, - [](std::unique_ptr const& s) { return s->size(); }); - auto upper = - std::upper_bound(prefsum_source_sizes.begin(), prefsum_source_sizes.end(), range_offset); - std::size_t start_source = std::distance(prefsum_source_sizes.begin(), upper); - - auto const total_bytes_to_read = - std::min(range_size, prefsum_source_sizes.back() - range_offset); - range_offset -= start_source ? 
prefsum_source_sizes[start_source - 1] : 0; - for (std::size_t i = start_source; i < sources.size() && bytes_read < total_bytes_to_read; - i++) { - if (sources[i]->is_empty()) continue; - auto data_size = - std::min(sources[i]->size() - range_offset, total_bytes_to_read - bytes_read); - auto destination = reinterpret_cast(buffer.data()) + bytes_read + - (num_delimiter_chars * delimiter_map.size()); - if (sources[i]->is_device_read_preferred(data_size)) { - bytes_read += sources[i]->device_read(range_offset, data_size, destination, stream); - } else { - h_buffers.emplace_back(sources[i]->host_read(range_offset, data_size)); - auto const& h_buffer = h_buffers.back(); - CUDF_CUDA_TRY(cudaMemcpyAsync( - destination, h_buffer->data(), h_buffer->size(), cudaMemcpyHostToDevice, stream.value())); - bytes_read += h_buffer->size(); - } - range_offset = 0; - delimiter_map.push_back(bytes_read + (num_delimiter_chars * delimiter_map.size())); - } - // Removing delimiter inserted after last non-empty source is read - if (!delimiter_map.empty()) { delimiter_map.pop_back(); } - - // If this is a multi-file source, we scatter the JSON line delimiters between files - if (sources.size() > 1) { - static_assert(num_delimiter_chars == 1, - "Currently only single-character delimiters are supported"); - auto const delimiter_source = thrust::make_constant_iterator('\n'); - auto const d_delimiter_map = cudf::detail::make_device_uvector_async( - delimiter_map, stream, cudf::get_current_device_resource_ref()); - thrust::scatter(rmm::exec_policy_nosync(stream), - delimiter_source, - delimiter_source + d_delimiter_map.size(), - d_delimiter_map.data(), - buffer.data()); - } - stream.synchronize(); - return buffer.first(bytes_read + (delimiter_map.size() * num_delimiter_chars)); - } - // TODO: allow byte range reading from multiple compressed files. - auto remaining_bytes_to_read = std::min(range_size, sources[0]->size() - range_offset); - auto hbuffer = std::vector(remaining_bytes_to_read); - // Single read because only a single compressed source is supported - // Reading to host because decompression of a single block is much faster on the CPU - sources[0]->host_read(range_offset, remaining_bytes_to_read, hbuffer.data()); - auto uncomp_data = decompress(compression, hbuffer); - CUDF_CUDA_TRY(cudaMemcpyAsync(buffer.data(), - reinterpret_cast(uncomp_data.data()), - uncomp_data.size() * sizeof(char), - cudaMemcpyHostToDevice, - stream.value())); - stream.synchronize(); - return buffer.first(uncomp_data.size()); -} - -table_with_metadata read_json(host_span> sources, - json_reader_options const& reader_opts, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) +table_with_metadata read_json_impl(host_span> sources, + json_reader_options const& reader_opts, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) { - CUDF_FUNC_RANGE(); - - if (reader_opts.get_byte_range_offset() != 0 or reader_opts.get_byte_range_size() != 0) { - CUDF_EXPECTS(reader_opts.is_enabled_lines(), - "Specifying a byte range is supported only for JSON Lines"); - } - - if (sources.size() > 1) { - CUDF_EXPECTS(reader_opts.get_compression() == compression_type::NONE, - "Multiple compressed inputs are not supported"); - CUDF_EXPECTS(reader_opts.is_enabled_lines(), - "Multiple inputs are supported only for JSON Lines format"); - } - /* * The batched JSON reader enforces that the size of each batch is at most INT_MAX * bytes (~2.14GB). 
Batches are defined to be byte range chunks - characterized by @@ -351,10 +326,16 @@ table_with_metadata read_json(host_span> sources, * JSON inputs. */ std::size_t const total_source_size = sources_size(sources, 0, 0); - std::size_t chunk_offset = reader_opts.get_byte_range_offset(); - std::size_t chunk_size = reader_opts.get_byte_range_size(); - chunk_size = !chunk_size ? total_source_size - chunk_offset - : std::min(chunk_size, total_source_size - chunk_offset); + + // Batching is enabled only for JSONL inputs, not regular JSON files + CUDF_EXPECTS( + reader_opts.is_enabled_lines() || total_source_size < std::numeric_limits::max(), + "Parsing Regular JSON inputs of size greater than INT_MAX bytes is not supported"); + + std::size_t chunk_offset = reader_opts.get_byte_range_offset(); + std::size_t chunk_size = reader_opts.get_byte_range_size(); + chunk_size = !chunk_size ? total_source_size - chunk_offset + : std::min(chunk_size, total_source_size - chunk_offset); std::size_t const size_per_subchunk = estimate_size_per_subchunk(chunk_size); std::size_t const batch_size_upper_bound = get_batch_size_upper_bound(); @@ -436,4 +417,101 @@ table_with_metadata read_json(host_span> sources, {partial_tables[0].metadata.schema_info}}; } +} // anonymous namespace + +device_span ingest_raw_input(device_span buffer, + host_span> sources, + std::size_t range_offset, + std::size_t range_size, + char delimiter, + rmm::cuda_stream_view stream) +{ + CUDF_FUNC_RANGE(); + // We append a line delimiter between two files to make sure the last line of file i and the first + // line of file i+1 don't end up on the same JSON line, if file i does not already end with a line + // delimiter. + auto constexpr num_delimiter_chars = 1; + + auto delimiter_map = cudf::detail::make_empty_host_vector(sources.size(), stream); + std::vector prefsum_source_sizes(sources.size()); + std::vector> h_buffers; + std::size_t bytes_read = 0; + std::transform_inclusive_scan(sources.begin(), + sources.end(), + prefsum_source_sizes.begin(), + std::plus{}, + [](std::unique_ptr const& s) { return s->size(); }); + auto upper = + std::upper_bound(prefsum_source_sizes.begin(), prefsum_source_sizes.end(), range_offset); + std::size_t start_source = std::distance(prefsum_source_sizes.begin(), upper); + + auto const total_bytes_to_read = std::min(range_size, prefsum_source_sizes.back() - range_offset); + range_offset -= start_source ? 
prefsum_source_sizes[start_source - 1] : 0; + for (std::size_t i = start_source; i < sources.size() && bytes_read < total_bytes_to_read; i++) { + if (sources[i]->is_empty()) continue; + auto data_size = std::min(sources[i]->size() - range_offset, total_bytes_to_read - bytes_read); + auto destination = reinterpret_cast(buffer.data()) + bytes_read + + (num_delimiter_chars * delimiter_map.size()); + if (sources[i]->is_device_read_preferred(data_size)) { + bytes_read += sources[i]->device_read(range_offset, data_size, destination, stream); + } else { + h_buffers.emplace_back(sources[i]->host_read(range_offset, data_size)); + auto const& h_buffer = h_buffers.back(); + CUDF_CUDA_TRY(cudaMemcpyAsync( + destination, h_buffer->data(), h_buffer->size(), cudaMemcpyHostToDevice, stream.value())); + bytes_read += h_buffer->size(); + } + range_offset = 0; + delimiter_map.push_back(bytes_read + (num_delimiter_chars * delimiter_map.size())); + } + // Removing delimiter inserted after last non-empty source is read + if (!delimiter_map.empty()) { delimiter_map.pop_back(); } + + // If this is a multi-file source, we scatter the JSON line delimiters between files + if (sources.size() > 1 && !delimiter_map.empty()) { + static_assert(num_delimiter_chars == 1, + "Currently only single-character delimiters are supported"); + auto const delimiter_source = thrust::make_constant_iterator(delimiter); + auto const d_delimiter_map = cudf::detail::make_device_uvector_async( + delimiter_map, stream, cudf::get_current_device_resource_ref()); + thrust::scatter(rmm::exec_policy_nosync(stream), + delimiter_source, + delimiter_source + d_delimiter_map.size(), + d_delimiter_map.data(), + buffer.data()); + } + stream.synchronize(); + return buffer.first(bytes_read + (delimiter_map.size() * num_delimiter_chars)); +} + +table_with_metadata read_json(host_span> sources, + json_reader_options const& reader_opts, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + + if (reader_opts.get_byte_range_offset() != 0 or reader_opts.get_byte_range_size() != 0) { + CUDF_EXPECTS(reader_opts.is_enabled_lines(), + "Specifying a byte range is supported only for JSON Lines"); + } + + if (sources.size() > 1) { + CUDF_EXPECTS(reader_opts.is_enabled_lines(), + "Multiple inputs are supported only for JSON Lines format"); + } + + if (reader_opts.get_compression() == compression_type::NONE) + return read_json_impl(sources, reader_opts, stream, mr); + + std::vector> compressed_sources; + for (size_t i = 0; i < sources.size(); i++) { + compressed_sources.emplace_back( + std::make_unique(sources[i], reader_opts.get_compression())); + } + // in read_json_impl, we need the compressed source size to actually be the + // uncompressed source size for correct batching + return read_json_impl(compressed_sources, reader_opts, stream, mr); +} + } // namespace cudf::io::json::detail diff --git a/cpp/src/io/json/read_json.hpp b/cpp/src/io/json/read_json.hpp index 982190eecb5..ac980938522 100644 --- a/cpp/src/io/json/read_json.hpp +++ b/cpp/src/io/json/read_json.hpp @@ -32,10 +32,9 @@ namespace CUDF_EXPORT cudf { namespace io::json::detail { // Some magic numbers -constexpr int num_subchunks = 10; // per chunk_size -constexpr size_t min_subchunk_size = 10000; -constexpr int estimated_compression_ratio = 4; -constexpr int max_subchunks_prealloced = 3; +constexpr int num_subchunks = 10; // per chunk_size +constexpr size_t min_subchunk_size = 10000; +constexpr int max_subchunks_prealloced = 3; /** * @brief Read from array of 
data sources into RMM buffer. The size of the returned device span @@ -45,17 +44,17 @@ constexpr int max_subchunks_prealloced = 3; * * @param buffer Device span buffer to which data is read * @param sources Array of data sources - * @param compression Compression format of source * @param range_offset Number of bytes to skip from source start * @param range_size Number of bytes to read from source + * @param delimiter Delimiter character for JSONL inputs * @param stream CUDA stream used for device memory operations and kernel launches * @returns A subspan of the input device span containing data read */ device_span ingest_raw_input(device_span buffer, host_span> sources, - compression_type compression, size_t range_offset, size_t range_size, + char delimiter, rmm::cuda_stream_view stream); /** diff --git a/cpp/src/io/json/write_json.cu b/cpp/src/io/json/write_json.cu index dc7199d7ab1..8156258c810 100644 --- a/cpp/src/io/json/write_json.cu +++ b/cpp/src/io/json/write_json.cu @@ -19,6 +19,7 @@ * @brief cuDF-IO JSON writer implementation */ +#include "io/comp/comp.hpp" #include "io/csv/durations.hpp" #include "io/utilities/parsing_utils.cuh" #include "lists/utilities.hpp" @@ -170,6 +171,9 @@ struct escape_strings_fn { rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { + if (column_v.is_empty()) { // empty begets empty + return make_empty_column(type_id::STRING); + } auto [offsets_column, chars] = cudf::strings::detail::make_strings_children(*this, column_v.size(), stream, mr); @@ -825,10 +829,10 @@ void write_chunked(data_sink* out_sink, } } -void write_json(data_sink* out_sink, - table_view const& table, - json_writer_options const& options, - rmm::cuda_stream_view stream) +void write_json_uncompressed(data_sink* out_sink, + table_view const& table, + json_writer_options const& options, + rmm::cuda_stream_view stream) { CUDF_FUNC_RANGE(); std::vector user_column_names = [&]() { @@ -931,4 +935,24 @@ void write_json(data_sink* out_sink, } } +void write_json(data_sink* out_sink, + table_view const& table, + json_writer_options const& options, + rmm::cuda_stream_view stream) +{ + if (options.get_compression() != compression_type::NONE) { + std::vector hbuf; + auto hbuf_sink_ptr = data_sink::create(&hbuf); + write_json_uncompressed(hbuf_sink_ptr.get(), table, options, stream); + stream.synchronize(); + auto comp_hbuf = cudf::io::detail::compress( + options.get_compression(), + host_span(reinterpret_cast(hbuf.data()), hbuf.size()), + stream); + out_sink->host_write(comp_hbuf.data(), comp_hbuf.size()); + return; + } + write_json_uncompressed(out_sink, table, options, stream); +} + } // namespace cudf::io::json::detail diff --git a/cpp/src/io/orc/dict_enc.cu b/cpp/src/io/orc/dict_enc.cu index 5be75350951..0cb5c382631 100644 --- a/cpp/src/io/orc/dict_enc.cu +++ b/cpp/src/io/orc/dict_enc.cu @@ -77,20 +77,6 @@ void rowgroup_char_counts(device_2dspan counts, counts, orc_columns, rowgroup_bounds, str_col_indexes); } -template -CUDF_KERNEL void __launch_bounds__(block_size) - initialize_dictionary_hash_maps_kernel(device_span dictionaries) -{ - auto const dict_map = dictionaries[blockIdx.x].map_slots; - auto const t = threadIdx.x; - for (size_type i = 0; i < dict_map.size(); i += block_size) { - if (t + i < dict_map.size()) { - new (&dict_map[t + i].first) map_type::atomic_key_type{KEY_SENTINEL}; - new (&dict_map[t + i].second) map_type::atomic_mapped_type{VALUE_SENTINEL}; - } - } -} - struct equality_functor { column_device_view const& col; __device__ bool operator()(size_type lhs_idx, 
diff --git a/cpp/src/io/orc/dict_enc.cu b/cpp/src/io/orc/dict_enc.cu
index 5be75350951..0cb5c382631 100644
--- a/cpp/src/io/orc/dict_enc.cu
+++ b/cpp/src/io/orc/dict_enc.cu
@@ -77,20 +77,6 @@ void rowgroup_char_counts(device_2dspan<size_type> counts,
    counts, orc_columns, rowgroup_bounds, str_col_indexes);
 }

-template <int block_size>
-CUDF_KERNEL void __launch_bounds__(block_size)
-  initialize_dictionary_hash_maps_kernel(device_span<stripe_dictionary> dictionaries)
-{
-  auto const dict_map = dictionaries[blockIdx.x].map_slots;
-  auto const t        = threadIdx.x;
-  for (size_type i = 0; i < dict_map.size(); i += block_size) {
-    if (t + i < dict_map.size()) {
-      new (&dict_map[t + i].first) map_type::atomic_key_type{KEY_SENTINEL};
-      new (&dict_map[t + i].second) map_type::atomic_mapped_type{VALUE_SENTINEL};
-    }
-  }
-}
-
 struct equality_functor {
  column_device_view const& col;
  __device__ bool operator()(size_type lhs_idx, size_type rhs_idx) const
@@ -109,6 +95,9 @@ struct hash_functor {
  }
 };

+// Probing scheme to use for the hash map
+using probing_scheme_type = cuco::linear_probing<map_cg_size, hash_functor>;
+
 template <int block_size>
 CUDF_KERNEL void __launch_bounds__(block_size)
  populate_dictionary_hash_maps_kernel(device_2dspan<stripe_dictionary> dictionaries,
@@ -121,26 +110,34 @@ CUDF_KERNEL void __launch_bounds__(block_size)
  auto const& col = columns[dict.column_idx];

  // Make a view of the hash map
-  auto hash_map_mutable = map_type::device_mutable_view(dict.map_slots.data(),
-                                                        dict.map_slots.size(),
-                                                        cuco::empty_key{KEY_SENTINEL},
-                                                        cuco::empty_value{VALUE_SENTINEL});
  auto const hash_fn     = hash_functor{col};
  auto const equality_fn = equality_functor{col};

+  storage_ref_type const storage_ref{dict.map_slots.size(), dict.map_slots.data()};
+  // Make a view of the hash map.
+  auto hash_map_ref = cuco::static_map_ref{cuco::empty_key{KEY_SENTINEL},
+                                           cuco::empty_value{VALUE_SENTINEL},
+                                           equality_fn,
+                                           probing_scheme_type{hash_fn},
+                                           cuco::thread_scope_block,
+                                           storage_ref};
+
+  // Create a map ref with `cuco::insert` operator
+  auto hash_map_insert_ref = hash_map_ref.rebind_operators(cuco::insert);
+
  auto const start_row = dict.start_row;
  auto const end_row   = dict.start_row + dict.num_rows;

  size_type entry_count{0};
  size_type char_count{0};

+  // all threads should loop the same number of times
  for (thread_index_type cur_row = start_row + t; cur_row - t < end_row; cur_row += block_size) {
    auto const is_valid = cur_row < end_row and col.is_valid(cur_row);

    if (is_valid) {
      // insert element at cur_row to hash map and count successful insertions
-      auto const is_unique =
-        hash_map_mutable.insert(std::pair(cur_row, cur_row), hash_fn, equality_fn);
+      auto const is_unique = hash_map_insert_ref.insert(cuco::pair{cur_row, cur_row});

      if (is_unique) {
        ++entry_count;
@@ -175,24 +172,23 @@
  if (not dict.is_enabled) { return; }

  auto const t = threadIdx.x;
-  auto map = map_type::device_view(dict.map_slots.data(),
-                                   dict.map_slots.size(),
-                                   cuco::empty_key{KEY_SENTINEL},
-                                   cuco::empty_value{VALUE_SENTINEL});
-
  __shared__ cuda::atomic<size_type, cuda::thread_scope_block> counter;

  using cuda::std::memory_order_relaxed;
  if (t == 0) { new (&counter) cuda::atomic<size_type, cuda::thread_scope_block>{0}; }
  __syncthreads();
+
  for (size_type i = 0; i < dict.map_slots.size(); i += block_size) {
    if (t + i < dict.map_slots.size()) {
-      auto* slot = reinterpret_cast<slot_type*>(map.begin_slot() + t + i);
-      auto key   = slot->first;
-      if (key != KEY_SENTINEL) {
-        auto loc       = counter.fetch_add(1, memory_order_relaxed);
-        dict.data[loc] = key;
-        slot->second   = loc;
+      auto window = dict.map_slots.begin() + t + i;
+      // Collect all slots from each window.
+      for (auto& slot : *window) {
+        auto const key = slot.first;
+        if (key != KEY_SENTINEL) {
+          auto loc       = counter.fetch_add(1, memory_order_relaxed);
+          dict.data[loc] = key;
+          slot.second    = loc;
+        }
      }
    }
  }
@@ -205,47 +201,42 @@
 {
  auto const col_idx    = blockIdx.x;
  auto const stripe_idx = blockIdx.y;
+  auto const t          = threadIdx.x;
  auto const& dict = dictionaries[col_idx][stripe_idx];
  auto const& col  = columns[dict.column_idx];

  if (not dict.is_enabled) { return; }

-  auto const t = threadIdx.x;
+  // Hash and equality functors for the map
+  auto const hash_fn     = hash_functor{col};
+  auto const equality_fn = equality_functor{col};
+
+  storage_ref_type const storage_ref{dict.map_slots.size(), dict.map_slots.data()};
+  // Make a view of the hash map.
+  auto hash_map_ref = cuco::static_map_ref{cuco::empty_key{KEY_SENTINEL},
+                                           cuco::empty_value{VALUE_SENTINEL},
+                                           equality_fn,
+                                           probing_scheme_type{hash_fn},
+                                           cuco::thread_scope_block,
+                                           storage_ref};
+
+  // Create a map ref with `cuco::find` operator
+  auto hash_map_find_ref = hash_map_ref.rebind_operators(cuco::find);
+
  auto const start_row = dict.start_row;
  auto const end_row   = dict.start_row + dict.num_rows;

-  auto const map = map_type::device_view(dict.map_slots.data(),
-                                         dict.map_slots.size(),
-                                         cuco::empty_key{KEY_SENTINEL},
-                                         cuco::empty_value{VALUE_SENTINEL});
-
-  thread_index_type cur_row = start_row + t;
-  while (cur_row < end_row) {
+  for (thread_index_type cur_row = start_row + t; cur_row < end_row; cur_row += block_size) {
    if (col.is_valid(cur_row)) {
-      auto const hash_fn     = hash_functor{col};
-      auto const equality_fn = equality_functor{col};
-      auto const found_slot  = map.find(cur_row, hash_fn, equality_fn);
-      cudf_assert(found_slot != map.end() &&
+      auto const found_slot = hash_map_find_ref.find(cur_row);
+      // Fail if we didn't find the previously inserted key.
+      cudf_assert(found_slot != hash_map_find_ref.end() &&
                  "Unable to find value in map in dictionary index construction");
-      if (found_slot != map.end()) {
-        // No need for atomic as this is not going to be modified by any other thread
-        auto const val_ptr  = reinterpret_cast<slot_type*>(&found_slot->second);
-        dict.index[cur_row] = *val_ptr;
-      }
+      dict.index[cur_row] = found_slot->second;
    }
-    cur_row += block_size;
  }
 }

-void initialize_dictionary_hash_maps(device_2dspan<stripe_dictionary> dictionaries,
-                                     rmm::cuda_stream_view stream)
-{
-  if (dictionaries.count() == 0) { return; }
-  constexpr int block_size = 1024;
-  initialize_dictionary_hash_maps_kernel<block_size>
-    <<<dictionaries.count(), block_size, 0, stream.value()>>>(dictionaries.flat_view());
-}
-
 void populate_dictionary_hash_maps(device_2dspan<stripe_dictionary> dictionaries,
                                    device_span<orc_column_device_view const> columns,
                                    rmm::cuda_stream_view stream)
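The ORC writer below (see build_dictionaries in writer_impl.cu) now sizes one bulk slot array for all per-stripe sub-maps and carves it with exclusive-prefix offsets, using the occupancy_factor declared in orc_gpu.hpp. A self-contained host-side sketch of that carving scheme, with a plain vector standing in for the device-side cuco storage:

#include <cstddef>
#include <utility>
#include <vector>

int main()
{
  float const occupancy_factor = 1.43f;  // ~70% target occupancy
  std::vector<std::size_t> const stripe_rows{1000, 250, 4096};

  // Exclusive prefix sum of per-stripe slot counts; offsets.back() is the
  // total bulk allocation size once the loop finishes.
  std::vector<std::size_t> offsets{0};
  for (auto const rows : stripe_rows) {
    offsets.push_back(offsets.back() +
                      static_cast<std::size_t>(static_cast<float>(rows) * occupancy_factor));
  }

  // One bulk, sentinel-initialized allocation; sub-map i is the half-open
  // slot range [offsets[i], offsets[i + 1]).
  using slot = std::pair<int, int>;
  std::vector<slot> bulk_storage(offsets.back(), slot{-1, -1});
  auto const stripe1_slots = bulk_storage.data() + offsets[1];
  auto const stripe1_size  = offsets[2] - offsets[1];
  (void)stripe1_slots;
  (void)stripe1_size;
  return 0;
}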
diff --git a/cpp/src/io/orc/orc.cpp b/cpp/src/io/orc/orc.cpp
index 1fe5e5aa41e..7046b3b3f91 100644
--- a/cpp/src/io/orc/orc.cpp
+++ b/cpp/src/io/orc/orc.cpp
@@ -460,7 +460,7 @@ host_span<uint8_t const> OrcDecompressor::decompress_blocks(host_span<uint8_t c
-  friend class FunctionSwitchImpl;
+  friend struct FunctionSwitchImpl;

  void skip_bytes(size_t bytecnt)
  {
diff --git a/cpp/src/io/orc/orc_gpu.hpp b/cpp/src/io/orc/orc_gpu.hpp
index 8c7ccf0527f..0949fafe9a4 100644
--- a/cpp/src/io/orc/orc_gpu.hpp
+++ b/cpp/src/io/orc/orc_gpu.hpp
@@ -21,6 +21,7 @@
 #include "io/utilities/column_buffer.hpp"
 #include "orc.hpp"

+#include
 #include
 #include
 #include
@@ -40,19 +41,27 @@ namespace gpu {
 using cudf::detail::device_2dspan;
 using cudf::detail::host_2dspan;

+using key_type    = size_type;
+using mapped_type = size_type;
+using slot_type   = cuco::pair<key_type, mapped_type>;
+auto constexpr map_cg_size =
+  1;  ///< A CUDA Cooperative Group of 1 thread (set for best performance) to handle each subset.
+      ///< Note: Adjust insert and find loops to use `cg::tile` if increasing this.
+auto constexpr window_size =
+  1;  ///< Number of concurrent slots (set for best performance) handled by each thread.
+auto constexpr occupancy_factor = 1.43f;  ///< cuCollections suggests using a hash map of size
+                                          ///< N * (1/0.7) = 1.43 to target a 70% occupancy factor.
+using storage_type     = cuco::aow_storage<slot_type,
+                                           window_size,
+                                           cuco::extent<std::size_t>,
+                                           cudf::detail::cuco_allocator<char>>;
+using storage_ref_type = typename storage_type::ref_type;
+using window_type      = typename storage_type::window_type;
+
 auto constexpr KEY_SENTINEL   = size_type{-1};
 auto constexpr VALUE_SENTINEL = size_type{-1};

-using map_type = cuco::legacy::static_map<size_type, size_type>;
-
-/**
- * @brief The alias of `map_type::pair_atomic_type` class.
- *
- * Declare this struct by trivial subclassing instead of type aliasing so we can have forward
- * declaration of this struct somewhere else.
- */
-struct slot_type : public map_type::slot_type {};
-
 struct CompressedStreamInfo {
  CompressedStreamInfo() = default;
  explicit constexpr CompressedStreamInfo(uint8_t const* compressed_data_, size_t compressed_size_)
@@ -184,11 +193,11 @@ struct StripeStream {
  */
 struct stripe_dictionary {
  // input
-  device_span<slot_type> map_slots;    // hash map storage
-  uint32_t column_idx      = 0;        // column index
-  size_type start_row      = 0;        // first row in the stripe
-  size_type start_rowgroup = 0;        // first rowgroup in the stripe
-  size_type num_rows       = 0;        // number of rows in the stripe
+  device_span<window_type> map_slots;  // hash map (windows) storage
+  uint32_t column_idx      = 0;        // column index
+  size_type start_row      = 0;        // first row in the stripe
+  size_type start_rowgroup = 0;        // first rowgroup in the stripe
+  size_type num_rows       = 0;        // number of rows in the stripe

  // output
  device_span data;  // index of elements in the column to include in the dictionary
diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu
index 01ee5ad177d..fcaee9c548e 100644
--- a/cpp/src/io/orc/reader_impl_chunking.cu
+++ b/cpp/src/io/orc/reader_impl_chunking.cu
@@ -500,6 +500,8 @@ void reader_impl::load_next_stripe_data(read_mode mode)
  auto const [read_begin, read_end] =
    merge_selected_ranges(_file_itm_data.stripe_data_read_ranges, load_stripe_range);

+  bool stream_synchronized{false};
+
  for (auto read_idx = read_begin; read_idx < read_end; ++read_idx) {
    auto const& read_info = _file_itm_data.data_read_info[read_idx];
    auto const source_ptr = _metadata.per_file_metadata[read_info.source_idx].source;
    auto const dst_base   = static_cast<uint8_t*>(
      lvl_stripe_data[read_info.level][read_info.stripe_idx - stripe_start].data());

    if (source_ptr->is_device_read_preferred(read_info.length)) {
-      device_read_tasks.push_back(
-        std::pair(source_ptr->device_read_async(
-                    read_info.offset, read_info.length, dst_base + read_info.dst_pos, _stream),
-                  read_info.length));
+      // `device_read_async` may not use _stream at all.
+      // Instead, it may use some other stream(s) to sync the H->D memcpy.
+      // As such, we need to make sure the device buffers in `lvl_stripe_data` are ready first.
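+      // (Illustrative hazard, not part of the code: a datasource backed by its
+      //  own internal copy stream could start filling `dst_base` while the
+      //  `lvl_stripe_data` allocations on `_stream` are still pending; one
+      //  synchronize before the first such read closes that window without
+      //  serializing every read.)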
+ if (!stream_synchronized) { + _stream.synchronize(); + stream_synchronized = true; + } + device_read_tasks.emplace_back( + source_ptr->device_read_async( + read_info.offset, read_info.length, dst_base + read_info.dst_pos, _stream), + read_info.length); } else { auto buffer = source_ptr->host_read(read_info.offset, read_info.length); @@ -659,8 +668,8 @@ void reader_impl::load_next_stripe_data(read_mode mode) if (_metadata.per_file_metadata[0].ps.compression != orc::NONE) { auto const& decompressor = *_metadata.per_file_metadata[0].decompressor; - auto compinfo = cudf::detail::hostdevice_span( - hd_compinfo.begin(), hd_compinfo.d_begin(), stream_range.size()); + auto compinfo = cudf::detail::hostdevice_span{hd_compinfo}.subspan( + 0, stream_range.size()); for (auto stream_idx = stream_range.begin; stream_idx < stream_range.end; ++stream_idx) { auto const& info = stream_info[stream_idx]; auto const dst_base = diff --git a/cpp/src/io/orc/reader_impl_decode.cu b/cpp/src/io/orc/reader_impl_decode.cu index d628e936cb1..0081ed30d17 100644 --- a/cpp/src/io/orc/reader_impl_decode.cu +++ b/cpp/src/io/orc/reader_impl_decode.cu @@ -22,6 +22,8 @@ #include "io/utilities/hostdevice_span.hpp" #include +#include +#include #include #include #include @@ -32,7 +34,6 @@ #include #include -#include #include #include @@ -451,7 +452,7 @@ void decode_stream_data(int64_t num_dicts, update_null_mask(chunks, out_buffers, stream, mr); } - rmm::device_scalar error_count(0, stream); + cudf::detail::device_scalar error_count(0, stream); gpu::DecodeOrcColumnData(chunks.base_device_ptr(), global_dict.data(), row_groups, @@ -508,21 +509,20 @@ void scan_null_counts(cudf::detail::hostdevice_2dvector const& auto const d_prefix_sums_to_update = cudf::detail::make_device_uvector_async( prefix_sums_to_update, stream, cudf::get_current_device_resource_ref()); - thrust::for_each( - rmm::exec_policy_nosync(stream), - d_prefix_sums_to_update.begin(), - d_prefix_sums_to_update.end(), - [num_stripes, chunks = cudf::detail::device_2dspan{chunks}] __device__( - auto const& idx_psums) { - auto const col_idx = idx_psums.first; - auto const psums = idx_psums.second; - thrust::transform(thrust::seq, - thrust::make_counting_iterator(0ul), - thrust::make_counting_iterator(num_stripes), - psums, - [&](auto stripe_idx) { return chunks[stripe_idx][col_idx].null_count; }); - thrust::inclusive_scan(thrust::seq, psums, psums + num_stripes, psums); - }); + thrust::for_each(rmm::exec_policy_nosync(stream), + d_prefix_sums_to_update.begin(), + d_prefix_sums_to_update.end(), + [num_stripes, chunks = chunks.device_view()] __device__(auto const& idx_psums) { + auto const col_idx = idx_psums.first; + auto const psums = idx_psums.second; + thrust::transform( + thrust::seq, + thrust::make_counting_iterator(0ul), + thrust::make_counting_iterator(num_stripes), + psums, + [&](auto stripe_idx) { return chunks[stripe_idx][col_idx].null_count; }); + thrust::inclusive_scan(thrust::seq, psums, psums + num_stripes, psums); + }); // `prefix_sums_to_update` goes out of scope, copy has to be done before we return stream.synchronize(); } @@ -554,12 +554,12 @@ void aggregate_child_meta(std::size_t level, col_meta.num_child_rows_per_stripe.resize(number_of_child_chunks); col_meta.rwgrp_meta.resize(num_of_rowgroups * num_child_cols); - auto child_start_row = cudf::detail::host_2dspan( - col_meta.child_start_row.data(), num_of_stripes, num_child_cols); - auto num_child_rows_per_stripe = cudf::detail::host_2dspan( - col_meta.num_child_rows_per_stripe.data(), num_of_stripes, 
num_child_cols); + auto child_start_row = + cudf::detail::host_2dspan(col_meta.child_start_row, num_child_cols); + auto num_child_rows_per_stripe = + cudf::detail::host_2dspan(col_meta.num_child_rows_per_stripe, num_child_cols); auto rwgrp_meta = cudf::detail::host_2dspan( - col_meta.rwgrp_meta.data(), num_of_rowgroups, num_child_cols); + col_meta.rwgrp_meta, num_child_cols); int index = 0; // number of child column processed @@ -951,8 +951,9 @@ void reader_impl::decompress_and_decode_stripes(read_mode mode) // Setup row group descriptors if using indexes. if (_metadata.per_file_metadata[0].ps.compression != orc::NONE) { - auto compinfo = cudf::detail::hostdevice_span( - hd_compinfo.begin(), hd_compinfo.d_begin(), stream_range.size()); + auto const compinfo = + cudf::detail::hostdevice_span{hd_compinfo}.subspan( + 0, stream_range.size()); auto decomp_data = decompress_stripe_data(load_stripe_range, stream_range, stripe_count, diff --git a/cpp/src/io/orc/reader_impl_helpers.cpp b/cpp/src/io/orc/reader_impl_helpers.cpp index 4c1079cffe8..7e5db4b7617 100644 --- a/cpp/src/io/orc/reader_impl_helpers.cpp +++ b/cpp/src/io/orc/reader_impl_helpers.cpp @@ -16,8 +16,6 @@ #include "reader_impl_helpers.hpp" -#include - namespace cudf::io::orc::detail { std::unique_ptr create_empty_column(size_type orc_col_id, diff --git a/cpp/src/io/orc/reader_impl_helpers.hpp b/cpp/src/io/orc/reader_impl_helpers.hpp index 5528b2ee763..4cded30d89b 100644 --- a/cpp/src/io/orc/reader_impl_helpers.hpp +++ b/cpp/src/io/orc/reader_impl_helpers.hpp @@ -20,9 +20,6 @@ #include "io/orc/orc.hpp" #include "io/utilities/column_buffer.hpp" -#include -#include - #include #include diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu index 5c70e35fd2e..ed0b6969154 100644 --- a/cpp/src/io/orc/stripe_enc.cu +++ b/cpp/src/io/orc/stripe_enc.cu @@ -20,6 +20,8 @@ #include "orc_gpu.hpp" #include +#include +#include #include #include #include @@ -1087,37 +1089,42 @@ CUDF_KERNEL void __launch_bounds__(block_size) /** * @brief Merge chunked column data into a single contiguous stream * - * @param[in,out] strm_desc StripeStream device array [stripe][stream] - * @param[in,out] streams List of encoder chunk streams [column][rowgroup] + * @param[in] strm_desc StripeStream device array [stripe][stream] + * @param[in] streams List of encoder chunk streams [column][rowgroup] + * @param[out] srcs List of source encoder chunk stream data addresses + * @param[out] dsts List of destination StripeStream data addresses + * @param[out] sizes List of stream sizes in bytes */ // blockDim {compact_streams_block_size,1,1} CUDF_KERNEL void __launch_bounds__(compact_streams_block_size) - gpuCompactOrcDataStreams(device_2dspan strm_desc, - device_2dspan streams) + gpuInitBatchedMemcpy(device_2dspan strm_desc, + device_2dspan streams, + device_span srcs, + device_span dsts, + device_span sizes) { - __shared__ __align__(16) StripeStream ss; - - auto const stripe_id = blockIdx.x; + auto const stripe_id = cudf::detail::grid_1d::global_thread_id(); auto const stream_id = blockIdx.y; - auto const t = threadIdx.x; + if (stripe_id >= strm_desc.size().first) { return; } - if (t == 0) { ss = strm_desc[stripe_id][stream_id]; } - __syncthreads(); + auto const out_id = stream_id * strm_desc.size().first + stripe_id; + StripeStream ss = strm_desc[stripe_id][stream_id]; if (ss.data_ptr == nullptr) { return; } auto const cid = ss.stream_type; auto dst_ptr = ss.data_ptr; for (auto group = ss.first_chunk_id; group < ss.first_chunk_id + ss.num_chunks; ++group) { + 
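+    // Record one (src, dst, size) triple per rowgroup chunk of this stream:
+    // out_id indexes a flat [num_streams x num_rowgroups] array (e.g. with 2
+    // streams and 3 rowgroups, entries 0..2 belong to stream 0 and 3..5 to
+    // stream 1), so every chunk gets a unique slot.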
auto const out_id = stream_id * streams.size().second + group; + srcs[out_id] = streams[ss.column_id][group].data_ptrs[cid]; + dsts[out_id] = dst_ptr; + + // Also update the stream here, data will be copied in a separate kernel + streams[ss.column_id][group].data_ptrs[cid] = dst_ptr; + auto const len = streams[ss.column_id][group].lengths[cid]; - if (len > 0) { - auto const src_ptr = streams[ss.column_id][group].data_ptrs[cid]; - for (uint32_t i = t; i < len; i += blockDim.x) { - dst_ptr[i] = src_ptr[i]; - } - __syncthreads(); - } - if (t == 0) { streams[ss.column_id][group].data_ptrs[cid] = dst_ptr; } + // len is the size (in bytes) of the current stream. + sizes[out_id] = len; dst_ptr += len; } } @@ -1325,9 +1332,26 @@ void CompactOrcDataStreams(device_2dspan strm_desc, device_2dspan enc_streams, rmm::cuda_stream_view stream) { + auto const num_rowgroups = enc_streams.size().second; + auto const num_streams = strm_desc.size().second; + auto const num_stripes = strm_desc.size().first; + auto const num_chunks = num_rowgroups * num_streams; + auto srcs = cudf::detail::make_zeroed_device_uvector_async( + num_chunks, stream, rmm::mr::get_current_device_resource()); + auto dsts = cudf::detail::make_zeroed_device_uvector_async( + num_chunks, stream, rmm::mr::get_current_device_resource()); + auto lengths = cudf::detail::make_zeroed_device_uvector_async( + num_chunks, stream, rmm::mr::get_current_device_resource()); + dim3 dim_block(compact_streams_block_size, 1); - dim3 dim_grid(strm_desc.size().first, strm_desc.size().second); - gpuCompactOrcDataStreams<<>>(strm_desc, enc_streams); + dim3 dim_grid(cudf::util::div_rounding_up_unsafe(num_stripes, compact_streams_block_size), + strm_desc.size().second); + gpuInitBatchedMemcpy<<>>( + strm_desc, enc_streams, srcs, dsts, lengths); + + // Copy streams in a batched manner. 
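+  // (One batched launch replaces the per-stripe element-wise copy kernel;
+  //  entries left at zero in the zeroed `lengths` vector simply copy nothing.)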
+ cudf::detail::batched_memcpy_async( + srcs.begin(), dsts.begin(), lengths.begin(), lengths.size(), stream); } std::optional CompressOrcDataStreams( diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index 60a64fb0ee6..d432deb8e79 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -19,7 +19,9 @@ * @brief cuDF-IO ORC writer class implementation */ +#include "cudf/detail/utilities/cuda_memcpy.hpp" #include "io/comp/nvcomp_adapter.hpp" +#include "io/orc/orc_gpu.hpp" #include "io/statistics/column_statistics.cuh" #include "io/utilities/column_utils.cuh" #include "writer_impl.hpp" @@ -717,8 +719,8 @@ std::vector> calculate_aligned_rowgroup_bounds( auto d_pd_set_counts_data = rmm::device_uvector( orc_table.num_columns() * segmentation.num_rowgroups(), stream); - auto const d_pd_set_counts = device_2dspan{ - d_pd_set_counts_data.data(), segmentation.num_rowgroups(), orc_table.num_columns()}; + auto const d_pd_set_counts = + device_2dspan{d_pd_set_counts_data, orc_table.num_columns()}; gpu::reduce_pushdown_masks(orc_table.d_columns, segmentation.rowgroups, d_pd_set_counts, stream); auto aligned_rgs = hostdevice_2dvector( @@ -739,7 +741,7 @@ std::vector> calculate_aligned_rowgroup_bounds( [columns = device_span{orc_table.d_columns}, stripes = device_span{d_stripes}, d_pd_set_counts, - out_rowgroups = device_2dspan{aligned_rgs}] __device__(auto& idx) { + out_rowgroups = aligned_rgs.device_view()] __device__(auto& idx) { uint32_t const col_idx = idx / stripes.size(); // No alignment needed for root columns if (not columns[col_idx].parent_index.has_value()) return; @@ -911,7 +913,7 @@ encoded_data encode_columns(orc_table_view const& orc_table, rmm::exec_policy(stream), thrust::make_counting_iterator(0ul), chunks.count(), - [chunks = device_2dspan{chunks}, + [chunks = chunks.device_view(), cols = device_span{orc_table.d_columns}] __device__(auto& idx) { auto const col_idx = idx / chunks.size().second; auto const rg_idx = idx % chunks.size().second; @@ -1407,7 +1409,8 @@ encoded_footer_statistics finish_statistic_blobs(Footer const& footer, num_entries_seen += stripes_per_col; } - std::vector file_stats_merge(num_file_blobs); + auto file_stats_merge = + cudf::detail::make_host_vector(num_file_blobs, stream); for (auto i = 0u; i < num_file_blobs; ++i) { auto col_stats = &file_stats_merge[i]; col_stats->col_dtype = per_chunk_stats.col_types[i]; @@ -1417,11 +1420,10 @@ encoded_footer_statistics finish_statistic_blobs(Footer const& footer, } auto d_file_stats_merge = stats_merge.device_ptr(num_stripe_blobs); - CUDF_CUDA_TRY(cudaMemcpyAsync(d_file_stats_merge, - file_stats_merge.data(), - num_file_blobs * sizeof(statistics_merge_group), - cudaMemcpyDefault, - stream.value())); + cudf::detail::cuda_memcpy_async( + device_span{stats_merge.device_ptr(num_stripe_blobs), num_file_blobs}, + file_stats_merge, + stream); auto file_stat_chunks = stat_chunks.data() + num_stripe_blobs; detail::merge_group_statistics( @@ -1572,7 +1574,7 @@ void write_index_stream(int32_t stripe_id, * @param[in] strm_desc Stream's descriptor * @param[in] enc_stream Chunk's streams * @param[in] compressed_data Compressed stream data - * @param[in,out] stream_out Temporary host output buffer + * @param[in,out] bounce_buffer Pinned memory bounce buffer for D2H data transfer * @param[in,out] stripe Stream's parent stripe * @param[in,out] streams List of all streams * @param[in] compression_kind The compression kind @@ -1583,7 +1585,7 @@ void write_index_stream(int32_t stripe_id, 
std::future write_data_stream(gpu::StripeStream const& strm_desc, gpu::encoder_chunk_streams const& enc_stream, uint8_t const* compressed_data, - uint8_t* stream_out, + host_span bounce_buffer, StripeInformation* stripe, orc_streams* streams, CompressionKind compression_kind, @@ -1603,11 +1605,10 @@ std::future write_data_stream(gpu::StripeStream const& strm_desc, if (out_sink->is_device_write_preferred(length)) { return out_sink->device_write_async(stream_in, length, stream); } else { - CUDF_CUDA_TRY( - cudaMemcpyAsync(stream_out, stream_in, length, cudaMemcpyDefault, stream.value())); - stream.synchronize(); + cudf::detail::cuda_memcpy( + bounce_buffer.subspan(0, length), device_span{stream_in, length}, stream); - out_sink->host_write(stream_out, length); + out_sink->host_write(bounce_buffer.data(), length); return std::async(std::launch::deferred, [] {}); } }(); @@ -1897,7 +1898,7 @@ hostdevice_2dvector calculate_rowgroup_bounds(orc_table_view cons thrust::make_counting_iterator(0ul), num_rowgroups, [cols = device_span{orc_table.d_columns}, - rg_bounds = device_2dspan{rowgroup_bounds}, + rg_bounds = rowgroup_bounds.device_view(), rowgroup_size] __device__(auto rg_idx) mutable { thrust::transform( thrust::seq, cols.begin(), cols.end(), rg_bounds[rg_idx].begin(), [&](auto const& col) { @@ -1987,8 +1988,7 @@ encoder_decimal_info decimal_chunk_sizes(orc_table_view& orc_table, d_tmp_rowgroup_sizes.end(), [src = esizes.data(), col_idx = col_idx, - rg_bounds = device_2dspan{ - segmentation.rowgroups}] __device__(auto idx) { + rg_bounds = segmentation.rowgroups.device_view()] __device__(auto idx) { return src[rg_bounds[idx][col_idx].end - 1]; }); @@ -2050,7 +2050,7 @@ auto set_rowgroup_char_counts(orc_table_view& orc_table, auto const num_str_cols = orc_table.num_string_columns(); auto counts = rmm::device_uvector(num_str_cols * num_rowgroups, stream); - auto counts_2d_view = device_2dspan(counts.data(), num_str_cols, num_rowgroups); + auto counts_2d_view = device_2dspan(counts, num_rowgroups); gpu::rowgroup_char_counts(counts_2d_view, orc_table.d_columns, rowgroup_bounds, @@ -2110,7 +2110,9 @@ stripe_dictionaries build_dictionaries(orc_table_view& orc_table, bool sort_dictionaries, rmm::cuda_stream_view stream) { - std::vector>> hash_maps_storage( + // Variable to keep track of the current total map storage size + size_t total_map_storage_size = 0; + std::vector> hash_maps_storage_offsets( orc_table.string_column_indices.size()); for (auto col_idx : orc_table.string_column_indices) { auto& str_column = orc_table.column(col_idx); @@ -2119,14 +2121,21 @@ stripe_dictionaries build_dictionaries(orc_table_view& orc_table, stripe.size == 0 ? 
0 : segmentation.rowgroups[stripe.first + stripe.size - 1][col_idx].end - segmentation.rowgroups[stripe.first][col_idx].begin; - hash_maps_storage[str_column.str_index()].emplace_back(stripe_num_rows * 1.43, stream); + hash_maps_storage_offsets[str_column.str_index()].emplace_back(total_map_storage_size); + total_map_storage_size += stripe_num_rows * gpu::occupancy_factor; } + hash_maps_storage_offsets[str_column.str_index()].emplace_back(total_map_storage_size); } hostdevice_2dvector stripe_dicts( orc_table.num_string_columns(), segmentation.num_stripes(), stream); if (stripe_dicts.count() == 0) return {std::move(stripe_dicts), {}, {}}; + // Create a single bulk storage to use for all sub-dictionaries + auto map_storage = std::make_unique( + total_map_storage_size, + cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream}); + // Initialize stripe dictionaries for (auto col_idx : orc_table.string_column_indices) { auto& str_column = orc_table.column(col_idx); @@ -2137,7 +2146,9 @@ stripe_dictionaries build_dictionaries(orc_table_view& orc_table, auto const stripe_idx = stripe.id; auto& sd = stripe_dicts[str_col_idx][stripe_idx]; - sd.map_slots = hash_maps_storage[str_col_idx][stripe_idx]; + sd.map_slots = {map_storage->data() + hash_maps_storage_offsets[str_col_idx][stripe_idx], + hash_maps_storage_offsets[str_col_idx][stripe_idx + 1] - + hash_maps_storage_offsets[str_col_idx][stripe_idx]}; sd.column_idx = col_idx; sd.start_row = segmentation.rowgroups[stripe.first][col_idx].begin; sd.start_rowgroup = stripe.first; @@ -2150,7 +2161,7 @@ stripe_dictionaries build_dictionaries(orc_table_view& orc_table, } stripe_dicts.host_to_device_async(stream); - gpu::initialize_dictionary_hash_maps(stripe_dicts, stream); + map_storage->initialize_async({gpu::KEY_SENTINEL, gpu::VALUE_SENTINEL}, {stream.value()}); gpu::populate_dictionary_hash_maps(stripe_dicts, orc_table.d_columns, stream); // Copy the entry counts and char counts from the device to the host stripe_dicts.device_to_host_sync(stream); @@ -2184,8 +2195,7 @@ stripe_dictionaries build_dictionaries(orc_table_view& orc_table, col_use_dictionary = true; } else { // Clear hash map storage as dictionary encoding is not used for this stripe - hash_maps_storage[str_col_idx][stripe_idx] = rmm::device_uvector(0, stream); - sd.map_slots = {}; + sd.map_slots = {}; } } // If any stripe uses dictionary encoding, allocate index storage for the whole column @@ -2203,7 +2213,7 @@ stripe_dictionaries build_dictionaries(orc_table_view& orc_table, gpu::get_dictionary_indices(stripe_dicts, orc_table.d_columns, stream); // deallocate hash map storage, unused after this point - hash_maps_storage.clear(); + map_storage.reset(); // Clear map slots and attach order buffers auto dictionaries_flat = stripe_dicts.host_view().flat_view(); @@ -2606,7 +2616,7 @@ void writer::impl::write_orc_data_to_sink(encoded_data const& enc_data, strm_desc, enc_data.streams[strm_desc.column_id][segmentation.stripes[stripe_id].first], compressed_data.data(), - bounce_buffer.data(), + bounce_buffer, &stripe, &streams, _compression_kind, diff --git a/cpp/src/io/parquet/arrow_schema_writer.cpp b/cpp/src/io/parquet/arrow_schema_writer.cpp index ddf65e9020f..d15435b2553 100644 --- a/cpp/src/io/parquet/arrow_schema_writer.cpp +++ b/cpp/src/io/parquet/arrow_schema_writer.cpp @@ -27,7 +27,6 @@ #include "ipc/Schema_generated.h" #include "writer_impl_helpers.hpp" -#include #include #include diff --git a/cpp/src/io/parquet/arrow_schema_writer.hpp 
b/cpp/src/io/parquet/arrow_schema_writer.hpp index 9bc435bf6c8..66810ee163a 100644 --- a/cpp/src/io/parquet/arrow_schema_writer.hpp +++ b/cpp/src/io/parquet/arrow_schema_writer.hpp @@ -22,10 +22,9 @@ #pragma once #include -#include -#include +#include +#include #include -#include namespace cudf::io::parquet::detail { diff --git a/cpp/src/io/parquet/chunk_dict.cu b/cpp/src/io/parquet/chunk_dict.cu index 17ccb73c0a8..b85ebf2fa1a 100644 --- a/cpp/src/io/parquet/chunk_dict.cu +++ b/cpp/src/io/parquet/chunk_dict.cu @@ -84,7 +84,7 @@ struct map_insert_fn { storage_ref}; // Create a map ref with `cuco::insert` operator - auto map_insert_ref = hash_map_ref.with_operators(cuco::insert); + auto map_insert_ref = hash_map_ref.rebind_operators(cuco::insert); auto const t = threadIdx.x; // Create atomic refs to the current chunk's num_dict_entries and uniq_data_size @@ -186,7 +186,7 @@ struct map_find_fn { storage_ref}; // Create a map ref with `cuco::find` operator - auto const map_find_ref = hash_map_ref.with_operators(cuco::find); + auto const map_find_ref = hash_map_ref.rebind_operators(cuco::find); auto const t = threadIdx.x; // Note: Adjust the following loop to use `cg::tiles` if needed in the future. @@ -194,17 +194,12 @@ struct map_find_fn { val_idx += block_size) { // Find the key using a single thread for best performance for now. if (data_col.is_valid(val_idx)) { + auto const found_slot = map_find_ref.find(val_idx); + // Fail if we didn't find the previously inserted key. + cudf_assert(found_slot != map_find_ref.end() && + "Unable to find value in map in dictionary index construction"); // No need for atomic as this is not going to be modified by any other thread. - chunk->dict_index[val_idx - s_ck_start_val_idx] = [&]() { - auto const found_slot = map_find_ref.find(val_idx); - - // Fail if we didn't find the previously inserted key. - cudf_assert(found_slot != map_find_ref.end() && - "Unable to find value in map in dictionary index construction"); - - // Return the found value. 
- return found_slot->second; - }(); + chunk->dict_index[val_idx - s_ck_start_val_idx] = found_slot->second; } } } else { diff --git a/cpp/src/io/parquet/compact_protocol_reader.cpp b/cpp/src/io/parquet/compact_protocol_reader.cpp index b978799b8bc..d276e946a51 100644 --- a/cpp/src/io/parquet/compact_protocol_reader.cpp +++ b/cpp/src/io/parquet/compact_protocol_reader.cpp @@ -228,7 +228,8 @@ class parquet_field_string : public parquet_field { * @return True if field types mismatch or if the process of reading a * string fails */ -struct parquet_field_string_list : public parquet_field_list { +class parquet_field_string_list : public parquet_field_list { + public: parquet_field_string_list(int f, std::vector& v) : parquet_field_list(f, v) { auto const read_value = [&val = v](uint32_t i, CompactProtocolReader* cpr) { @@ -308,10 +309,10 @@ class parquet_field_struct : public parquet_field { template class parquet_field_union_struct : public parquet_field { E& enum_val; - cuda::std::optional& val; // union structs are always wrapped in std::optional + std::optional& val; // union structs are always wrapped in std::optional public: - parquet_field_union_struct(int f, E& ev, cuda::std::optional& v) + parquet_field_union_struct(int f, E& ev, std::optional& v) : parquet_field(f), enum_val(ev), val(v) { } @@ -396,8 +397,9 @@ class parquet_field_binary : public parquet_field { * @return True if field types mismatch or if the process of reading a * binary fails */ -struct parquet_field_binary_list +class parquet_field_binary_list : public parquet_field_list, FieldType::BINARY> { + public: parquet_field_binary_list(int f, std::vector>& v) : parquet_field_list(f, v) { auto const read_value = [&val = v](uint32_t i, CompactProtocolReader* cpr) { @@ -437,10 +439,10 @@ class parquet_field_struct_blob : public parquet_field { */ template class parquet_field_optional : public parquet_field { - cuda::std::optional& val; + std::optional& val; public: - parquet_field_optional(int f, cuda::std::optional& v) : parquet_field(f), val(v) {} + parquet_field_optional(int f, std::optional& v) : parquet_field(f), val(v) {} inline void operator()(CompactProtocolReader* cpr, int field_type) { diff --git a/cpp/src/io/parquet/compact_protocol_reader.hpp b/cpp/src/io/parquet/compact_protocol_reader.hpp index 12c24e2b848..b87f2e9c692 100644 --- a/cpp/src/io/parquet/compact_protocol_reader.hpp +++ b/cpp/src/io/parquet/compact_protocol_reader.hpp @@ -22,10 +22,7 @@ #include #include -#include -#include #include -#include namespace CUDF_EXPORT cudf { namespace io::parquet::detail { diff --git a/cpp/src/io/parquet/compact_protocol_writer.hpp b/cpp/src/io/parquet/compact_protocol_writer.hpp index d4778b1ea15..05859d60c03 100644 --- a/cpp/src/io/parquet/compact_protocol_writer.hpp +++ b/cpp/src/io/parquet/compact_protocol_writer.hpp @@ -17,7 +17,6 @@ #pragma once #include "parquet.hpp" -#include "parquet_common.hpp" #include #include diff --git a/cpp/src/io/parquet/decode_fixed.cu b/cpp/src/io/parquet/decode_fixed.cu index 8a866141c4b..9acbe026bb2 100644 --- a/cpp/src/io/parquet/decode_fixed.cu +++ b/cpp/src/io/parquet/decode_fixed.cu @@ -24,36 +24,115 @@ namespace cudf::io::parquet::detail { namespace { -template -__device__ inline void gpuDecodeFixedWidthValues( +// Unlike cub's algorithm, this provides warp-wide and block-wide results simultaneously. +// Also, this provides the ability to compute warp_bits & lane_mask manually, which we need for +// lists. 
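+// E.g. for thread 37 (warp 1, lane 5): thread_count_within_warp counts the set
+// bits among lanes 0..4 of warp 1's ballot, and thread_count_within_block adds
+// warp 0's total on top, giving an exclusive prefix sum across the block.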
+struct block_scan_results { + uint32_t warp_bits; + int thread_count_within_warp; + int warp_count; + + int thread_count_within_block; + int block_count; +}; + +template +using block_scan_temp_storage = int[decode_block_size / cudf::detail::warp_size]; + +// Similar to CUB, must __syncthreads() after calling if reusing temp_storage +template +__device__ inline static void scan_block_exclusive_sum( + int thread_bit, + block_scan_results& results, + block_scan_temp_storage& temp_storage) +{ + int const t = threadIdx.x; + int const warp_index = t / cudf::detail::warp_size; + int const warp_lane = t % cudf::detail::warp_size; + uint32_t const lane_mask = (uint32_t(1) << warp_lane) - 1; + + uint32_t warp_bits = ballot(thread_bit); + scan_block_exclusive_sum( + warp_bits, warp_lane, warp_index, lane_mask, results, temp_storage); +} + +// Similar to CUB, must __syncthreads() after calling if reusing temp_storage +template +__device__ static void scan_block_exclusive_sum( + uint32_t warp_bits, + int warp_lane, + int warp_index, + uint32_t lane_mask, + block_scan_results& results, + block_scan_temp_storage& temp_storage) +{ + // Compute # warps + constexpr int num_warps = decode_block_size / cudf::detail::warp_size; + + // Compute the warp-wide results + results.warp_bits = warp_bits; + results.warp_count = __popc(results.warp_bits); + results.thread_count_within_warp = __popc(results.warp_bits & lane_mask); + + // Share the warp counts amongst the block threads + if (warp_lane == 0) { temp_storage[warp_index] = results.warp_count; } + __syncthreads(); // Sync to share counts between threads/warps + + // Compute block-wide results + results.block_count = 0; + results.thread_count_within_block = results.thread_count_within_warp; + for (int warp_idx = 0; warp_idx < num_warps; ++warp_idx) { + results.block_count += temp_storage[warp_idx]; + if (warp_idx < warp_index) { results.thread_count_within_block += temp_storage[warp_idx]; } + } +} + +template +__device__ void gpuDecodeFixedWidthValues( page_state_s* s, state_buf* const sb, int start, int end, int t) { constexpr int num_warps = block_size / cudf::detail::warp_size; constexpr int max_batch_size = num_warps * cudf::detail::warp_size; - PageNestingDecodeInfo* nesting_info_base = s->nesting_info; - int const dtype = s->col.physical_type; + // nesting level that is storing actual leaf values + int const leaf_level_index = s->col.max_nesting_depth - 1; + auto const data_out = s->nesting_info[leaf_level_index].data_out; + + int const dtype = s->col.physical_type; + uint32_t const dtype_len = s->dtype_len; + + int const skipped_leaf_values = s->page.skipped_leaf_values; // decode values int pos = start; while (pos < end) { int const batch_size = min(max_batch_size, end - pos); - int const target_pos = pos + batch_size; - int const src_pos = pos + t; + int const thread_pos = pos + t; - // the position in the output column/buffer - int dst_pos = sb->nz_idx[rolling_index(src_pos)] - s->first_row; + // Index from value buffer (doesn't include nulls) to final array (has gaps for nulls) + int const dst_pos = [&]() { + int dst_pos = sb->nz_idx[rolling_index(thread_pos)]; + if constexpr (!has_lists_t) { dst_pos -= s->first_row; } + return dst_pos; + }(); // target_pos will always be properly bounded by num_rows, but dst_pos may be negative (values // before first_row) in the flat hierarchy case. 
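+    // (For lists, nz_idx is filled by the list row-index pass with positions
+    //  that already respect the row bounds, hence no first_row shift above.)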
- if (src_pos < target_pos && dst_pos >= 0) { + if (thread_pos < target_pos && dst_pos >= 0) { // nesting level that is storing actual leaf values - int const leaf_level_index = s->col.max_nesting_depth - 1; - uint32_t dtype_len = s->dtype_len; - void* dst = - nesting_info_base[leaf_level_index].data_out + static_cast(dst_pos) * dtype_len; + // src_pos represents the logical row position we want to read from. But in the case of + // nested hierarchies (lists), there is no 1:1 mapping of rows to values. So src_pos + // has to take into account the # of values we have to skip in the page to get to the + // desired logical row. For flat hierarchies, skipped_leaf_values will always be 0. + int const src_pos = [&]() { + if constexpr (has_lists_t) { return thread_pos + skipped_leaf_values; } + return thread_pos; + }(); + + void* const dst = data_out + (static_cast(dst_pos) * dtype_len); + if (s->col.logical_type.has_value() && s->col.logical_type->type == LogicalType::DECIMAL) { switch (dtype) { case INT32: gpuOutputFast(s, sb, src_pos, static_cast(dst)); break; @@ -68,6 +147,8 @@ __device__ inline void gpuDecodeFixedWidthValues( } break; } + } else if (dtype == BOOLEAN) { + gpuOutputBoolean(sb, src_pos, static_cast(dst)); } else if (dtype == INT96) { gpuOutputInt96Timestamp(s, sb, src_pos, static_cast(dst)); } else if (dtype_len == 8) { @@ -92,15 +173,15 @@ __device__ inline void gpuDecodeFixedWidthValues( } } -template +template struct decode_fixed_width_values_func { __device__ inline void operator()(page_state_s* s, state_buf* const sb, int start, int end, int t) { - gpuDecodeFixedWidthValues(s, sb, start, end, t); + gpuDecodeFixedWidthValues(s, sb, start, end, t); } }; -template +template __device__ inline void gpuDecodeFixedWidthSplitValues( page_state_s* s, state_buf* const sb, int start, int end, int t) { @@ -108,10 +189,15 @@ __device__ inline void gpuDecodeFixedWidthSplitValues( constexpr int num_warps = block_size / warp_size; constexpr int max_batch_size = num_warps * warp_size; - PageNestingDecodeInfo* nesting_info_base = s->nesting_info; - int const dtype = s->col.physical_type; - auto const data_len = thrust::distance(s->data_start, s->data_end); - auto const num_values = data_len / s->dtype_len_in; + // nesting level that is storing actual leaf values + int const leaf_level_index = s->col.max_nesting_depth - 1; + auto const data_out = s->nesting_info[leaf_level_index].data_out; + + int const dtype = s->col.physical_type; + auto const data_len = thrust::distance(s->data_start, s->data_end); + auto const num_values = data_len / s->dtype_len_in; + + int const skipped_leaf_values = s->page.skipped_leaf_values; // decode values int pos = start; @@ -119,21 +205,34 @@ __device__ inline void gpuDecodeFixedWidthSplitValues( int const batch_size = min(max_batch_size, end - pos); int const target_pos = pos + batch_size; - int const src_pos = pos + t; + int const thread_pos = pos + t; // the position in the output column/buffer - int dst_pos = sb->nz_idx[rolling_index(src_pos)] - s->first_row; + // Index from value buffer (doesn't include nulls) to final array (has gaps for nulls) + int const dst_pos = [&]() { + int dst_pos = sb->nz_idx[rolling_index(thread_pos)]; + if constexpr (!has_lists_t) { dst_pos -= s->first_row; } + return dst_pos; + }(); // target_pos will always be properly bounded by num_rows, but dst_pos may be negative (values // before first_row) in the flat hierarchy case. 
- if (src_pos < target_pos && dst_pos >= 0) { - // nesting level that is storing actual leaf values - int const leaf_level_index = s->col.max_nesting_depth - 1; + if (thread_pos < target_pos && dst_pos >= 0) { + // src_pos represents the logical row position we want to read from. But in the case of + // nested hierarchies (lists), there is no 1:1 mapping of rows to values. So src_pos + // has to take into account the # of values we have to skip in the page to get to the + // desired logical row. For flat hierarchies, skipped_leaf_values will always be 0. + int const src_pos = [&]() { + if constexpr (has_lists_t) { + return thread_pos + skipped_leaf_values; + } else { + return thread_pos; + } + }(); - uint32_t dtype_len = s->dtype_len; - uint8_t const* src = s->data_start + src_pos; - uint8_t* dst = - nesting_info_base[leaf_level_index].data_out + static_cast(dst_pos) * dtype_len; + uint32_t const dtype_len = s->dtype_len; + uint8_t const* const src = s->data_start + src_pos; + uint8_t* const dst = data_out + static_cast(dst_pos) * dtype_len; auto const is_decimal = s->col.logical_type.has_value() and s->col.logical_type->type == LogicalType::DECIMAL; @@ -186,15 +285,15 @@ __device__ inline void gpuDecodeFixedWidthSplitValues( } } -template +template struct decode_fixed_width_split_values_func { __device__ inline void operator()(page_state_s* s, state_buf* const sb, int start, int end, int t) { - gpuDecodeFixedWidthSplitValues(s, sb, start, end, t); + gpuDecodeFixedWidthSplitValues(s, sb, start, end, t); } }; -template +template static __device__ int gpuUpdateValidityAndRowIndicesNested( int32_t target_value_count, page_state_s* s, state_buf* sb, level_t const* const def, int t) { @@ -211,29 +310,30 @@ static __device__ int gpuUpdateValidityAndRowIndicesNested( int const row_index_lower_bound = s->row_index_lower_bound; - int const max_depth = s->col.max_nesting_depth - 1; + int const max_depth = s->col.max_nesting_depth - 1; + auto& max_depth_ni = s->nesting_info[max_depth]; + int max_depth_valid_count = max_depth_ni.valid_count; + __syncthreads(); while (value_count < capped_target_value_count) { int const batch_size = min(max_batch_size, capped_target_value_count - value_count); - // definition level. only need to process for nullable columns - int d = 0; - if constexpr (nullable) { - if (def) { - d = t < batch_size - ? static_cast(def[rolling_index(value_count + t)]) - : -1; - } else { - d = t < batch_size ? 1 : -1; + // definition level + int const d = [&]() { + if (t >= batch_size) { + return -1; + } else if (def) { + return static_cast(def[rolling_index(value_count + t)]); } - } + return 1; + }(); - int const thread_value_count = t + 1; + int const thread_value_count = t; int const block_value_count = batch_size; // compute our row index, whether we're in row bounds, and validity - int const row_index = (thread_value_count + value_count) - 1; + int const row_index = thread_value_count + value_count; int const in_row_bounds = (row_index >= row_index_lower_bound) && (row_index < last_row); int const in_write_row_bounds = ballot(row_index >= first_row && row_index < last_row); int const write_start = __ffs(in_write_row_bounds) - 1; // first bit in the warp to store @@ -242,90 +342,76 @@ static __device__ int gpuUpdateValidityAndRowIndicesNested( for (int d_idx = 0; d_idx <= max_depth; d_idx++) { auto& ni = s->nesting_info[d_idx]; - int is_valid; - if constexpr (nullable) { - is_valid = ((d >= ni.max_def_level) && in_row_bounds) ? 
1 : 0; - } else { - is_valid = in_row_bounds; - } + int const is_valid = ((d >= ni.max_def_level) && in_row_bounds) ? 1 : 0; // thread and block validity count + using block_scan = cub::BlockScan; + __shared__ typename block_scan::TempStorage scan_storage; int thread_valid_count, block_valid_count; - if constexpr (nullable) { - using block_scan = cub::BlockScan; - __shared__ typename block_scan::TempStorage scan_storage; - block_scan(scan_storage).InclusiveSum(is_valid, thread_valid_count, block_valid_count); - __syncthreads(); + block_scan(scan_storage).ExclusiveSum(is_valid, thread_valid_count, block_valid_count); - // validity is processed per-warp - // - // nested schemas always read and write to the same bounds (that is, read and write - // positions are already pre-bounded by first_row/num_rows). flat schemas will start reading - // at the first value, even if that is before first_row, because we cannot trivially jump to - // the correct position to start reading. since we are about to write the validity vector - // here we need to adjust our computed mask to take into account the write row bounds. - int warp_null_count = 0; - if (write_start >= 0 && ni.valid_map != nullptr) { - int const valid_map_offset = ni.valid_map_offset; - uint32_t const warp_validity_mask = ballot(is_valid); - // lane 0 from each warp writes out validity - if ((t % cudf::detail::warp_size) == 0) { - int const vindex = - (value_count + thread_value_count) - 1; // absolute input value index - int const bit_offset = (valid_map_offset + vindex + write_start) - - first_row; // absolute bit offset into the output validity map - int const write_end = cudf::detail::warp_size - - __clz(in_write_row_bounds); // last bit in the warp to store - int const bit_count = write_end - write_start; - warp_null_count = bit_count - __popc(warp_validity_mask >> write_start); - - store_validity(bit_offset, ni.valid_map, warp_validity_mask >> write_start, bit_count); - } - } + // validity is processed per-warp + // + // nested schemas always read and write to the same bounds (that is, read and write + // positions are already pre-bounded by first_row/num_rows). flat schemas will start reading + // at the first value, even if that is before first_row, because we cannot trivially jump to + // the correct position to start reading. since we are about to write the validity vector + // here we need to adjust our computed mask to take into account the write row bounds. + int warp_null_count = 0; + if (ni.valid_map != nullptr) { + uint32_t const warp_validity_mask = ballot(is_valid); + // lane 0 from each warp writes out validity + if ((write_start >= 0) && ((t % cudf::detail::warp_size) == 0)) { + int const valid_map_offset = ni.valid_map_offset; + int const vindex = value_count + thread_value_count; // absolute input value index + int const bit_offset = (valid_map_offset + vindex + write_start) - + first_row; // absolute bit offset into the output validity map + int const write_end = + cudf::detail::warp_size - __clz(in_write_row_bounds); // last bit in the warp to store + int const bit_count = write_end - write_start; + warp_null_count = bit_count - __popc(warp_validity_mask >> write_start); - // sum null counts. we have to do it this way instead of just incrementing by (value_count - - // valid_count) because valid_count also includes rows that potentially start before our row - // bounds. if we could come up with a way to clean that up, we could remove this and just - // compute it directly at the end of the kernel. 
- size_type const block_null_count = - cudf::detail::single_lane_block_sum_reduce(warp_null_count); - if (t == 0) { ni.null_count += block_null_count; } - } - // trivial for non-nullable columns - else { - thread_valid_count = thread_value_count; - block_valid_count = block_value_count; + store_validity(bit_offset, ni.valid_map, warp_validity_mask >> write_start, bit_count); + } } + // sum null counts. we have to do it this way instead of just incrementing by (value_count - + // valid_count) because valid_count also includes rows that potentially start before our row + // bounds. if we could come up with a way to clean that up, we could remove this and just + // compute it directly at the end of the kernel. + size_type const block_null_count = + cudf::detail::single_lane_block_sum_reduce(warp_null_count); + if (t == 0) { ni.null_count += block_null_count; } + // if this is valid and we're at the leaf, output dst_pos - __syncthreads(); // handle modification of ni.value_count from below - if (is_valid && d_idx == max_depth) { - // for non-list types, the value count is always the same across - int const dst_pos = (value_count + thread_value_count) - 1; - int const src_pos = (ni.valid_count + thread_valid_count) - 1; - sb->nz_idx[rolling_index(src_pos)] = dst_pos; + if (d_idx == max_depth) { + if (is_valid) { + int const dst_pos = value_count + thread_value_count; + int const src_pos = max_depth_valid_count + thread_valid_count; + + sb->nz_idx[rolling_index(src_pos)] = dst_pos; + } + // update stuff + max_depth_valid_count += block_valid_count; } - __syncthreads(); // handle modification of ni.value_count from below - // update stuff - if (t == 0) { ni.valid_count += block_valid_count; } - } + } // end depth loop value_count += block_value_count; - } + } // end loop if (t == 0) { // update valid value count for decoding and total # of values we've processed - s->nz_count = s->nesting_info[max_depth].valid_count; - s->input_value_count = value_count; - s->input_row_count = value_count; + max_depth_ni.valid_count = max_depth_valid_count; + s->nz_count = max_depth_valid_count; + s->input_value_count = value_count; + s->input_row_count = value_count; } - __syncthreads(); - return s->nesting_info[max_depth].valid_count; + return max_depth_valid_count; } -template +template static __device__ int gpuUpdateValidityAndRowIndicesFlat( int32_t target_value_count, page_state_s* s, state_buf* sb, level_t const* const def, int t) { @@ -351,95 +437,144 @@ static __device__ int gpuUpdateValidityAndRowIndicesFlat( while (value_count < capped_target_value_count) { int const batch_size = min(max_batch_size, capped_target_value_count - value_count); - // definition level. only need to process for nullable columns - int d = 0; - if constexpr (nullable) { - if (def) { - d = t < batch_size - ? static_cast(def[rolling_index(value_count + t)]) - : -1; - } else { - d = t < batch_size ? 1 : -1; - } - } - - int const thread_value_count = t + 1; + int const thread_value_count = t; int const block_value_count = batch_size; // compute our row index, whether we're in row bounds, and validity - int const row_index = (thread_value_count + value_count) - 1; + int const row_index = thread_value_count + value_count; int const in_row_bounds = (row_index >= row_index_lower_bound) && (row_index < last_row); - int is_valid; - if constexpr (nullable) { - is_valid = ((d > 0) && in_row_bounds) ? 
1 : 0; - } else { - is_valid = in_row_bounds; - } + + // use definition level & row bounds to determine if is valid + int const is_valid = [&]() { + if (t >= batch_size) { + return 0; + } else if (def) { + int const def_level = + static_cast(def[rolling_index(value_count + t)]); + return ((def_level > 0) && in_row_bounds) ? 1 : 0; + } + return in_row_bounds; + }(); // thread and block validity count + using block_scan = cub::BlockScan; + __shared__ typename block_scan::TempStorage scan_storage; int thread_valid_count, block_valid_count; - if constexpr (nullable) { - using block_scan = cub::BlockScan; - __shared__ typename block_scan::TempStorage scan_storage; - block_scan(scan_storage).InclusiveSum(is_valid, thread_valid_count, block_valid_count); - __syncthreads(); + block_scan(scan_storage).ExclusiveSum(is_valid, thread_valid_count, block_valid_count); + uint32_t const warp_validity_mask = ballot(is_valid); + + // validity is processed per-warp + // + // nested schemas always read and write to the same bounds (that is, read and write + // positions are already pre-bounded by first_row/num_rows). flat schemas will start reading + // at the first value, even if that is before first_row, because we cannot trivially jump to + // the correct position to start reading. since we are about to write the validity vector + // here we need to adjust our computed mask to take into account the write row bounds. + int const in_write_row_bounds = ballot(row_index >= first_row && row_index < last_row); + int const write_start = __ffs(in_write_row_bounds) - 1; // first bit in the warp to store + int warp_null_count = 0; + // lane 0 from each warp writes out validity + if ((write_start >= 0) && ((t % cudf::detail::warp_size) == 0)) { + int const vindex = value_count + thread_value_count; // absolute input value index + int const bit_offset = (valid_map_offset + vindex + write_start) - + first_row; // absolute bit offset into the output validity map + int const write_end = + cudf::detail::warp_size - __clz(in_write_row_bounds); // last bit in the warp to store + int const bit_count = write_end - write_start; + warp_null_count = bit_count - __popc(warp_validity_mask >> write_start); + + store_validity(bit_offset, ni.valid_map, warp_validity_mask >> write_start, bit_count); + } - // validity is processed per-warp - // - // nested schemas always read and write to the same bounds (that is, read and write - // positions are already pre-bounded by first_row/num_rows). flat schemas will start reading - // at the first value, even if that is before first_row, because we cannot trivially jump to - // the correct position to start reading. since we are about to write the validity vector - // here we need to adjust our computed mask to take into account the write row bounds. 
- int const in_write_row_bounds = ballot(row_index >= first_row && row_index < last_row); - int const write_start = __ffs(in_write_row_bounds) - 1; // first bit in the warp to store - int warp_null_count = 0; - if (write_start >= 0) { - uint32_t const warp_validity_mask = ballot(is_valid); - // lane 0 from each warp writes out validity - if ((t % cudf::detail::warp_size) == 0) { - int const vindex = (value_count + thread_value_count) - 1; // absolute input value index - int const bit_offset = (valid_map_offset + vindex + write_start) - - first_row; // absolute bit offset into the output validity map - int const write_end = - cudf::detail::warp_size - __clz(in_write_row_bounds); // last bit in the warp to store - int const bit_count = write_end - write_start; - warp_null_count = bit_count - __popc(warp_validity_mask >> write_start); + // sum null counts. we have to do it this way instead of just incrementing by (value_count - + // valid_count) because valid_count also includes rows that potentially start before our row + // bounds. if we could come up with a way to clean that up, we could remove this and just + // compute it directly at the end of the kernel. + size_type const block_null_count = + cudf::detail::single_lane_block_sum_reduce(warp_null_count); + if (t == 0) { ni.null_count += block_null_count; } - store_validity(bit_offset, ni.valid_map, warp_validity_mask >> write_start, bit_count); - } - } + // output offset + if (is_valid) { + int const dst_pos = value_count + thread_value_count; + int const src_pos = valid_count + thread_valid_count; - // sum null counts. we have to do it this way instead of just incrementing by (value_count - - // valid_count) because valid_count also includes rows that potentially start before our row - // bounds. if we could come up with a way to clean that up, we could remove this and just - // compute it directly at the end of the kernel. - size_type const block_null_count = - cudf::detail::single_lane_block_sum_reduce(warp_null_count); - if (t == 0) { ni.null_count += block_null_count; } - } - // trivial for non-nullable columns - else { - thread_valid_count = thread_value_count; - block_valid_count = block_value_count; + sb->nz_idx[rolling_index(src_pos)] = dst_pos; } - // output offset + // update stuff + value_count += block_value_count; + valid_count += block_valid_count; + } + + if (t == 0) { + // update valid value count for decoding and total # of values we've processed + ni.valid_count = valid_count; + ni.value_count = value_count; + s->nz_count = valid_count; + s->input_value_count = value_count; + s->input_row_count = value_count; + } + + return valid_count; +} + +template +static __device__ int gpuUpdateValidityAndRowIndicesNonNullable(int32_t target_value_count, + page_state_s* s, + state_buf* sb, + int t) +{ + constexpr int num_warps = decode_block_size / cudf::detail::warp_size; + constexpr int max_batch_size = num_warps * cudf::detail::warp_size; + + // cap by last row so that we don't process any rows past what we want to output. 
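+  // (With no nulls and no lists there is nothing to scan: every in-bounds
+  //  value is valid, each thread's valid index equals its value index, and the
+  //  loop below only assembles nz_idx.)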
+ int const first_row = s->first_row; + int const last_row = first_row + s->num_rows; + int const capped_target_value_count = min(target_value_count, last_row); + int const row_index_lower_bound = s->row_index_lower_bound; + + // how many (input) values we've processed in the page so far + int value_count = s->input_value_count; + + int const max_depth = s->col.max_nesting_depth - 1; + auto& ni = s->nesting_info[max_depth]; + int valid_count = ni.valid_count; + + __syncthreads(); + + while (value_count < capped_target_value_count) { + int const batch_size = min(max_batch_size, capped_target_value_count - value_count); + + int const thread_value_count = t; + int const block_value_count = batch_size; + + // compute our row index, whether we're in row bounds, and validity + int const row_index = thread_value_count + value_count; + int const in_row_bounds = (row_index >= row_index_lower_bound) && (row_index < last_row); + + int const is_valid = in_row_bounds; + int const thread_valid_count = thread_value_count; + int const block_valid_count = block_value_count; + + // if this is valid and we're at the leaf, output dst_pos if (is_valid) { - int const dst_pos = (value_count + thread_value_count) - 1; - int const src_pos = (valid_count + thread_valid_count) - 1; + // for non-list types, the value count is always the same across + int const dst_pos = value_count + thread_value_count; + int const src_pos = valid_count + thread_valid_count; + sb->nz_idx[rolling_index(src_pos)] = dst_pos; } // update stuff value_count += block_value_count; valid_count += block_valid_count; - } + } // end loop if (t == 0) { // update valid value count for decoding and total # of values we've processed ni.valid_count = valid_count; - ni.value_count = value_count; // TODO: remove? this is unused in the non-list path + ni.value_count = value_count; s->nz_count = valid_count; s->input_value_count = value_count; s->input_row_count = value_count; @@ -448,6 +583,239 @@ static __device__ int gpuUpdateValidityAndRowIndicesFlat( return valid_count; } +template +static __device__ int gpuUpdateValidityAndRowIndicesLists(int32_t target_value_count, + page_state_s* s, + state_buf* sb, + level_t const* const def, + level_t const* const rep, + int t) +{ + constexpr int num_warps = decode_block_size / cudf::detail::warp_size; + constexpr int max_batch_size = num_warps * cudf::detail::warp_size; + + // how many (input) values we've processed in the page so far, prior to this loop iteration + int value_count = s->input_value_count; + + // how many rows we've processed in the page so far + int input_row_count = s->input_row_count; + + // cap by last row so that we don't process any rows past what we want to output. 
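+  // (Rows and leaf values diverge for lists: one row can span many values and
+  //  a batch can end mid-row, so row progress is derived from the repetition
+  //  levels below instead of from the value count.)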
+ int const first_row = s->first_row; + int const last_row = first_row + s->num_rows; + + int const row_index_lower_bound = s->row_index_lower_bound; + int const max_depth = s->col.max_nesting_depth - 1; + int max_depth_valid_count = s->nesting_info[max_depth].valid_count; + + int const warp_index = t / cudf::detail::warp_size; + int const warp_lane = t % cudf::detail::warp_size; + bool const is_first_lane = (warp_lane == 0); + + __syncthreads(); + __shared__ block_scan_temp_storage temp_storage; + + while (value_count < target_value_count) { + bool const within_batch = value_count + t < target_value_count; + + // get definition level, use repetition level to get start/end depth + // different for each thread, as each thread has a different r/d + auto const [def_level, start_depth, end_depth] = [&]() { + if (!within_batch) { return cuda::std::make_tuple(-1, -1, -1); } + + int const level_index = rolling_index(value_count + t); + int const rep_level = static_cast(rep[level_index]); + int const start_depth = s->nesting_info[rep_level].start_depth; + + if constexpr (!nullable) { + return cuda::std::make_tuple(-1, start_depth, max_depth); + } else { + if (def != nullptr) { + int const def_level = static_cast(def[level_index]); + return cuda::std::make_tuple( + def_level, start_depth, s->nesting_info[def_level].end_depth); + } else { + return cuda::std::make_tuple(1, start_depth, max_depth); + } + } + }(); + + // Determine value count & row index + // track (page-relative) row index for the thread so we can compare against input bounds + // keep track of overall # of rows we've read. + int const is_new_row = start_depth == 0 ? 1 : 0; + int num_prior_new_rows, total_num_new_rows; + { + block_scan_results new_row_scan_results; + scan_block_exclusive_sum(is_new_row, new_row_scan_results, temp_storage); + __syncthreads(); + num_prior_new_rows = new_row_scan_results.thread_count_within_block; + total_num_new_rows = new_row_scan_results.block_count; + } + + int const row_index = input_row_count + ((num_prior_new_rows + is_new_row) - 1); + input_row_count += total_num_new_rows; + int const in_row_bounds = (row_index >= row_index_lower_bound) && (row_index < last_row); + + // VALUE COUNT: + // in_nesting_bounds: if at a nesting level where we need to add value indices + // the bounds: from current rep to the rep AT the def depth + int in_nesting_bounds = ((0 >= start_depth && 0 <= end_depth) && in_row_bounds) ? 1 : 0; + int thread_value_count_within_warp, warp_value_count, thread_value_count, block_value_count; + { + block_scan_results value_count_scan_results; + scan_block_exclusive_sum( + in_nesting_bounds, value_count_scan_results, temp_storage); + __syncthreads(); + + thread_value_count_within_warp = value_count_scan_results.thread_count_within_warp; + warp_value_count = value_count_scan_results.warp_count; + thread_value_count = value_count_scan_results.thread_count_within_block; + block_value_count = value_count_scan_results.block_count; + } + + // iterate by depth + for (int d_idx = 0; d_idx <= max_depth; d_idx++) { + auto& ni = s->nesting_info[d_idx]; + + // everything up to the max_def_level is a non-null value + int const is_valid = [&](int input_def_level) { + if constexpr (nullable) { + return ((input_def_level >= ni.max_def_level) && in_nesting_bounds) ? 1 : 0; + } else { + return in_nesting_bounds; + } + }(def_level); + + // VALID COUNT: + // Not all values visited by this block will represent a value at this nesting level. 
+        // the validity bit for thread t might actually represent output value t-6.
+        // the correct position for thread t's bit is thread_value_count.
+        uint32_t const warp_valid_mask =
+          WarpReduceOr32((uint32_t)is_valid << thread_value_count_within_warp);
+        int thread_valid_count, block_valid_count;
+        {
+          auto thread_mask = (uint32_t(1) << thread_value_count_within_warp) - 1;
+
+          block_scan_results valid_count_scan_results;
+          scan_block_exclusive_sum(warp_valid_mask,
+                                   warp_lane,
+                                   warp_index,
+                                   thread_mask,
+                                   valid_count_scan_results,
+                                   temp_storage);
+          __syncthreads();
+          thread_valid_count = valid_count_scan_results.thread_count_within_block;
+          block_valid_count  = valid_count_scan_results.block_count;
+        }
+
+        // compute warp and thread value counts for the -next- nesting level. we need to
+        // do this for lists so that we can emit an offset for the -current- nesting level.
+        // the offset for the current nesting level == current length of the next nesting level
+        int next_thread_value_count_within_warp = 0, next_warp_value_count = 0;
+        int next_thread_value_count = 0, next_block_value_count = 0;
+        int next_in_nesting_bounds = 0;
+        if (d_idx < max_depth) {
+          // NEXT DEPTH VALUE COUNT:
+          next_in_nesting_bounds =
+            ((d_idx + 1 >= start_depth) && (d_idx + 1 <= end_depth) && in_row_bounds) ? 1 : 0;
+          {
+            block_scan_results next_value_count_scan_results;
+            scan_block_exclusive_sum(
+              next_in_nesting_bounds, next_value_count_scan_results, temp_storage);
+            __syncthreads();
+
+            next_thread_value_count_within_warp =
+              next_value_count_scan_results.thread_count_within_warp;
+            next_warp_value_count   = next_value_count_scan_results.warp_count;
+            next_thread_value_count = next_value_count_scan_results.thread_count_within_block;
+            next_block_value_count  = next_value_count_scan_results.block_count;
+          }
+
+          // STORE OFFSET TO THE LIST LOCATION
+          // if we're -not- at a leaf column and we're within nesting/row bounds
+          // and we have a valid data_out pointer, it implies this is a list column, so
+          // emit an offset.
+          if (in_nesting_bounds && ni.data_out != nullptr) {
+            const auto& next_ni = s->nesting_info[d_idx + 1];
+            int const idx       = ni.value_count + thread_value_count;
+            cudf::size_type const ofs =
+              next_ni.value_count + next_thread_value_count + next_ni.page_start_value;
+
+            (reinterpret_cast<cudf::size_type*>(ni.data_out))[idx] = ofs;
+          }
+        }
+
+        // validity is processed per-warp (on lane 0 of each warp);
+        // this is because when atomic writes are needed, they are 32-bit operations
+        //
+        // lists always read and write to the same bounds
+        // (that is, read and write positions are already pre-bounded by first_row/num_rows).
+        // since we are about to write the validity vector,
+        // here we need to adjust our computed mask to take into account the write row bounds.
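+        // Worked example (hypothetical values, for illustration only): if
+        // warp_value_count = 3 and warp_valid_mask = 0b101, lane 0 stores 3 bits
+        // at bit_offset, marking values 0 and 2 valid; this warp's share of the
+        // nulls is 3 - __popc(0b101) = 1.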
+ if constexpr (nullable) { + if (is_first_lane && (ni.valid_map != nullptr) && (warp_value_count > 0)) { + // absolute bit offset into the output validity map + // is cumulative sum of warp_value_count at the given nesting depth + // DON'T subtract by first_row: since it's lists it's not 1-row-per-value + int const bit_offset = ni.valid_map_offset + thread_value_count; + + store_validity(bit_offset, ni.valid_map, warp_valid_mask, warp_value_count); + } + + if (t == 0) { ni.null_count += block_value_count - block_valid_count; } + } + + // if this is valid and we're at the leaf, output dst_pos + // Read value_count before the sync, so that when thread 0 modifies it we've already read its + // value + int const current_value_count = ni.value_count; + __syncthreads(); // guard against modification of ni.value_count below + if (d_idx == max_depth) { + if (is_valid) { + int const dst_pos = current_value_count + thread_value_count; + int const src_pos = max_depth_valid_count + thread_valid_count; + int const output_index = rolling_index(src_pos); + + // Index from rolling buffer of values (which doesn't include nulls) to final array (which + // includes gaps for nulls) + sb->nz_idx[output_index] = dst_pos; + } + max_depth_valid_count += block_valid_count; + } + + // update stuff + if (t == 0) { + ni.value_count += block_value_count; + ni.valid_map_offset += block_value_count; + } + __syncthreads(); // sync modification of ni.value_count + + // propagate value counts for the next depth level + block_value_count = next_block_value_count; + thread_value_count = next_thread_value_count; + in_nesting_bounds = next_in_nesting_bounds; + warp_value_count = next_warp_value_count; + thread_value_count_within_warp = next_thread_value_count_within_warp; + } // END OF DEPTH LOOP + + int const batch_size = min(max_batch_size, target_value_count - value_count); + value_count += batch_size; + } + + if (t == 0) { + // update valid value count for decoding and total # of values we've processed + s->nesting_info[max_depth].valid_count = max_depth_valid_count; + s->nz_count = max_depth_valid_count; + s->input_value_count = value_count; + + // If we have lists # rows != # values + s->input_row_count = input_row_count; + } + + return max_depth_valid_count; +} + // is the page marked nullable or not __device__ inline bool is_nullable(page_state_s* s) { @@ -475,6 +843,51 @@ __device__ inline bool maybe_has_nulls(page_state_s* s) return run_val != s->col.max_level[lvl]; } +template +inline __device__ void bool_plain_decode(page_state_s* s, state_buf* sb, int t, int to_decode) +{ + int pos = s->dict_pos; + int const target_pos = pos + to_decode; + __syncthreads(); // Make sure all threads have read dict_pos before it changes at the end. + + while (pos < target_pos) { + int const batch_len = min(target_pos - pos, decode_block_size_t); + + if (t < batch_len) { + int const bit_pos = pos + t; + int const byte_offset = bit_pos >> 3; + int const bit_in_byte_index = bit_pos & 7; + + uint8_t const* const read_from = s->data_start + byte_offset; + bool const read_bit = (*read_from) & (1 << bit_in_byte_index); + + int const write_to_index = rolling_index(bit_pos); + sb->dict_idx[write_to_index] = read_bit; + } + + pos += batch_len; + } + + if (t == 0) { s->dict_pos = pos; } +} + +template +__device__ int skip_decode(stream_type& parquet_stream, int num_to_skip, int t) +{ + // it could be that (e.g.) 
we skip 5000 but starting at row 4000 we have a run of length 2000: + // in that case skip_decode() only skips 4000, and we have to process the remaining 1000 up front + // modulo 2 * block_size of course, since that's as many as we process at once + int num_skipped = parquet_stream.skip_decode(t, num_to_skip); + while (num_skipped < num_to_skip) { + // TODO: Instead of decoding, skip within the run to the appropriate location + auto const to_decode = min(rolling_buf_size, num_to_skip - num_skipped); + num_skipped += parquet_stream.decode_next(t, to_decode); + __syncthreads(); + } + + return num_skipped; +} + /** * @brief Kernel for computing fixed width non dictionary column data stored in the pages * @@ -489,26 +902,41 @@ __device__ inline bool maybe_has_nulls(page_state_s* s) * @param num_rows Maximum number of rows to read * @param error_code Error code to set if an error is encountered */ -template - typename DecodeValuesFunc> -CUDF_KERNEL void __launch_bounds__(decode_block_size_t) +template +CUDF_KERNEL void __launch_bounds__(decode_block_size_t, 8) gpuDecodePageDataGeneric(PageInfo* pages, device_span chunks, size_t min_row, size_t num_rows, kernel_error::pointer error_code) { + constexpr bool has_dict_t = (kernel_mask_t == decode_kernel_mask::FIXED_WIDTH_DICT) || + (kernel_mask_t == decode_kernel_mask::FIXED_WIDTH_DICT_NESTED) || + (kernel_mask_t == decode_kernel_mask::FIXED_WIDTH_DICT_LIST); + constexpr bool has_bools_t = (kernel_mask_t == decode_kernel_mask::BOOLEAN) || + (kernel_mask_t == decode_kernel_mask::BOOLEAN_NESTED) || + (kernel_mask_t == decode_kernel_mask::BOOLEAN_LIST); + constexpr bool has_nesting_t = + (kernel_mask_t == decode_kernel_mask::BOOLEAN_NESTED) || + (kernel_mask_t == decode_kernel_mask::FIXED_WIDTH_DICT_NESTED) || + (kernel_mask_t == decode_kernel_mask::FIXED_WIDTH_NO_DICT_NESTED) || + (kernel_mask_t == decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_NESTED); + constexpr bool has_lists_t = + (kernel_mask_t == decode_kernel_mask::BOOLEAN_LIST) || + (kernel_mask_t == decode_kernel_mask::FIXED_WIDTH_DICT_LIST) || + (kernel_mask_t == decode_kernel_mask::FIXED_WIDTH_NO_DICT_LIST) || + (kernel_mask_t == decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_LIST); + constexpr bool split_decode_t = + (kernel_mask_t == decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_FLAT) || + (kernel_mask_t == decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_NESTED) || + (kernel_mask_t == decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_LIST); + constexpr int rolling_buf_size = decode_block_size_t * 2; constexpr int rle_run_buffer_size = rle_stream_required_run_buffer_size(); __shared__ __align__(16) page_state_s state_g; using state_buf_t = page_state_buffers_s; __shared__ __align__(16) state_buf_t state_buffers; @@ -536,34 +964,31 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) // if we have no work to do (eg, in a skip_rows/num_rows case) in this page. if (s->num_rows == 0) { return; } - DecodeValuesFunc decode_values; + using value_decoder_type = std::conditional_t< + split_decode_t, + decode_fixed_width_split_values_func, + decode_fixed_width_values_func>; + value_decoder_type decode_values; - bool const nullable = is_nullable(s); - bool const should_process_nulls = nullable && maybe_has_nulls(s); + bool const should_process_nulls = is_nullable(s) && maybe_has_nulls(s); // shared buffer. all shared memory is suballocated out of here - // constexpr int shared_rep_size = has_lists_t ? 
cudf::util::round_up_unsafe(rle_run_buffer_size * - // sizeof(rle_run), size_t{16}) : 0; - constexpr int shared_dict_size = - has_dict_t - ? cudf::util::round_up_unsafe(rle_run_buffer_size * sizeof(rle_run), size_t{16}) - : 0; - constexpr int shared_def_size = - cudf::util::round_up_unsafe(rle_run_buffer_size * sizeof(rle_run), size_t{16}); - constexpr int shared_buf_size = /*shared_rep_size +*/ shared_dict_size + shared_def_size; + constexpr int rle_run_buffer_bytes = + cudf::util::round_up_unsafe(rle_run_buffer_size * sizeof(rle_run), size_t{16}); + constexpr int shared_buf_size = + rle_run_buffer_bytes * (static_cast(has_dict_t) + static_cast(has_bools_t) + + static_cast(has_lists_t) + 1); __shared__ __align__(16) uint8_t shared_buf[shared_buf_size]; // setup all shared memory buffers int shared_offset = 0; - /* - rle_run *rep_runs = reinterpret_cast*>(shared_buf + shared_offset); - if constexpr (has_lists_t){ - shared_offset += shared_rep_size; - } - */ - rle_run* dict_runs = reinterpret_cast*>(shared_buf + shared_offset); - if constexpr (has_dict_t) { shared_offset += shared_dict_size; } - rle_run* def_runs = reinterpret_cast*>(shared_buf + shared_offset); + auto rep_runs = reinterpret_cast(shared_buf + shared_offset); + if constexpr (has_lists_t) { shared_offset += rle_run_buffer_bytes; } + auto dict_runs = reinterpret_cast(shared_buf + shared_offset); + if constexpr (has_dict_t) { shared_offset += rle_run_buffer_bytes; } + auto bool_runs = reinterpret_cast(shared_buf + shared_offset); + if constexpr (has_bools_t) { shared_offset += rle_run_buffer_bytes; } + auto def_runs = reinterpret_cast(shared_buf + shared_offset); // initialize the stream decoders (requires values computed in setupLocalPageInfo) rle_stream def_decoder{def_runs}; @@ -575,37 +1000,63 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) def, s->page.num_input_values); } - /* + rle_stream rep_decoder{rep_runs}; level_t* const rep = reinterpret_cast(pp->lvl_decode_buf[level_type::REPETITION]); - if constexpr(has_lists_t){ + if constexpr (has_lists_t) { rep_decoder.init(s->col.level_bits[level_type::REPETITION], s->abs_lvl_start[level_type::REPETITION], s->abs_lvl_end[level_type::REPETITION], rep, s->page.num_input_values); } - */ rle_stream dict_stream{dict_runs}; if constexpr (has_dict_t) { dict_stream.init( s->dict_bits, s->data_start, s->data_end, sb->dict_idx, s->page.num_input_values); } + + // Use dictionary stream memory for bools + rle_stream bool_stream{bool_runs}; + bool bools_are_rle_stream = (s->dict_run == 0); + if constexpr (has_bools_t) { + if (bools_are_rle_stream) { + bool_stream.init(1, s->data_start, s->data_end, sb->dict_idx, s->page.num_input_values); + } + } __syncthreads(); // We use two counters in the loop below: processed_count and valid_count. - // - processed_count: number of rows out of num_input_values that we have decoded so far. + // - processed_count: number of values out of num_input_values that we have decoded so far. // the definition stream returns the number of total rows it has processed in each call // to decode_next and we accumulate in process_count. - // - valid_count: number of non-null rows we have decoded so far. In each iteration of the + // - valid_count: number of non-null values we have decoded so far. In each iteration of the // loop below, we look at the number of valid items (which could be all for non-nullable), // and valid_count is that running count. 
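+  // Worked example of the two counters (hypothetical page, for illustration
+  // only): with num_input_values = 1000 and 200 nulls, processed_count climbs
+  // 0 -> 1000 across iterations while valid_count climbs 0 -> 800, each
+  // iteration handling at most rolling_buf_size level values.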
int processed_count = 0; int valid_count = 0; + + // Skip ahead in the decoding so that we don't repeat work (skipped_leaf_values = 0 for non-lists) + if constexpr (has_lists_t) { + auto const skipped_leaf_values = s->page.skipped_leaf_values; + if (skipped_leaf_values > 0) { + if (should_process_nulls) { + skip_decode(def_decoder, skipped_leaf_values, t); + } + processed_count = skip_decode(rep_decoder, skipped_leaf_values, t); + if constexpr (has_dict_t) { + skip_decode(dict_stream, skipped_leaf_values, t); + } + } + } + // the core loop. decode batches of level stream data using rle_stream objects // and pass the results to gpuDecodeValues - while (s->error == 0 && processed_count < s->page.num_input_values) { + // For chunked reads we may not process all of the rows on the page; if not stop early + int const last_row = s->first_row + s->num_rows; + while ((s->error == 0) && (processed_count < s->page.num_input_values) && + (s->input_row_count <= last_row)) { int next_valid_count; // only need to process definition levels if this is a nullable column @@ -613,11 +1064,16 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) processed_count += def_decoder.decode_next(t); __syncthreads(); - if constexpr (has_nesting_t) { - next_valid_count = gpuUpdateValidityAndRowIndicesNested( + if constexpr (has_lists_t) { + rep_decoder.decode_next(t); + __syncthreads(); + next_valid_count = gpuUpdateValidityAndRowIndicesLists( + processed_count, s, sb, def, rep, t); + } else if constexpr (has_nesting_t) { + next_valid_count = gpuUpdateValidityAndRowIndicesNested( processed_count, s, sb, def, t); } else { - next_valid_count = gpuUpdateValidityAndRowIndicesFlat( + next_valid_count = gpuUpdateValidityAndRowIndicesFlat( processed_count, s, sb, def, t); } } @@ -625,26 +1081,33 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) // this function call entirely since all it will ever generate is a mapping of (i -> i) for // nz_idx. gpuDecodeFixedWidthValues would be the only work that happens. else { - processed_count += min(rolling_buf_size, s->page.num_input_values - processed_count); - - if constexpr (has_nesting_t) { - next_valid_count = - gpuUpdateValidityAndRowIndicesNested( - processed_count, s, sb, nullptr, t); + if constexpr (has_lists_t) { + processed_count += rep_decoder.decode_next(t); + __syncthreads(); + next_valid_count = gpuUpdateValidityAndRowIndicesLists( + processed_count, s, sb, nullptr, rep, t); } else { - next_valid_count = gpuUpdateValidityAndRowIndicesFlat( - processed_count, s, sb, nullptr, t); + processed_count += min(rolling_buf_size, s->page.num_input_values - processed_count); + next_valid_count = + gpuUpdateValidityAndRowIndicesNonNullable(processed_count, s, sb, t); } } __syncthreads(); - // if we have dictionary data + // if we have dictionary or bool data + // We want to limit the number of dictionary/bool items we decode, that correspond to + // the rows we have processed in this iteration that are valid. + // We know the number of valid rows to process with: next_valid_count - valid_count. if constexpr (has_dict_t) { - // We want to limit the number of dictionary items we decode, that correspond to - // the rows we have processed in this iteration that are valid. - // We know the number of valid rows to process with: next_valid_count - valid_count. 
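+      // e.g. (hypothetical batch, for illustration only): if this iteration
+      // validated 24 of 32 values, next_valid_count - valid_count == 24 and
+      // exactly 24 dictionary indices are consumed, since the dictionary stream
+      // holds indices only for non-null values.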
dict_stream.decode_next(t, next_valid_count - valid_count); __syncthreads(); + } else if constexpr (has_bools_t) { + if (bools_are_rle_stream) { + bool_stream.decode_next(t, next_valid_count - valid_count); + } else { + bool_plain_decode(s, sb, t, next_valid_count - valid_count); + } + __syncthreads(); } // decode the values themselves @@ -658,175 +1121,82 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) } // anonymous namespace -void __host__ DecodePageDataFixed(cudf::detail::hostdevice_span pages, - cudf::detail::hostdevice_span chunks, - size_t num_rows, - size_t min_row, - int level_type_size, - bool has_nesting, - kernel_error::pointer error_code, - rmm::cuda_stream_view stream) -{ - constexpr int decode_block_size = 128; - - dim3 dim_block(decode_block_size, 1); - dim3 dim_grid(pages.size(), 1); // 1 threadblock per page - - if (level_type_size == 1) { - if (has_nesting) { - gpuDecodePageDataGeneric - <<>>( - pages.device_ptr(), chunks, min_row, num_rows, error_code); - } else { - gpuDecodePageDataGeneric - <<>>( - pages.device_ptr(), chunks, min_row, num_rows, error_code); - } - } else { - if (has_nesting) { - gpuDecodePageDataGeneric - <<>>( - pages.device_ptr(), chunks, min_row, num_rows, error_code); - } else { - gpuDecodePageDataGeneric - <<>>( - pages.device_ptr(), chunks, min_row, num_rows, error_code); - } - } -} +template +using kernel_tag_t = std::integral_constant; -void __host__ DecodePageDataFixedDict(cudf::detail::hostdevice_span pages, - cudf::detail::hostdevice_span chunks, - size_t num_rows, - size_t min_row, - int level_type_size, - bool has_nesting, - kernel_error::pointer error_code, - rmm::cuda_stream_view stream) -{ - constexpr int decode_block_size = 128; - - dim3 dim_block(decode_block_size, 1); // decode_block_size = 128 threads per block - dim3 dim_grid(pages.size(), 1); // 1 thread block per page => # blocks - - if (level_type_size == 1) { - if (has_nesting) { - gpuDecodePageDataGeneric - <<>>( - pages.device_ptr(), chunks, min_row, num_rows, error_code); - } else { - gpuDecodePageDataGeneric - <<>>( - pages.device_ptr(), chunks, min_row, num_rows, error_code); - } - } else { - if (has_nesting) { - gpuDecodePageDataGeneric - <<>>( - pages.device_ptr(), chunks, min_row, num_rows, error_code); - } else { - gpuDecodePageDataGeneric - <<>>( - pages.device_ptr(), chunks, min_row, num_rows, error_code); - } - } -} +template +using int_tag_t = std::integral_constant; -void __host__ -DecodeSplitPageFixedWidthData(cudf::detail::hostdevice_span pages, - cudf::detail::hostdevice_span chunks, - size_t num_rows, - size_t min_row, - int level_type_size, - bool has_nesting, - kernel_error::pointer error_code, - rmm::cuda_stream_view stream) +void __host__ DecodePageData(cudf::detail::hostdevice_span pages, + cudf::detail::hostdevice_span chunks, + size_t num_rows, + size_t min_row, + int level_type_size, + decode_kernel_mask kernel_mask, + kernel_error::pointer error_code, + rmm::cuda_stream_view stream) { - constexpr int decode_block_size = 128; - - dim3 dim_block(decode_block_size, 1); // decode_block_size = 128 threads per block - dim3 dim_grid(pages.size(), 1); // 1 thread block per page => # blocks - - if (level_type_size == 1) { - if (has_nesting) { - gpuDecodePageDataGeneric - <<>>( - pages.device_ptr(), chunks, min_row, num_rows, error_code); - } else { - gpuDecodePageDataGeneric - <<>>( - pages.device_ptr(), chunks, min_row, num_rows, error_code); - } - } else { - if (has_nesting) { - gpuDecodePageDataGeneric + // No template parameters on lambdas until 
C++20, so use type tags instead
+  auto launch_kernel = [&](auto block_size_tag, auto kernel_mask_tag) {
+    constexpr int decode_block_size   = decltype(block_size_tag)::value;
+    constexpr decode_kernel_mask mask = decltype(kernel_mask_tag)::value;
+
+    dim3 dim_block(decode_block_size, 1);
+    dim3 dim_grid(pages.size(), 1);  // 1 threadblock per page
+
+    if (level_type_size == 1) {
+      gpuDecodePageDataGeneric<uint8_t, decode_block_size, mask>
+        <<<dim_grid, dim_block, 0, stream.value()>>>(
+          pages.device_ptr(), chunks, min_row, num_rows, error_code);
+    } else {
+      gpuDecodePageDataGeneric<uint16_t, decode_block_size, mask>
+        <<<dim_grid, dim_block, 0, stream.value()>>>(
+          pages.device_ptr(), chunks, min_row, num_rows, error_code);
+    }
+  };
+
+  switch (kernel_mask) {
+    case decode_kernel_mask::FIXED_WIDTH_NO_DICT:
+      launch_kernel(int_tag_t<128>{}, kernel_tag_t<decode_kernel_mask::FIXED_WIDTH_NO_DICT>{});
+      break;
+    case decode_kernel_mask::FIXED_WIDTH_NO_DICT_NESTED:
+      launch_kernel(int_tag_t<128>{},
+                    kernel_tag_t<decode_kernel_mask::FIXED_WIDTH_NO_DICT_NESTED>{});
+      break;
+    case decode_kernel_mask::FIXED_WIDTH_NO_DICT_LIST:
+      launch_kernel(int_tag_t<128>{}, kernel_tag_t<decode_kernel_mask::FIXED_WIDTH_NO_DICT_LIST>{});
+      break;
+    case decode_kernel_mask::FIXED_WIDTH_DICT:
+      launch_kernel(int_tag_t<128>{}, kernel_tag_t<decode_kernel_mask::FIXED_WIDTH_DICT>{});
+      break;
+    case decode_kernel_mask::FIXED_WIDTH_DICT_NESTED:
+      launch_kernel(int_tag_t<128>{}, kernel_tag_t<decode_kernel_mask::FIXED_WIDTH_DICT_NESTED>{});
+      break;
+    case decode_kernel_mask::FIXED_WIDTH_DICT_LIST:
+      launch_kernel(int_tag_t<128>{}, kernel_tag_t<decode_kernel_mask::FIXED_WIDTH_DICT_LIST>{});
+      break;
+    case decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_FLAT:
+      launch_kernel(int_tag_t<128>{},
+                    kernel_tag_t<decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_FLAT>{});
+      break;
+    case decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_NESTED:
+      launch_kernel(int_tag_t<128>{},
+                    kernel_tag_t<decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_NESTED>{});
+      break;
+    case decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_LIST:
+      launch_kernel(int_tag_t<128>{},
+                    kernel_tag_t<decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_LIST>{});
+      break;
+    case decode_kernel_mask::BOOLEAN:
+      launch_kernel(int_tag_t<128>{}, kernel_tag_t<decode_kernel_mask::BOOLEAN>{});
+      break;
+    case decode_kernel_mask::BOOLEAN_NESTED:
+      launch_kernel(int_tag_t<128>{}, kernel_tag_t<decode_kernel_mask::BOOLEAN_NESTED>{});
+      break;
+    case decode_kernel_mask::BOOLEAN_LIST:
+      launch_kernel(int_tag_t<128>{}, kernel_tag_t<decode_kernel_mask::BOOLEAN_LIST>{});
+      break;
+    default: CUDF_EXPECTS(false, "Kernel type not handled by this function"); break;
+  }
 }
diff --git a/cpp/src/io/parquet/decode_preprocess.cu b/cpp/src/io/parquet/decode_preprocess.cu
index 62f1ee88036..5b9831668e6 100644
--- a/cpp/src/io/parquet/decode_preprocess.cu
+++ b/cpp/src/io/parquet/decode_preprocess.cu
@@ -343,8 +343,8 @@ CUDF_KERNEL void __launch_bounds__(preprocess_block_size)
   bool has_repetition = chunks[pp->chunk_idx].max_level[level_type::REPETITION] > 0;

   // the level stream decoders
-  __shared__ rle_run<level_t> def_runs[rle_run_buffer_size];
-  __shared__ rle_run<level_t> rep_runs[rle_run_buffer_size];
+  __shared__ rle_run def_runs[rle_run_buffer_size];
+  __shared__ rle_run rep_runs[rle_run_buffer_size];

   rle_stream decoders[level_type::NUM_LEVEL_TYPES] = {{def_runs}, {rep_runs}};
diff --git a/cpp/src/io/parquet/error.hpp b/cpp/src/io/parquet/error.hpp
index f0fc9fab3ab..8b3d1d7a6c3 100644
--- a/cpp/src/io/parquet/error.hpp
+++ b/cpp/src/io/parquet/error.hpp
@@ -26,7 +26,7 @@ namespace cudf::io::parquet {

 /**
- * @brief Wrapper around a `rmm::device_scalar` for use in reporting errors that occur in
+ * @brief Specialized device scalar for use in reporting errors that occur in
  * kernel calls.
* * The `kernel_error` object is created with a `rmm::cuda_stream_view` which is used throughout diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index e0d50d7ccf9..0d24fa4236f 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -17,8 +17,11 @@ #include "page_data.cuh" #include "page_decode.cuh" +#include + #include +#include #include namespace cudf::io::parquet::detail { @@ -466,4 +469,28 @@ void __host__ DecodeSplitPageData(cudf::detail::hostdevice_span pages, } } +void WriteFinalOffsets(host_span offsets, + host_span buff_addrs, + rmm::cuda_stream_view stream) +{ + // Copy offsets to device and create an iterator + auto d_src_data = cudf::detail::make_device_uvector_async( + offsets, stream, cudf::get_current_device_resource_ref()); + // Iterator for the source (scalar) data + auto src_iter = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + cuda::proclaim_return_type( + [src = d_src_data.begin()] __device__(std::size_t i) { return src + i; })); + + // Copy buffer addresses to device and create an iterator + auto d_dst_addrs = cudf::detail::make_device_uvector_async( + buff_addrs, stream, cudf::get_current_device_resource_ref()); + // size_iter is simply a constant iterator of sizeof(size_type) bytes. + auto size_iter = thrust::make_constant_iterator(sizeof(size_type)); + + // Copy offsets to buffers in batched manner. + cudf::detail::batched_memcpy_async( + src_iter, d_dst_addrs.begin(), size_iter, offsets.size(), stream); +} + } // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/page_decode.cuh b/cpp/src/io/parquet/page_decode.cuh index 9ed2929a70e..ab40a5b9a55 100644 --- a/cpp/src/io/parquet/page_decode.cuh +++ b/cpp/src/io/parquet/page_decode.cuh @@ -149,10 +149,21 @@ inline __device__ bool is_bounds_page(page_state_s* const s, size_t const begin = start_row; size_t const end = start_row + num_rows; - // for non-nested schemas, rows cannot span pages, so use a more restrictive test - return has_repetition - ? ((page_begin <= begin && page_end >= begin) || (page_begin <= end && page_end >= end)) - : ((page_begin < begin && page_end > begin) || (page_begin < end && page_end > end)); + // Test for list schemas. + auto const is_bounds_page_lists = + ((page_begin <= begin and page_end >= begin) or (page_begin <= end and page_end >= end)); + + // For non-list schemas, rows cannot span pages, so use a more restrictive test. Make sure to + // relax the test for `page_end` if we adjusted the `num_rows` for the last page to compensate + // for list row size estimates in `generate_list_column_row_count_estimates()` when chunked + // read mode. + auto const test_page_end_nonlists = + s->page.is_num_rows_adjusted ? page_end >= end : page_end > end; + + auto const is_bounds_page_nonlists = + (page_begin < begin and page_end > begin) or (page_begin < end and test_page_end_nonlists); + + return has_repetition ? is_bounds_page_lists : is_bounds_page_nonlists; } /** diff --git a/cpp/src/io/parquet/page_hdr.cu b/cpp/src/io/parquet/page_hdr.cu index d604642be54..6aec4ce0ec2 100644 --- a/cpp/src/io/parquet/page_hdr.cu +++ b/cpp/src/io/parquet/page_hdr.cu @@ -181,19 +181,26 @@ __device__ decode_kernel_mask kernel_mask_for_page(PageInfo const& page, } else if (is_string_col(chunk)) { // check for string before byte_stream_split so FLBA will go to the right kernel return decode_kernel_mask::STRING; + } else if (is_boolean(chunk)) { + return is_list(chunk) ? 
decode_kernel_mask::BOOLEAN_LIST + : is_nested(chunk) ? decode_kernel_mask::BOOLEAN_NESTED + : decode_kernel_mask::BOOLEAN; } - if (!is_list(chunk) && !is_byte_array(chunk) && !is_boolean(chunk)) { + if (!is_byte_array(chunk)) { if (page.encoding == Encoding::PLAIN) { - return is_nested(chunk) ? decode_kernel_mask::FIXED_WIDTH_NO_DICT_NESTED - : decode_kernel_mask::FIXED_WIDTH_NO_DICT; + return is_list(chunk) ? decode_kernel_mask::FIXED_WIDTH_NO_DICT_LIST + : is_nested(chunk) ? decode_kernel_mask::FIXED_WIDTH_NO_DICT_NESTED + : decode_kernel_mask::FIXED_WIDTH_NO_DICT; } else if (page.encoding == Encoding::PLAIN_DICTIONARY || page.encoding == Encoding::RLE_DICTIONARY) { - return is_nested(chunk) ? decode_kernel_mask::FIXED_WIDTH_DICT_NESTED - : decode_kernel_mask::FIXED_WIDTH_DICT; + return is_list(chunk) ? decode_kernel_mask::FIXED_WIDTH_DICT_LIST + : is_nested(chunk) ? decode_kernel_mask::FIXED_WIDTH_DICT_NESTED + : decode_kernel_mask::FIXED_WIDTH_DICT; } else if (page.encoding == Encoding::BYTE_STREAM_SPLIT) { - return is_nested(chunk) ? decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_NESTED - : decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_FLAT; + return is_list(chunk) ? decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_LIST + : is_nested(chunk) ? decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_NESTED + : decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_FLAT; } } @@ -426,6 +433,7 @@ void __launch_bounds__(128) gpuDecodePageHeaders(ColumnChunkDesc* chunks, // definition levels bs->page.chunk_row = 0; bs->page.num_rows = 0; + bs->page.is_num_rows_adjusted = false; bs->page.skipped_values = -1; bs->page.skipped_leaf_values = 0; bs->page.str_bytes = 0; diff --git a/cpp/src/io/parquet/page_string_decode.cu b/cpp/src/io/parquet/page_string_decode.cu index ca74a1c2ba0..5ece3a54892 100644 --- a/cpp/src/io/parquet/page_string_decode.cu +++ b/cpp/src/io/parquet/page_string_decode.cu @@ -618,8 +618,8 @@ CUDF_KERNEL void __launch_bounds__(preprocess_block_size) gpuComputeStringPageBo constexpr int rle_run_buffer_size = rle_stream_required_run_buffer_size(); // the level stream decoders - __shared__ rle_run def_runs[rle_run_buffer_size]; - __shared__ rle_run rep_runs[rle_run_buffer_size]; + __shared__ rle_run def_runs[rle_run_buffer_size]; + __shared__ rle_run rep_runs[rle_run_buffer_size]; rle_stream decoders[level_type::NUM_LEVEL_TYPES] = {{def_runs}, {rep_runs}}; diff --git a/cpp/src/io/parquet/parquet.hpp b/cpp/src/io/parquet/parquet.hpp index 7c985643887..2851ef67a65 100644 --- a/cpp/src/io/parquet/parquet.hpp +++ b/cpp/src/io/parquet/parquet.hpp @@ -20,8 +20,6 @@ #include -#include - #include #include #include @@ -94,10 +92,10 @@ struct LogicalType { BSON }; Type type; - cuda::std::optional decimal_type; - cuda::std::optional time_type; - cuda::std::optional timestamp_type; - cuda::std::optional int_type; + std::optional decimal_type; + std::optional time_type; + std::optional timestamp_type; + std::optional int_type; LogicalType(Type tp = UNDEFINED) : type(tp) {} LogicalType(DecimalType&& dt) : type(DECIMAL), decimal_type(dt) {} @@ -178,21 +176,21 @@ struct SchemaElement { // 5: nested fields int32_t num_children = 0; // 6: DEPRECATED: record the original type before conversion to parquet type - cuda::std::optional converted_type; + std::optional converted_type; // 7: DEPRECATED: record the scale for DECIMAL converted type int32_t decimal_scale = 0; // 8: DEPRECATED: record the precision for DECIMAL converted type int32_t decimal_precision = 0; // 9: save field_id from original 
schema - cuda::std::optional field_id; + std::optional field_id; // 10: replaces converted type - cuda::std::optional logical_type; + std::optional logical_type; // extra cudf specific fields bool output_as_byte_array = false; // cudf type determined from arrow:schema - cuda::std::optional arrow_type; + std::optional arrow_type; // The following fields are filled in later during schema initialization int max_definition_level = 0; @@ -258,21 +256,21 @@ struct SchemaElement { */ struct Statistics { // deprecated max value in signed comparison order - cuda::std::optional> max; + std::optional> max; // deprecated min value in signed comparison order - cuda::std::optional> min; + std::optional> min; // count of null values in the column - cuda::std::optional null_count; + std::optional null_count; // count of distinct values occurring - cuda::std::optional distinct_count; + std::optional distinct_count; // max value for column determined by ColumnOrder - cuda::std::optional> max_value; + std::optional> max_value; // min value for column determined by ColumnOrder - cuda::std::optional> min_value; + std::optional> min_value; // If true, max_value is the actual maximum value for a column - cuda::std::optional is_max_value_exact; + std::optional is_max_value_exact; // If true, min_value is the actual minimum value for a column - cuda::std::optional is_min_value_exact; + std::optional is_min_value_exact; }; /** @@ -281,7 +279,7 @@ struct Statistics { struct SizeStatistics { // Number of variable-width bytes stored for the page/chunk. Should not be set for anything // but the BYTE_ARRAY physical type. - cuda::std::optional unencoded_byte_array_data_bytes; + std::optional unencoded_byte_array_data_bytes; /** * When present, there is expected to be one element corresponding to each * repetition (i.e. size=max repetition_level+1) where each element @@ -290,14 +288,14 @@ struct SizeStatistics { * * This value should not be written if max_repetition_level is 0. */ - cuda::std::optional> repetition_level_histogram; + std::optional> repetition_level_histogram; /** * Same as repetition_level_histogram except for definition levels. * * This value should not be written if max_definition_level is 0 or 1. */ - cuda::std::optional> definition_level_histogram; + std::optional> definition_level_histogram; }; /** @@ -318,7 +316,7 @@ struct OffsetIndex { std::vector page_locations; // per-page size info. see description of the same field in SizeStatistics. only present for // columns with a BYTE_ARRAY physical type. - cuda::std::optional> unencoded_byte_array_data_bytes; + std::optional> unencoded_byte_array_data_bytes; }; /** @@ -329,11 +327,11 @@ struct ColumnIndex { std::vector> min_values; // lower bound for values in each page std::vector> max_values; // upper bound for values in each page BoundaryOrder boundary_order = - BoundaryOrder::UNORDERED; // Indicates if min and max values are ordered - cuda::std::optional> null_counts; // Optional count of null values per page + BoundaryOrder::UNORDERED; // Indicates if min and max values are ordered + std::optional> null_counts; // Optional count of null values per page // Repetition/definition level histograms for the column chunk - cuda::std::optional> repetition_level_histogram; - cuda::std::optional> definition_level_histogram; + std::optional> repetition_level_histogram; + std::optional> definition_level_histogram; }; /** @@ -383,11 +381,11 @@ struct ColumnChunkMetaData { Statistics statistics; // Set of all encodings used for pages in this column chunk. 
This information can be used to // determine if all data pages are dictionary encoded for example. - cuda::std::optional> encoding_stats; + std::optional> encoding_stats; // Optional statistics to help estimate total memory when converted to in-memory representations. // The histograms contained in these statistics can also be useful in some cases for more // fine-grained nullability/list length filter pushdown. - cuda::std::optional size_statistics; + std::optional size_statistics; }; /** @@ -429,13 +427,13 @@ struct RowGroup { int64_t num_rows = 0; // If set, specifies a sort ordering of the rows in this RowGroup. // The sorting columns can be a subset of all the columns. - cuda::std::optional> sorting_columns; + std::optional> sorting_columns; // Byte offset from beginning of file to first page (data or dictionary) in this row group - cuda::std::optional file_offset; + std::optional file_offset; // Total byte size of all compressed (and potentially encrypted) column data in this row group - cuda::std::optional total_compressed_size; + std::optional total_compressed_size; // Row group ordinal in the file - cuda::std::optional ordinal; + std::optional ordinal; }; /** @@ -460,7 +458,7 @@ struct FileMetaData { std::vector row_groups; std::vector key_value_metadata; std::string created_by = ""; - cuda::std::optional> column_orders; + std::optional> column_orders; }; /** diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index 1390339c1ae..ce9d48693ec 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -22,14 +22,13 @@ #include "io/parquet/parquet_common.hpp" #include "io/statistics/statistics.cuh" #include "io/utilities/column_buffer.hpp" -#include "io/utilities/hostdevice_vector.hpp" +#include #include #include #include #include -#include #include #include @@ -221,6 +220,13 @@ enum class decode_kernel_mask { (1 << 9), // Same as above but for nested, fixed-width data FIXED_WIDTH_NO_DICT_NESTED = (1 << 10), // Run decode kernel for fixed width non-dictionary pages FIXED_WIDTH_DICT_NESTED = (1 << 11), // Run decode kernel for fixed width dictionary pages + FIXED_WIDTH_DICT_LIST = (1 << 12), // Run decode kernel for fixed width dictionary pages + FIXED_WIDTH_NO_DICT_LIST = (1 << 13), // Run decode kernel for fixed width non-dictionary pages + BYTE_STREAM_SPLIT_FIXED_WIDTH_LIST = + (1 << 14), // Run decode kernel for BYTE_STREAM_SPLIT encoded data for fixed width lists + BOOLEAN = (1 << 15), // Run decode kernel for boolean data + BOOLEAN_NESTED = (1 << 16), // Run decode kernel for nested boolean data + BOOLEAN_LIST = (1 << 17), // Run decode kernel for list boolean data }; // mask representing all the ways in which a string can be encoded @@ -294,7 +300,8 @@ struct PageInfo { int32_t uncompressed_page_size; // uncompressed data size in bytes // for V2 pages, the def and rep level data is not compressed, and lacks the 4-byte length // indicator. instead the lengths for these are stored in the header. - int32_t lvl_bytes[level_type::NUM_LEVEL_TYPES]; // length of the rep/def levels (V2 header) + int32_t // NOLINT + lvl_bytes[level_type::NUM_LEVEL_TYPES]; // length of the rep/def levels (V2 header) // Number of values in this data page or dictionary. // Important : the # of input values does not necessarily // correspond to the number of rows in the output. 
It just reflects the number @@ -303,8 +310,10 @@ struct PageInfo { // - In the case of a nested schema, you have to decode the repetition and definition // levels to extract actual column values int32_t num_input_values; - int32_t chunk_row; // starting row of this page relative to the start of the chunk - int32_t num_rows; // number of rows in this page + int32_t chunk_row; // starting row of this page relative to the start of the chunk + int32_t num_rows; // number of rows in this page + bool is_num_rows_adjusted; // Flag to indicate if the number of rows of this page have been + // adjusted to compensate for the list row size estimates. // the next four are calculated in gpuComputePageStringSizes int32_t num_nulls; // number of null values (V2 header), but recalculated for string cols int32_t num_valids; // number of non-null values, taking into account skip_rows/num_rows @@ -345,7 +354,7 @@ struct PageInfo { PageNestingDecodeInfo* nesting_decode; // level decode buffers - uint8_t* lvl_decode_buf[level_type::NUM_LEVEL_TYPES]; + uint8_t* lvl_decode_buf[level_type::NUM_LEVEL_TYPES]; // NOLINT // temporary space for decoding DELTA_BYTE_ARRAY encoded strings int64_t temp_string_size; @@ -394,7 +403,7 @@ struct ColumnChunkDesc { uint8_t def_level_bits_, uint8_t rep_level_bits_, Compression codec_, - cuda::std::optional logical_type_, + std::optional logical_type_, int32_t ts_clock_rate_, int32_t src_col_index_, int32_t src_col_schema_, @@ -431,21 +440,21 @@ struct ColumnChunkDesc { size_t num_values{}; // total number of values in this column size_t start_row{}; // file-wide, absolute starting row of this chunk uint32_t num_rows{}; // number of rows in this chunk - int16_t max_level[level_type::NUM_LEVEL_TYPES]{}; // max definition/repetition level - int16_t max_nesting_depth{}; // max nesting depth of the output - int32_t type_length{}; // type length from schema (for FLBA only) - Type physical_type{}; // parquet physical data type - uint8_t - level_bits[level_type::NUM_LEVEL_TYPES]{}; // bits to encode max definition/repetition levels - int32_t num_data_pages{}; // number of data pages - int32_t num_dict_pages{}; // number of dictionary pages + int16_t max_level[level_type::NUM_LEVEL_TYPES]{}; // max definition/repetition level // NOLINT + int16_t max_nesting_depth{}; // max nesting depth of the output + int32_t type_length{}; // type length from schema (for FLBA only) + Type physical_type{}; // parquet physical data type + uint8_t level_bits[level_type::NUM_LEVEL_TYPES]{}; // bits to encode max // NOLINT + // definition/repetition levels + int32_t num_data_pages{}; // number of data pages + int32_t num_dict_pages{}; // number of dictionary pages PageInfo const* dict_page{}; - string_index_pair* str_dict_index{}; // index for string dictionary - bitmask_type** valid_map_base{}; // base pointers of valid bit map for this column - void** column_data_base{}; // base pointers of column data - void** column_string_base{}; // base pointers of column string data - Compression codec{}; // compressed codec enum - cuda::std::optional logical_type{}; // logical type + string_index_pair* str_dict_index{}; // index for string dictionary + bitmask_type** valid_map_base{}; // base pointers of valid bit map for this column + void** column_data_base{}; // base pointers of column data + void** column_string_base{}; // base pointers of column string data + Compression codec{}; // compressed codec enum + std::optional logical_type{}; // logical type int32_t ts_clock_rate{}; // output timestamp clock frequency 
(0=default, 1000=ms, 1000000000=ns) int32_t src_col_index{}; // my input column index @@ -535,7 +544,7 @@ enum class encode_kernel_mask { DELTA_BINARY = (1 << 2), // Run DELTA_BINARY_PACKED encoding kernel DELTA_LENGTH_BA = (1 << 3), // Run DELTA_LENGTH_BYTE_ARRAY encoding kernel DELTA_BYTE_ARRAY = (1 << 4), // Run DELTA_BYtE_ARRAY encoding kernel - BYTE_STREAM_SPLIT = (1 << 5), // Run plain encoding kernel, but split streams + BYTE_STREAM_SPLIT = (1 << 5) // Run plain encoding kernel, but split streams }; /** @@ -796,6 +805,18 @@ void DecodeSplitPageData(cudf::detail::hostdevice_span pages, kernel_error::pointer error_code, rmm::cuda_stream_view stream); +/** + * @brief Writes the final offsets to the corresponding list and string buffer end addresses in a + * batched manner. + * + * @param offsets Host span of final offsets + * @param buff_addrs Host span of corresponding output col buffer end addresses + * @param stream CUDA stream to use + */ +void WriteFinalOffsets(host_span offsets, + host_span buff_addrs, + rmm::cuda_stream_view stream); + /** * @brief Launches kernel for reading the string column data stored in the pages * @@ -895,66 +916,18 @@ void DecodeDeltaLengthByteArray(cudf::detail::hostdevice_span pages, * @param[in] num_rows Total number of rows to read * @param[in] min_row Minimum number of rows to read * @param[in] level_type_size Size in bytes of the type for level decoding - * @param[in] has_nesting Whether or not the data contains nested (but not list) data. - * @param[out] error_code Error code for kernel failures - * @param[in] stream CUDA stream to use - */ -void DecodePageDataFixed(cudf::detail::hostdevice_span pages, - cudf::detail::hostdevice_span chunks, - std::size_t num_rows, - size_t min_row, - int level_type_size, - bool has_nesting, - kernel_error::pointer error_code, - rmm::cuda_stream_view stream); - -/** - * @brief Launches kernel for reading dictionary fixed width column data stored in the pages - * - * The page data will be written to the output pointed to in the page's - * associated column chunk. - * - * @param[in,out] pages All pages to be decoded - * @param[in] chunks All chunks to be decoded - * @param[in] num_rows Total number of rows to read - * @param[in] min_row Minimum number of rows to read - * @param[in] level_type_size Size in bytes of the type for level decoding - * @param[in] has_nesting Whether or not the data contains nested (but not list) data. + * @param[in] kernel_mask Mask indicating the type of decoding kernel to launch. * @param[out] error_code Error code for kernel failures * @param[in] stream CUDA stream to use */ -void DecodePageDataFixedDict(cudf::detail::hostdevice_span pages, - cudf::detail::hostdevice_span chunks, - std::size_t num_rows, - size_t min_row, - int level_type_size, - bool has_nesting, - kernel_error::pointer error_code, - rmm::cuda_stream_view stream); - -/** - * @brief Launches kernel for reading fixed width column data stored in the pages - * - * The page data will be written to the output pointed to in the page's - * associated column chunk. - * - * @param[in,out] pages All pages to be decoded - * @param[in] chunks All chunks to be decoded - * @param[in] num_rows Total number of rows to read - * @param[in] min_row Minimum number of rows to read - * @param[in] level_type_size Size in bytes of the type for level decoding - * @param[in] has_nesting Whether or not the data contains nested (but not list) data. 
- * @param[out] error_code Error code for kernel failures - * @param[in] stream CUDA stream to use - */ -void DecodeSplitPageFixedWidthData(cudf::detail::hostdevice_span pages, - cudf::detail::hostdevice_span chunks, - std::size_t num_rows, - size_t min_row, - int level_type_size, - bool has_nesting, - kernel_error::pointer error_code, - rmm::cuda_stream_view stream); +void DecodePageData(cudf::detail::hostdevice_span pages, + cudf::detail::hostdevice_span chunks, + size_t num_rows, + size_t min_row, + int level_type_size, + decode_kernel_mask kernel_mask, + kernel_error::pointer error_code, + rmm::cuda_stream_view stream); /** * @brief Launches kernel for initializing encoder row group fragments diff --git a/cpp/src/io/parquet/predicate_pushdown.cpp b/cpp/src/io/parquet/predicate_pushdown.cpp index b90ca36c8c7..cd3dcd2bce4 100644 --- a/cpp/src/io/parquet/predicate_pushdown.cpp +++ b/cpp/src/io/parquet/predicate_pushdown.cpp @@ -23,9 +23,9 @@ #include #include #include -#include #include #include +#include #include #include @@ -152,7 +152,7 @@ struct stats_caster { } void set_index(size_type index, - cuda::std::optional> const& binary_value, + std::optional> const& binary_value, Type const type) { if (binary_value.has_value()) { @@ -234,8 +234,8 @@ struct stats_caster { max.set_index(stats_idx, max_value, colchunk.meta_data.type); } else { // Marking it null, if column present in row group - min.set_index(stats_idx, cuda::std::nullopt, {}); - max.set_index(stats_idx, cuda::std::nullopt, {}); + min.set_index(stats_idx, std::nullopt, {}); + max.set_index(stats_idx, std::nullopt, {}); } stats_idx++; } @@ -374,7 +374,7 @@ class stats_expression_converter : public ast::detail::expression_transformer { private: std::vector> visit_operands( - std::vector> operands) + cudf::host_span const> operands) { std::vector> transformed_operands; for (auto const& operand : operands) { @@ -454,15 +454,18 @@ std::optional>> aggregate_reader_metadata::fi CUDF_EXPECTS(predicate.type().id() == cudf::type_id::BOOL8, "Filter expression must return a boolean column"); - auto num_bitmasks = num_bitmask_words(predicate.size()); - std::vector host_bitmask(num_bitmasks, ~bitmask_type{0}); - if (predicate.nullable()) { - CUDF_CUDA_TRY(cudaMemcpyAsync(host_bitmask.data(), - predicate.null_mask(), - num_bitmasks * sizeof(bitmask_type), - cudaMemcpyDefault, - stream.value())); - } + auto const host_bitmask = [&] { + auto const num_bitmasks = num_bitmask_words(predicate.size()); + if (predicate.nullable()) { + return cudf::detail::make_host_vector_sync( + device_span(predicate.null_mask(), num_bitmasks), stream); + } else { + auto bitmask = cudf::detail::make_host_vector(num_bitmasks, stream); + std::fill(bitmask.begin(), bitmask.end(), ~bitmask_type{0}); + return bitmask; + } + }(); + auto validity_it = cudf::detail::make_counting_transform_iterator( 0, [bitmask = host_bitmask.data()](auto bit_index) { return bit_is_set(bitmask, bit_index); }); @@ -551,7 +554,7 @@ std::reference_wrapper named_to_reference_converter::visi std::vector> named_to_reference_converter::visit_operands( - std::vector> operands) + cudf::host_span const> operands) { std::vector> transformed_operands; for (auto const& operand : operands) { @@ -621,7 +624,7 @@ class names_from_expression : public ast::detail::expression_transformer { } private: - void visit_operands(std::vector> operands) + void visit_operands(cudf::host_span const> operands) { for (auto const& operand : operands) { operand.get().accept(*this); diff --git 
a/cpp/src/io/parquet/reader.cpp b/cpp/src/io/parquet/reader.cpp index dd354b905f3..170c6e8857f 100644 --- a/cpp/src/io/parquet/reader.cpp +++ b/cpp/src/io/parquet/reader.cpp @@ -16,8 +16,6 @@ #include "reader_impl.hpp" -#include - namespace cudf::io::parquet::detail { reader::reader() = default; diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index 7d817bde7af..d74ae83b635 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -21,11 +21,9 @@ #include #include #include -#include #include #include -#include #include #include @@ -38,7 +36,7 @@ namespace { // be treated as a string. Currently the only logical type that has special handling is DECIMAL. // Other valid types in the future would be UUID (still treated as string) and FLOAT16 (which // for now would also be treated as a string). -inline bool is_treat_fixed_length_as_string(cuda::std::optional const& logical_type) +inline bool is_treat_fixed_length_as_string(std::optional const& logical_type) { if (!logical_type.has_value()) { return true; } return logical_type->type != LogicalType::DECIMAL; @@ -78,7 +76,7 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num // TODO: This step is somewhat redundant if size info has already been calculated (nested schema, // chunked reader). auto const has_strings = (kernel_mask & STRINGS_MASK) != 0; - std::vector col_string_sizes(_input_columns.size(), 0L); + auto col_string_sizes = cudf::detail::make_host_vector(_input_columns.size(), _stream); if (has_strings) { // need to compute pages bounds/sizes if we lack page indexes or are using custom bounds // TODO: we could probably dummy up size stats for FLBA data since we know the width @@ -99,22 +97,37 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num _stream); } + // Compute column string sizes (using page string offsets) for this subpass col_string_sizes = calculate_page_string_offsets(); - // check for overflow + // ensure cumulative column string sizes have been initialized + if (pass.cumulative_col_string_sizes.empty()) { + pass.cumulative_col_string_sizes.resize(_input_columns.size(), 0); + } + + // Add to the cumulative column string sizes of this pass + std::transform(pass.cumulative_col_string_sizes.begin(), + pass.cumulative_col_string_sizes.end(), + col_string_sizes.begin(), + pass.cumulative_col_string_sizes.begin(), + std::plus<>{}); + + // Check for overflow in cumulative column string sizes of this pass so that the page string + // offsets of overflowing (large) string columns are treated as 64-bit. 
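+    // Illustrative example (hypothetical sizes): a string column that adds
+    // ~1 GiB per subpass stays below the ~2^31-byte offset64 threshold for two
+    // subpasses but crosses it on the third; from that point the column must be
+    // written with 64-bit offsets even though no single subpass overflowed.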
auto const threshold = static_cast(strings::detail::get_offset64_threshold()); - auto const has_large_strings = std::any_of(col_string_sizes.cbegin(), - col_string_sizes.cend(), + auto const has_large_strings = std::any_of(pass.cumulative_col_string_sizes.cbegin(), + pass.cumulative_col_string_sizes.cend(), [=](std::size_t sz) { return sz > threshold; }); if (has_large_strings and not strings::detail::is_large_strings_enabled()) { CUDF_FAIL("String column exceeds the column size limit", std::overflow_error); } - // mark any chunks that are large string columns + // Mark any chunks for which the cumulative column string size has exceeded the + // large strings threshold if (has_large_strings) { for (auto& chunk : pass.chunks) { auto const idx = chunk.src_col_index; - if (col_string_sizes[idx] > threshold) { chunk.is_large_string_col = true; } + if (pass.cumulative_col_string_sizes[idx] > threshold) { chunk.is_large_string_col = true; } } } } @@ -197,7 +210,11 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num // only do string buffer for leaf if (idx == max_depth - 1 and out_buf.string_size() == 0 and col_string_sizes[pass.chunks[c].src_col_index] > 0) { - out_buf.create_string_data(col_string_sizes[pass.chunks[c].src_col_index], _stream); + out_buf.create_string_data( + col_string_sizes[pass.chunks[c].src_col_index], + pass.cumulative_col_string_sizes[pass.chunks[c].src_col_index] > + static_cast(strings::detail::get_offset64_threshold()), + _stream); } if (has_strings) { str_data[idx] = out_buf.string_data(); } out_buf.user_data |= @@ -221,8 +238,20 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num int const nkernels = std::bitset<32>(kernel_mask).count(); auto streams = cudf::detail::fork_streams(_stream, nkernels); - // launch string decoder int s_idx = 0; + + auto decode_data = [&](decode_kernel_mask decoder_mask) { + DecodePageData(subpass.pages, + pass.chunks, + num_rows, + skip_rows, + level_type_size, + decoder_mask, + error_code.data(), + streams[s_idx++]); + }; + + // launch string decoder if (BitAnd(kernel_mask, decode_kernel_mask::STRING) != 0) { DecodeStringPageData(subpass.pages, pass.chunks, @@ -268,26 +297,17 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num // launch byte stream split decoder if (BitAnd(kernel_mask, decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_FLAT) != 0) { - DecodeSplitPageFixedWidthData(subpass.pages, - pass.chunks, - num_rows, - skip_rows, - level_type_size, - false, - error_code.data(), - streams[s_idx++]); + decode_data(decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_FLAT); } // launch byte stream split decoder, for nested columns if (BitAnd(kernel_mask, decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_NESTED) != 0) { - DecodeSplitPageFixedWidthData(subpass.pages, - pass.chunks, - num_rows, - skip_rows, - level_type_size, - true, - error_code.data(), - streams[s_idx++]); + decode_data(decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_NESTED); + } + + // launch byte stream split decoder, for list columns + if (BitAnd(kernel_mask, decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_LIST) != 0) { + decode_data(decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_LIST); } // launch byte stream split decoder @@ -303,50 +323,47 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num // launch fixed width type decoder if (BitAnd(kernel_mask, decode_kernel_mask::FIXED_WIDTH_NO_DICT) != 0) { - DecodePageDataFixed(subpass.pages, - 
pass.chunks, - num_rows, - skip_rows, - level_type_size, - false, - error_code.data(), - streams[s_idx++]); + decode_data(decode_kernel_mask::FIXED_WIDTH_NO_DICT); + } + + // launch fixed width type decoder for lists + if (BitAnd(kernel_mask, decode_kernel_mask::FIXED_WIDTH_NO_DICT_LIST) != 0) { + decode_data(decode_kernel_mask::FIXED_WIDTH_NO_DICT_LIST); } // launch fixed width type decoder, for nested columns if (BitAnd(kernel_mask, decode_kernel_mask::FIXED_WIDTH_NO_DICT_NESTED) != 0) { - DecodePageDataFixed(subpass.pages, - pass.chunks, - num_rows, - skip_rows, - level_type_size, - true, - error_code.data(), - streams[s_idx++]); + decode_data(decode_kernel_mask::FIXED_WIDTH_NO_DICT_NESTED); + } + + // launch boolean type decoder + if (BitAnd(kernel_mask, decode_kernel_mask::BOOLEAN) != 0) { + decode_data(decode_kernel_mask::BOOLEAN); + } + + // launch boolean type decoder, for nested columns + if (BitAnd(kernel_mask, decode_kernel_mask::BOOLEAN_NESTED) != 0) { + decode_data(decode_kernel_mask::BOOLEAN_NESTED); + } + + // launch boolean type decoder, for nested columns + if (BitAnd(kernel_mask, decode_kernel_mask::BOOLEAN_LIST) != 0) { + decode_data(decode_kernel_mask::BOOLEAN_LIST); } // launch fixed width type decoder with dictionaries if (BitAnd(kernel_mask, decode_kernel_mask::FIXED_WIDTH_DICT) != 0) { - DecodePageDataFixedDict(subpass.pages, - pass.chunks, - num_rows, - skip_rows, - level_type_size, - false, - error_code.data(), - streams[s_idx++]); + decode_data(decode_kernel_mask::FIXED_WIDTH_DICT); + } + + // launch fixed width type decoder with dictionaries for lists + if (BitAnd(kernel_mask, decode_kernel_mask::FIXED_WIDTH_DICT_LIST) != 0) { + decode_data(decode_kernel_mask::FIXED_WIDTH_DICT_LIST); } // launch fixed width type decoder with dictionaries, for nested columns if (BitAnd(kernel_mask, decode_kernel_mask::FIXED_WIDTH_DICT_NESTED) != 0) { - DecodePageDataFixedDict(subpass.pages, - pass.chunks, - num_rows, - skip_rows, - level_type_size, - true, - error_code.data(), - streams[s_idx++]); + decode_data(decode_kernel_mask::FIXED_WIDTH_DICT_NESTED); } // launch the catch-all page decoder @@ -371,13 +388,15 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num CUDF_FAIL("Parquet data decode failed with code(s) " + kernel_error::to_string(error)); } - // for list columns, add the final offset to every offset buffer. - // TODO : make this happen in more efficiently. Maybe use thrust::for_each - // on each buffer. + // For list and string columns, add the final offset to every offset buffer. // Note : the reason we are doing this here instead of in the decode kernel is // that it is difficult/impossible for a given page to know that it is writing the very // last value that should then be followed by a terminator (because rows can span // page boundaries). + std::vector out_buffers; + std::vector final_offsets; + out_buffers.reserve(_input_columns.size()); + final_offsets.reserve(_input_columns.size()); for (size_t idx = 0; idx < _input_columns.size(); idx++) { input_column_info const& input_col = _input_columns[idx]; @@ -393,25 +412,21 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num // the final offset for a list at level N is the size of it's child size_type const offset = child.type.id() == type_id::LIST ? 
child.size - 1 : child.size; - CUDF_CUDA_TRY(cudaMemcpyAsync(static_cast(out_buf.data()) + (out_buf.size - 1), - &offset, - sizeof(size_type), - cudaMemcpyDefault, - _stream.value())); + out_buffers.emplace_back(static_cast(out_buf.data()) + (out_buf.size - 1)); + final_offsets.emplace_back(offset); out_buf.user_data |= PARQUET_COLUMN_BUFFER_FLAG_LIST_TERMINATED; } else if (out_buf.type.id() == type_id::STRING) { // need to cap off the string offsets column auto const sz = static_cast(col_string_sizes[idx]); if (sz <= strings::detail::get_offset64_threshold()) { - CUDF_CUDA_TRY(cudaMemcpyAsync(static_cast(out_buf.data()) + out_buf.size, - &sz, - sizeof(size_type), - cudaMemcpyDefault, - _stream.value())); + out_buffers.emplace_back(static_cast(out_buf.data()) + out_buf.size); + final_offsets.emplace_back(sz); } } } } + // Write the final offsets for list and string columns in a batched manner + WriteFinalOffsets(final_offsets, out_buffers, _stream); // update null counts in the final column buffers for (size_t idx = 0; idx < subpass.pages.size(); idx++) { diff --git a/cpp/src/io/parquet/reader_impl.hpp b/cpp/src/io/parquet/reader_impl.hpp index 2d46da14bec..3aa9b94ed6b 100644 --- a/cpp/src/io/parquet/reader_impl.hpp +++ b/cpp/src/io/parquet/reader_impl.hpp @@ -188,10 +188,10 @@ class reader::impl { * * Does not decompress the chunk data. * - * @return pair of boolean indicating if compressed chunks were found and a vector of futures for + * @return pair of boolean indicating if compressed chunks were found and a future for * read completion */ - std::pair>> read_column_chunks(); + std::pair> read_column_chunks(); /** * @brief Read compressed data and page information for the current pass. @@ -284,7 +284,7 @@ class reader::impl { * * @return Vector of total string data sizes for each column */ - std::vector calculate_page_string_offsets(); + cudf::detail::host_vector calculate_page_string_offsets(); /** * @brief Converts the page data and outputs to columns. diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu index c588fedb85c..27312a4da89 100644 --- a/cpp/src/io/parquet/reader_impl_chunking.cu +++ b/cpp/src/io/parquet/reader_impl_chunking.cu @@ -371,11 +371,11 @@ int64_t find_next_split(int64_t cur_pos, * * @return A tuple of Parquet clock rate and Parquet decimal type. */ -[[nodiscard]] std::tuple> conversion_info( +[[nodiscard]] std::tuple> conversion_info( type_id column_type_id, type_id timestamp_type_id, Type physical, - cuda::std::optional logical_type) + std::optional logical_type) { int32_t const clock_rate = is_chrono(data_type{column_type_id}) ? to_clockrate(timestamp_type_id) : 0; @@ -386,11 +386,11 @@ int64_t find_next_split(int64_t cur_pos, // if decimal but not outputting as float or decimal, then convert to no logical type if (column_type_id != type_id::FLOAT64 and not cudf::is_fixed_point(data_type{column_type_id})) { - return std::make_tuple(clock_rate, cuda::std::nullopt); + return {clock_rate, std::nullopt}; } } - return std::make_tuple(clock_rate, std::move(logical_type)); + return {clock_rate, std::move(logical_type)}; } /** diff --git a/cpp/src/io/parquet/reader_impl_chunking.hpp b/cpp/src/io/parquet/reader_impl_chunking.hpp index 3a3cdd34a58..ca46f198bb8 100644 --- a/cpp/src/io/parquet/reader_impl_chunking.hpp +++ b/cpp/src/io/parquet/reader_impl_chunking.hpp @@ -107,7 +107,7 @@ struct subpass_intermediate_data { * rowgroups may represent less than all of the rowgroups to be read for the file. 
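The reader_impl.cu hunk above stops issuing one `cudaMemcpyAsync` per list/string column to write the trailing offset and instead collects every (destination, value) pair into `out_buffers`/`final_offsets`, then makes a single batched `WriteFinalOffsets` call. A minimal sketch of how such a batched scatter could look, assuming the destinations are raw `size_type*` pointers and using a Thrust `for_each` (the real `WriteFinalOffsets` body is not shown in this diff, so names and mechanism here are illustrative; the device lambda needs `--extended-lambda`, which cudf builds with):

```cpp
#include <cudf/detail/utilities/vector_factories.hpp>
#include <cudf/types.hpp>
#include <cudf/utilities/span.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/exec_policy.hpp>

#include <thrust/for_each.h>
#include <thrust/iterator/counting_iterator.h>

// Hypothetical stand-in for WriteFinalOffsets: one H2D copy for the values,
// one for the destination pointers, then one launch that does all the writes.
void write_final_offsets_sketch(cudf::host_span<cudf::size_type const> values,
                                cudf::host_span<cudf::size_type* const> dsts,
                                rmm::cuda_stream_view stream)
{
  auto const mr     = cudf::get_current_device_resource_ref();
  auto const d_vals = cudf::detail::make_device_uvector_async(values, stream, mr);
  auto const d_dsts = cudf::detail::make_device_uvector_async(dsts, stream, mr);
  thrust::for_each(rmm::exec_policy_nosync(stream),
                   thrust::counting_iterator<std::size_t>(0),
                   thrust::counting_iterator<std::size_t>(values.size()),
                   [vals = d_vals.data(), ptrs = d_dsts.data()] __device__(std::size_t i) {
                     *ptrs[i] = vals[i];  // one 4-byte write per column, all in one launch
                   });
}
```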
*/ struct pass_intermediate_data { - std::vector> raw_page_data; + std::vector raw_page_data; // rowgroup, chunk and page information for the current pass. bool has_compressed_data{false}; @@ -130,6 +130,9 @@ struct pass_intermediate_data { rmm::device_buffer decomp_dict_data{0, cudf::get_default_stream()}; rmm::device_uvector str_dict_index{0, cudf::get_default_stream()}; + // cumulative strings column sizes. + std::vector cumulative_col_string_sizes{}; + int level_type_size{0}; // skip_rows / num_rows for this pass. diff --git a/cpp/src/io/parquet/reader_impl_helpers.cpp b/cpp/src/io/parquet/reader_impl_helpers.cpp index 6d566b5815e..a6562d33de2 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.cpp +++ b/cpp/src/io/parquet/reader_impl_helpers.cpp @@ -38,7 +38,7 @@ namespace flatbuf = cudf::io::parquet::flatbuf; namespace { -cuda::std::optional converted_to_logical_type(SchemaElement const& schema) +std::optional converted_to_logical_type(SchemaElement const& schema) { if (schema.converted_type.has_value()) { switch (schema.converted_type.value()) { @@ -66,7 +66,7 @@ cuda::std::optional converted_to_logical_type(SchemaElement const& default: return LogicalType{LogicalType::UNDEFINED}; } } - return cuda::std::nullopt; + return std::nullopt; } } // namespace @@ -246,7 +246,7 @@ void metadata::sanitize_schema() struct_elem.repetition_type = REQUIRED; struct_elem.num_children = schema_elem.num_children; struct_elem.type = UNDEFINED_TYPE; - struct_elem.converted_type = cuda::std::nullopt; + struct_elem.converted_type = std::nullopt; // swap children struct_elem.children_idx = std::move(schema_elem.children_idx); diff --git a/cpp/src/io/parquet/reader_impl_helpers.hpp b/cpp/src/io/parquet/reader_impl_helpers.hpp index 6487c92f48f..fd692c0cdd6 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.hpp +++ b/cpp/src/io/parquet/reader_impl_helpers.hpp @@ -425,7 +425,7 @@ class named_to_reference_converter : public ast::detail::expression_transformer private: std::vector> visit_operands( - std::vector> operands); + cudf::host_span const> operands); std::unordered_map column_name_to_index; std::optional> _stats_expr; diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu index 8e67f233213..bcdae4cbd3b 100644 --- a/cpp/src/io/parquet/reader_impl_preprocess.cu +++ b/cpp/src/io/parquet/reader_impl_preprocess.cu @@ -19,9 +19,9 @@ #include #include +#include #include #include -#include #include #include @@ -44,6 +44,7 @@ #include #include +#include #include namespace cudf::io::parquet::detail { @@ -217,7 +218,7 @@ void generate_depth_remappings( */ [[nodiscard]] std::future read_column_chunks_async( std::vector> const& sources, - std::vector>& page_data, + cudf::host_span page_data, cudf::detail::hostdevice_vector& chunks, size_t begin_chunk, size_t end_chunk, @@ -250,23 +251,24 @@ void generate_depth_remappings( if (source->is_device_read_preferred(io_size)) { // Buffer needs to be padded. // Required by `gpuDecodePageData`. - auto buffer = + page_data[chunk] = rmm::device_buffer(cudf::util::round_up_safe(io_size, BUFFER_PADDING_MULTIPLE), stream); auto fut_read_size = source->device_read_async( - io_offset, io_size, static_cast(buffer.data()), stream); + io_offset, io_size, static_cast(page_data[chunk].data()), stream); read_tasks.emplace_back(std::move(fut_read_size)); - page_data[chunk] = datasource::buffer::create(std::move(buffer)); } else { auto const read_buffer = source->host_read(io_offset, io_size); // Buffer needs to be padded. 
// Required by `gpuDecodePageData`. - auto tmp_buffer = rmm::device_buffer( + page_data[chunk] = rmm::device_buffer( cudf::util::round_up_safe(read_buffer->size(), BUFFER_PADDING_MULTIPLE), stream); - CUDF_CUDA_TRY(cudaMemcpyAsync( - tmp_buffer.data(), read_buffer->data(), read_buffer->size(), cudaMemcpyDefault, stream)); - page_data[chunk] = datasource::buffer::create(std::move(tmp_buffer)); + CUDF_CUDA_TRY(cudaMemcpyAsync(page_data[chunk].data(), + read_buffer->data(), + read_buffer->size(), + cudaMemcpyDefault, + stream)); } - auto d_compdata = page_data[chunk]->data(); + auto d_compdata = static_cast(page_data[chunk].data()); do { chunks[chunk].compressed_data = d_compdata; d_compdata += chunks[chunk].compressed_size; @@ -727,7 +729,10 @@ struct set_final_row_count { if (i < pages.size() - 1 && (pages[i + 1].chunk_idx == page.chunk_idx)) { return; } size_t const page_start_row = chunk.start_row + page.chunk_row; size_t const chunk_last_row = chunk.start_row + chunk.num_rows; - page.num_rows = chunk_last_row - page_start_row; + // Mark `is_num_rows_adjusted` to signal string decoders that the `num_rows` of this page has + // been adjusted. + page.is_num_rows_adjusted = page.num_rows != (chunk_last_row - page_start_row); + page.num_rows = chunk_last_row - page_start_row; } }; @@ -964,7 +969,7 @@ void reader::impl::allocate_level_decode_space() } } -std::pair>> reader::impl::read_column_chunks() +std::pair> reader::impl::read_column_chunks() { auto const& row_groups_info = _pass_itm_data->row_groups; @@ -979,7 +984,7 @@ std::pair>> reader::impl::read_column_chunks std::vector chunk_source_map(num_chunks); // Tracker for eventually deallocating compressed and uncompressed data - raw_page_data = std::vector>(num_chunks); + raw_page_data = std::vector(num_chunks); // Keep track of column chunk file offsets std::vector column_chunk_offsets(num_chunks); @@ -989,7 +994,6 @@ std::pair>> reader::impl::read_column_chunks // TODO: make this respect the pass-wide skip_rows/num_rows instead of the file-wide // skip_rows/num_rows // auto remaining_rows = num_rows; - std::vector> read_chunk_tasks; size_type chunk_count = 0; for (auto const& rg : row_groups_info) { auto const& row_group = _metadata->get_row_group(rg.index, rg.source_index); @@ -1018,16 +1022,15 @@ std::pair>> reader::impl::read_column_chunks } // Read compressed chunk data to device memory - read_chunk_tasks.push_back(read_column_chunks_async(_sources, - raw_page_data, - chunks, - 0, - chunks.size(), - column_chunk_offsets, - chunk_source_map, - _stream)); - - return {total_decompressed_size > 0, std::move(read_chunk_tasks)}; + return {total_decompressed_size > 0, + read_column_chunks_async(_sources, + raw_page_data, + chunks, + 0, + chunks.size(), + column_chunk_offsets, + chunk_source_map, + _stream)}; } void reader::impl::read_compressed_data() @@ -1042,9 +1045,7 @@ void reader::impl::read_compressed_data() auto const [has_compressed_data, read_chunks_tasks] = read_column_chunks(); pass.has_compressed_data = has_compressed_data; - for (auto& task : read_chunks_tasks) { - task.wait(); - } + read_chunks_tasks.wait(); // Process dataset chunk pages into output columns auto const total_pages = _has_page_index ? 
count_page_headers_with_pgidx(chunks, _stream) @@ -1596,36 +1597,68 @@ void reader::impl::allocate_columns(read_mode mode, size_t skip_rows, size_t num auto const d_cols_info = cudf::detail::make_device_uvector_async( h_cols_info, _stream, cudf::get_current_device_resource_ref()); - auto const num_keys = _input_columns.size() * max_depth * subpass.pages.size(); - // size iterator. indexes pages by sorted order - rmm::device_uvector<size_t> size_input{num_keys, _stream}; - thrust::transform( - rmm::exec_policy(_stream), - thrust::make_counting_iterator<size_t>(0), - thrust::make_counting_iterator(num_keys), - size_input.begin(), - get_page_nesting_size{ - d_cols_info.data(), max_depth, subpass.pages.size(), subpass.pages.device_begin()}); - auto const reduction_keys = - cudf::detail::make_counting_transform_iterator(0, get_reduction_key{subpass.pages.size()}); + // Vector to store page sizes for each column at each depth cudf::detail::hostdevice_vector<size_t> sizes{_input_columns.size() * max_depth, _stream}; - // find the size of each column - thrust::reduce_by_key(rmm::exec_policy(_stream), - reduction_keys, - reduction_keys + num_keys, - size_input.cbegin(), - thrust::make_discard_iterator(), - sizes.d_begin()); - - // for nested hierarchies, compute per-page start offset - thrust::exclusive_scan_by_key( - rmm::exec_policy(_stream), - reduction_keys, - reduction_keys + num_keys, - size_input.cbegin(), - start_offset_output_iterator{ - subpass.pages.device_begin(), 0, d_cols_info.data(), max_depth, subpass.pages.size()}); + // Total number of keys to process + auto const num_keys = _input_columns.size() * max_depth * subpass.pages.size(); + + // Maximum 1 billion keys processed per iteration + auto constexpr max_keys_per_iter = + static_cast<size_t>(std::numeric_limits<size_type>::max() / 2); + + // Number of keys per column + auto const num_keys_per_col = max_depth * subpass.pages.size(); + + // The largest multiple of `num_keys_per_col` that is <= `num_keys` + auto const num_keys_per_iter = + num_keys <= max_keys_per_iter + ? num_keys + : num_keys_per_col * std::max<size_t>(1, max_keys_per_iter / num_keys_per_col); + + // Size iterator. Indexes pages by sorted order + rmm::device_uvector<size_t> size_input{num_keys_per_iter, _stream}; + + // To keep track of the starting key of an iteration + size_t key_start = 0; + // Loop until all keys are processed + while (key_start < num_keys) { + // Number of keys processed in this iteration + auto const num_keys_this_iter = std::min(num_keys_per_iter, num_keys - key_start); + thrust::transform( + rmm::exec_policy_nosync(_stream), + thrust::make_counting_iterator(key_start), + thrust::make_counting_iterator(key_start + num_keys_this_iter), + size_input.begin(), + get_page_nesting_size{ + d_cols_info.data(), max_depth, subpass.pages.size(), subpass.pages.device_begin()});
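Thrust algorithms can overflow their internal 32-bit indexing when handed more than about 2^31 elements in a single call, which is why the loop above caps each iteration at `max_keys_per_iter` and rounds the cap down to a whole number of columns so that no column's keys straddle two iterations. A self-contained sketch of the same chunking pattern, with illustrative names and a plain `thrust::reduce` standing in for the transform/reduce_by_key/scan pipeline:

```cpp
#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/exec_policy.hpp>

#include <thrust/reduce.h>

#include <algorithm>
#include <cstddef>
#include <limits>

// Sum a device array that may hold more elements than a single 32-bit-indexed
// thrust call should see, by reducing it in bounded chunks.
std::size_t chunked_sum(rmm::device_uvector<std::size_t> const& data,
                        rmm::cuda_stream_view stream)
{
  auto constexpr max_per_iter = static_cast<std::size_t>(std::numeric_limits<int>::max() / 2);
  std::size_t total = 0;
  for (std::size_t start = 0; start < data.size(); start += max_per_iter) {
    auto const n = std::min(max_per_iter, data.size() - start);
    total += thrust::reduce(rmm::exec_policy(stream),  // synchronous: result needed on host
                            data.begin() + start,
                            data.begin() + start + n,
                            std::size_t{0});
  }
  return total;
}
```

+ + // Manually create a size_t `key_start` compatible counting_transform_iterator.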
+ auto const reduction_keys = + thrust::make_transform_iterator(thrust::make_counting_iterator(key_start), + get_reduction_key{subpass.pages.size()}); + + // Find the size of each column + thrust::reduce_by_key(rmm::exec_policy_nosync(_stream), + reduction_keys, + reduction_keys + num_keys_this_iter, + size_input.cbegin(), + thrust::make_discard_iterator(), + sizes.d_begin() + (key_start / subpass.pages.size())); + + // For nested hierarchies, compute per-page start offset + thrust::exclusive_scan_by_key(rmm::exec_policy_nosync(_stream), + reduction_keys, + reduction_keys + num_keys_this_iter, + size_input.cbegin(), + start_offset_output_iterator{subpass.pages.device_begin(), + key_start, + d_cols_info.data(), + max_depth, + subpass.pages.size()}); + // Increment the key_start + key_start += num_keys_this_iter; + } sizes.device_to_host_sync(_stream); for (size_type idx = 0; idx < static_cast(_input_columns.size()); idx++) { @@ -1660,21 +1693,20 @@ void reader::impl::allocate_columns(read_mode mode, size_t skip_rows, size_t num } } - cudf::io::detail::batched_memset(memset_bufs, static_cast(0), _stream); + cudf::detail::batched_memset(memset_bufs, static_cast(0), _stream); // Need to set null mask bufs to all high bits - cudf::io::detail::batched_memset( + cudf::detail::batched_memset( nullmask_bufs, std::numeric_limits::max(), _stream); } -std::vector reader::impl::calculate_page_string_offsets() +cudf::detail::host_vector reader::impl::calculate_page_string_offsets() { auto& pass = *_pass_itm_data; auto& subpass = *pass.subpass; auto page_keys = make_page_key_iterator(subpass.pages); - std::vector col_sizes(_input_columns.size(), 0L); - rmm::device_uvector d_col_sizes(col_sizes.size(), _stream); + rmm::device_uvector d_col_sizes(_input_columns.size(), _stream); // use page_index to fetch page string sizes in the proper order auto val_iter = thrust::make_transform_iterator(subpass.pages.device_begin(), @@ -1688,7 +1720,7 @@ std::vector reader::impl::calculate_page_string_offsets() page_offset_output_iter{subpass.pages.device_ptr()}); // now sum up page sizes - rmm::device_uvector reduce_keys(col_sizes.size(), _stream); + rmm::device_uvector reduce_keys(d_col_sizes.size(), _stream); thrust::reduce_by_key(rmm::exec_policy_nosync(_stream), page_keys, page_keys + subpass.pages.size(), @@ -1696,14 +1728,7 @@ std::vector reader::impl::calculate_page_string_offsets() reduce_keys.begin(), d_col_sizes.begin()); - cudaMemcpyAsync(col_sizes.data(), - d_col_sizes.data(), - sizeof(size_t) * col_sizes.size(), - cudaMemcpyDeviceToHost, - _stream); - _stream.synchronize(); - - return col_sizes; + return cudf::detail::make_host_vector_sync(d_col_sizes, _stream); } } // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/rle_stream.cuh b/cpp/src/io/parquet/rle_stream.cuh index 4a0791d5c54..3c49de0c997 100644 --- a/cpp/src/io/parquet/rle_stream.cuh +++ b/cpp/src/io/parquet/rle_stream.cuh @@ -19,6 +19,7 @@ #include "parquet_gpu.hpp" #include +#include namespace cudf::io::parquet::detail { @@ -151,7 +152,6 @@ __device__ inline void decode(level_t* const output, } // a single rle run. 
may be broken up into multiple rle_batches -template struct rle_run { int size; // total size of the run int output_pos; // absolute position of this run w.r.t output @@ -182,14 +182,14 @@ struct rle_stream { level_t* output; - rle_run* runs; + rle_run* runs; int output_pos; int fill_index; int decode_index; - __device__ rle_stream(rle_run* _runs) : runs(_runs) {} + __device__ rle_stream(rle_run* _runs) : runs(_runs) {} __device__ inline bool is_last_decode_warp(int warp_id) { @@ -216,6 +216,26 @@ struct rle_stream { decode_index = -1; // signals the first iteration. Nothing to decode. } + __device__ inline int get_rle_run_info(rle_run& run) + { + run.start = cur; + run.level_run = get_vlq32(run.start, end); + + // run_bytes includes the header size + int run_bytes = run.start - cur; + if (is_literal_run(run.level_run)) { + // from the parquet spec: literal runs always come in multiples of 8 values. + run.size = (run.level_run >> 1) * 8; + run_bytes += util::div_rounding_up_unsafe(run.size * level_bits, 8); + } else { + // repeated value run + run.size = (run.level_run >> 1); + run_bytes += util::div_rounding_up_unsafe(level_bits, 8); + } + + return run_bytes; + } + __device__ inline void fill_run_batch() { // decode_index == -1 means we are on the very first decode iteration for this stream. @@ -226,31 +246,14 @@ struct rle_stream { while (((decode_index == -1 && fill_index < num_rle_stream_decode_warps) || fill_index < decode_index + run_buffer_size) && cur < end) { - auto& run = runs[rolling_index(fill_index)]; - // Encoding::RLE + // Pass by reference to fill the runs shared memory with the run data + auto& run = runs[rolling_index(fill_index)]; + int const run_bytes = get_rle_run_info(run); - // bytes for the varint header - uint8_t const* _cur = cur; - int const level_run = get_vlq32(_cur, end); - // run_bytes includes the header size - int run_bytes = _cur - cur; - - // literal run - if (is_literal_run(level_run)) { - // from the parquet spec: literal runs always come in multiples of 8 values. - run.size = (level_run >> 1) * 8; - run_bytes += ((run.size * level_bits) + 7) >> 3; - } - // repeated value run - else { - run.size = (level_run >> 1); - run_bytes += ((level_bits) + 7) >> 3; - } - run.output_pos = output_pos; - run.start = _cur; - run.level_run = level_run; run.remaining = run.size; + run.output_pos = output_pos; + cur += run_bytes; output_pos += run.size; fill_index++; @@ -372,6 +375,39 @@ struct rle_stream { return values_processed_shared; } + __device__ inline int skip_runs(int target_count) + { + // we want to process all runs UP TO BUT NOT INCLUDING the run that overlaps with the skip + // amount so threads spin like crazy on fill_run_batch(), skipping writing unnecessary run info. + // then when it hits the one that matters, we don't process it at all and bail as if we never + // started basically we're setting up the rle_stream vars necessary to start fill_run_batch for + // the first time + while (cur < end) { + rle_run run; + int run_bytes = get_rle_run_info(run); + + if ((output_pos + run.size) > target_count) { + return output_pos; // bail! 
we've reached the starting run + } + + // skip this run + output_pos += run.size; + cur += run_bytes; + } + + return output_pos; // we skipped everything + } + + __device__ inline int skip_decode(int t, int count) + { + int const output_count = min(count, total_values - cur_values); + + // if level_bits == 0, there's nothing to do + // a very common case: columns with no nulls, especially if they are non-nested + cur_values = (level_bits == 0) ? output_count : skip_runs(output_count); + return cur_values; + } + __device__ inline int decode_next(int t) { return decode_next(t, max_output_values); } }; diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index ec05f35d405..f865c9a7643 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -183,10 +183,10 @@ struct aggregate_writer_metadata { std::vector row_groups; std::vector key_value_metadata; std::vector offset_indexes; - std::vector> column_indexes; + std::vector> column_indexes; }; std::vector files; - cuda::std::optional> column_orders = cuda::std::nullopt; + std::optional> column_orders = std::nullopt; }; namespace { @@ -472,7 +472,7 @@ struct leaf_schema_fn { std::enable_if_t, void> operator()() { col_schema.type = (timestamp_is_int96) ? Type::INT96 : Type::INT64; - col_schema.converted_type = cuda::std::nullopt; + col_schema.converted_type = std::nullopt; col_schema.stats_dtype = statistics_dtype::dtype_timestamp64; if (timestamp_is_int96) { col_schema.ts_scale = -1000; // negative value indicates division by absolute value @@ -750,7 +750,7 @@ std::vector construct_parquet_schema_tree( col_schema.type = Type::BYTE_ARRAY; } - col_schema.converted_type = cuda::std::nullopt; + col_schema.converted_type = std::nullopt; col_schema.stats_dtype = statistics_dtype::dtype_byte_array; col_schema.repetition_type = col_nullable ? OPTIONAL : REQUIRED; col_schema.name = (schema[parent_idx].name == "list") ? 
"element" : col_meta.get_name(); @@ -1543,12 +1543,7 @@ void encode_pages(hostdevice_2dvector& chunks, d_chunks.flat_view(), {column_stats, pages.size()}, column_index_truncate_length, stream); } - auto h_chunks = chunks.host_view(); - CUDF_CUDA_TRY(cudaMemcpyAsync(h_chunks.data(), - d_chunks.data(), - d_chunks.flat_view().size_bytes(), - cudaMemcpyDefault, - stream.value())); + chunks.device_to_host_async(stream); if (comp_stats.has_value()) { comp_stats.value() += collect_compression_statistics(comp_in, comp_res, stream); @@ -2559,12 +2554,11 @@ void writer::impl::write_parquet_data_to_sink( } else { CUDF_EXPECTS(bounce_buffer.size() >= ck.compressed_size, "Bounce buffer was not properly initialized."); - CUDF_CUDA_TRY(cudaMemcpyAsync(bounce_buffer.data(), - dev_bfr + ck.ck_stat_size, - ck.compressed_size, - cudaMemcpyDefault, - _stream.value())); - _stream.synchronize(); + cudf::detail::cuda_memcpy( + host_span{bounce_buffer}.subspan(0, ck.compressed_size), + device_span{dev_bfr + ck.ck_stat_size, ck.compressed_size}, + _stream); + _out_sink[p]->host_write(bounce_buffer.data(), ck.compressed_size); } @@ -2600,13 +2594,8 @@ void writer::impl::write_parquet_data_to_sink( auto const& column_chunk_meta = row_group.columns[i].meta_data; // start transfer of the column index - std::vector column_idx; - column_idx.resize(ck.column_index_size); - CUDF_CUDA_TRY(cudaMemcpyAsync(column_idx.data(), - ck.column_index_blob, - ck.column_index_size, - cudaMemcpyDefault, - _stream.value())); + auto column_idx = cudf::detail::make_host_vector_async( + device_span{ck.column_index_blob, ck.column_index_size}, _stream); // calculate offsets while the column index is transferring int64_t curr_pg_offset = column_chunk_meta.data_page_offset; @@ -2795,7 +2784,7 @@ std::unique_ptr> writer::merge_row_group_metadata( // See https://github.com/rapidsai/cudf/pull/14264#issuecomment-1778311615 for (auto& se : md.schema) { if (se.logical_type.has_value() && se.logical_type.value().type == LogicalType::UNKNOWN) { - se.logical_type = cuda::std::nullopt; + se.logical_type = std::nullopt; } } diff --git a/cpp/src/io/text/bgzip_data_chunk_source.cu b/cpp/src/io/text/bgzip_data_chunk_source.cu index badcd3f58f9..06069630685 100644 --- a/cpp/src/io/text/bgzip_data_chunk_source.cu +++ b/cpp/src/io/text/bgzip_data_chunk_source.cu @@ -74,8 +74,8 @@ class bgzip_data_chunk_reader : public data_chunk_reader { // Buffer needs to be padded. // Required by `inflate_kernel`. 
device.resize(cudf::util::round_up_safe(host.size(), BUFFER_PADDING_MULTIPLE), stream); - CUDF_CUDA_TRY(cudaMemcpyAsync( - device.data(), host.data(), host.size() * sizeof(T), cudaMemcpyDefault, stream.value())); + cudf::detail::cuda_memcpy_async( + device_span{device}.subspan(0, host.size()), host, stream); } struct decompression_blocks { diff --git a/cpp/src/io/text/data_chunk_source_factories.cpp b/cpp/src/io/text/data_chunk_source_factories.cpp index 58faa0ebfe4..f4a2f29026a 100644 --- a/cpp/src/io/text/data_chunk_source_factories.cpp +++ b/cpp/src/io/text/data_chunk_source_factories.cpp @@ -22,10 +22,6 @@ #include #include -#include - -#include - #include namespace cudf::io::text { @@ -87,8 +83,10 @@ class datasource_chunk_reader : public data_chunk_reader { _source->host_read(_offset, read_size, reinterpret_cast(h_ticket.buffer.data())); // copy the host-pinned data on to device - CUDF_CUDA_TRY(cudaMemcpyAsync( - chunk.data(), h_ticket.buffer.data(), read_size, cudaMemcpyDefault, stream.value())); + cudf::detail::cuda_memcpy_async( + device_span{chunk}.subspan(0, read_size), + host_span{h_ticket.buffer}.subspan(0, read_size), + stream); // record the host-to-device copy. CUDF_CUDA_TRY(cudaEventRecord(h_ticket.event, stream.value())); @@ -153,8 +151,10 @@ class istream_data_chunk_reader : public data_chunk_reader { auto chunk = rmm::device_uvector(read_size, stream); // copy the host-pinned data on to device - CUDF_CUDA_TRY(cudaMemcpyAsync( - chunk.data(), h_ticket.buffer.data(), read_size, cudaMemcpyDefault, stream.value())); + cudf::detail::cuda_memcpy_async( + device_span{chunk}.subspan(0, read_size), + host_span{h_ticket.buffer}.subspan(0, read_size), + stream); // record the host-to-device copy. CUDF_CUDA_TRY(cudaEventRecord(h_ticket.event, stream.value())); @@ -193,12 +193,10 @@ class host_span_data_chunk_reader : public data_chunk_reader { auto chunk = rmm::device_uvector(read_size, stream); // copy the host data to device - CUDF_CUDA_TRY(cudaMemcpyAsync( // - chunk.data(), - _data.data() + _position, - read_size, - cudaMemcpyDefault, - stream.value())); + cudf::detail::cuda_memcpy_async( + cudf::device_span{chunk}.subspan(0, read_size), + cudf::host_span{_data}.subspan(_position, read_size), + stream); _position += read_size; diff --git a/cpp/src/io/utilities/column_buffer.cpp b/cpp/src/io/utilities/column_buffer.cpp index 249dc3b5875..41ed55cd090 100644 --- a/cpp/src/io/utilities/column_buffer.cpp +++ b/cpp/src/io/utilities/column_buffer.cpp @@ -21,12 +21,12 @@ #include "column_buffer.hpp" +#include #include #include #include #include -#include #include namespace cudf::io::detail { @@ -63,9 +63,11 @@ void cudf::io::detail::inline_column_buffer::allocate_strings_data(bool memset_d } void cudf::io::detail::inline_column_buffer::create_string_data(size_t num_bytes, + bool is_large_strings_col, rmm::cuda_stream_view stream) { - _string_data = rmm::device_buffer(num_bytes, stream, _mr); + _is_large_strings_col = is_large_strings_col; + _string_data = rmm::device_buffer(num_bytes, stream, _mr); } namespace { diff --git a/cpp/src/io/utilities/column_buffer.hpp b/cpp/src/io/utilities/column_buffer.hpp index e73b2bc88de..da19539f509 100644 --- a/cpp/src/io/utilities/column_buffer.hpp +++ b/cpp/src/io/utilities/column_buffer.hpp @@ -22,12 +22,9 @@ #pragma once #include -#include #include -#include #include #include -#include #include #include @@ -35,6 +32,8 @@ #include +#include + namespace cudf { namespace io { namespace detail { @@ -247,13 +246,17 @@ class inline_column_buffer : 
public column_buffer_base { [[nodiscard]] size_t data_size_impl() const { return _data.size(); } std::unique_ptr make_string_column_impl(rmm::cuda_stream_view stream); - void create_string_data(size_t num_bytes, rmm::cuda_stream_view stream); + void create_string_data(size_t num_bytes, + bool is_large_strings_col, + rmm::cuda_stream_view stream); void* string_data() { return _string_data.data(); } [[nodiscard]] void const* string_data() const { return _string_data.data(); } [[nodiscard]] size_t string_size() const { return _string_data.size(); } + [[nodiscard]] bool is_large_strings_column() const { return _is_large_strings_col; } private: rmm::device_buffer _string_data{}; + bool _is_large_strings_col{}; }; using column_buffer = gather_column_buffer; diff --git a/cpp/src/io/utilities/column_buffer_strings.cu b/cpp/src/io/utilities/column_buffer_strings.cu index 4bc303a34a5..66d0a644c12 100644 --- a/cpp/src/io/utilities/column_buffer_strings.cu +++ b/cpp/src/io/utilities/column_buffer_strings.cu @@ -27,8 +27,7 @@ std::unique_ptr cudf::io::detail::inline_column_buffer::make_string_colu { // if the size of _string_data is over the threshold for 64bit size_type, _data will contain // sizes rather than offsets. need special handling for that case. - auto const threshold = static_cast(strings::detail::get_offset64_threshold()); - if (_string_data.size() > threshold) { + if (is_large_strings_column()) { if (not strings::detail::is_large_strings_enabled()) { CUDF_FAIL("String column exceeds the column size limit", std::overflow_error); } diff --git a/cpp/src/io/utilities/config_utils.cpp b/cpp/src/io/utilities/config_utils.cpp index a3afbd52896..cea0ebad8f5 100644 --- a/cpp/src/io/utilities/config_utils.cpp +++ b/cpp/src/io/utilities/config_utils.cpp @@ -16,11 +16,10 @@ #include "getenv_or.hpp" -#include #include -#include -#include +#include + #include namespace cudf::io { @@ -53,6 +52,18 @@ bool is_gds_enabled() { return is_always_enabled() or get_env_policy() == usage_ bool is_kvikio_enabled() { return get_env_policy() == usage_policy::KVIKIO; } +void set_up_kvikio() +{ + static std::once_flag flag{}; + std::call_once(flag, [] { + auto const compat_mode = + kvikio::detail::getenv_or("KVIKIO_COMPAT_MODE", kvikio::CompatMode::ON); + kvikio::defaults::compat_mode_reset(compat_mode); + + auto const nthreads = getenv_or("KVIKIO_NTHREADS", 4u); + kvikio::defaults::thread_pool_nthreads_reset(nthreads); + }); +} } // namespace cufile_integration namespace nvcomp_integration { @@ -81,5 +92,4 @@ bool is_all_enabled() { return get_env_policy() == usage_policy::ALWAYS; } bool is_stable_enabled() { return is_all_enabled() or get_env_policy() == usage_policy::STABLE; } } // namespace nvcomp_integration - } // namespace cudf::io diff --git a/cpp/src/io/utilities/data_casting.cu b/cpp/src/io/utilities/data_casting.cu index f70171eef68..0c49b2e5d78 100644 --- a/cpp/src/io/utilities/data_casting.cu +++ b/cpp/src/io/utilities/data_casting.cu @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -800,7 +801,7 @@ template static std::unique_ptr parse_string(string_view_pair_it str_tuples, size_type col_size, rmm::device_buffer&& null_mask, - rmm::device_scalar& d_null_count, + cudf::detail::device_scalar& d_null_count, cudf::io::parse_options_view const& options, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) @@ -930,7 +931,7 @@ std::unique_ptr parse_data( CUDF_FUNC_RANGE(); if (col_size == 0) { return make_empty_column(col_type); } - auto d_null_count = 
rmm::device_scalar(null_count, stream); + auto d_null_count = cudf::detail::device_scalar(null_count, stream); auto null_count_data = d_null_count.data(); if (null_mask.is_empty()) { null_mask = cudf::create_null_mask(col_size, mask_state::ALL_VALID, stream, mr); diff --git a/cpp/src/io/utilities/data_sink.cpp b/cpp/src/io/utilities/data_sink.cpp index 1dbb9369115..b37a5ac900a 100644 --- a/cpp/src/io/utilities/data_sink.cpp +++ b/cpp/src/io/utilities/data_sink.cpp @@ -42,15 +42,17 @@ class file_sink : public data_sink { if (!_output_stream.is_open()) { detail::throw_on_file_open_failure(filepath, true); } if (cufile_integration::is_kvikio_enabled()) { + cufile_integration::set_up_kvikio(); _kvikio_file = kvikio::FileHandle(filepath, "w"); CUDF_LOG_INFO("Writing a file using kvikIO, with compatibility mode {}.", - _kvikio_file.is_compat_mode_on() ? "on" : "off"); + _kvikio_file.is_compat_mode_preferred() ? "on" : "off"); } else { _cufile_out = detail::make_cufile_output(filepath); } } - ~file_sink() override { flush(); } + // Marked as NOLINT because we are calling a virtual method in the destructor + ~file_sink() override { flush(); } // NOLINT void host_write(void const* data, size_t size) override { @@ -70,8 +72,12 @@ class file_sink : public data_sink { [[nodiscard]] bool is_device_write_preferred(size_t size) const override { - if (size < _gds_write_preferred_threshold) { return false; } - return supports_device_write(); + if (!supports_device_write()) { return false; } + + // Always prefer device writes if kvikio is enabled + if (!_kvikio_file.closed()) { return true; } + + return size >= _gds_write_preferred_threshold; } std::future device_write_async(void const* gpu_data, @@ -114,7 +120,8 @@ class host_buffer_sink : public data_sink { public: explicit host_buffer_sink(std::vector* buffer) : buffer_(buffer) {} - ~host_buffer_sink() override { flush(); } + // Marked as NOLINT because we are calling a virtual method in the destructor + ~host_buffer_sink() override { flush(); } // NOLINT void host_write(void const* data, size_t size) override { diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index e4313eba454..10814eea458 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -15,8 +15,10 @@ */ #include "file_io_utilities.hpp" +#include "getenv_or.hpp" #include +#include #include #include #include @@ -31,7 +33,12 @@ #include #include -#include +#include +#include + +#ifdef CUDF_KVIKIO_REMOTE_IO +#include +#endif namespace cudf { namespace io { @@ -46,14 +53,39 @@ class file_source : public datasource { { detail::force_init_cuda_context(); if (cufile_integration::is_kvikio_enabled()) { + cufile_integration::set_up_kvikio(); _kvikio_file = kvikio::FileHandle(filepath); CUDF_LOG_INFO("Reading a file using kvikIO, with compatibility mode {}.", - _kvikio_file.is_compat_mode_on() ? "on" : "off"); + _kvikio_file.is_compat_mode_preferred() ? 
"on" : "off"); } else { _cufile_in = detail::make_cufile_input(filepath); } } + std::unique_ptr host_read(size_t offset, size_t size) override + { + lseek(_file.desc(), offset, SEEK_SET); + + // Clamp length to available data + ssize_t const read_size = std::min(size, _file.size() - offset); + + std::vector v(read_size); + CUDF_EXPECTS(read(_file.desc(), v.data(), read_size) == read_size, "read failed"); + return buffer::create(std::move(v)); + } + + size_t host_read(size_t offset, size_t size, uint8_t* dst) override + { + lseek(_file.desc(), offset, SEEK_SET); + + // Clamp length to available data + auto const read_size = std::min(size, _file.size() - offset); + + CUDF_EXPECTS(read(_file.desc(), dst, read_size) == static_cast(read_size), + "read failed"); + return read_size; + } + ~file_source() override = default; [[nodiscard]] bool supports_device_read() const override @@ -63,8 +95,12 @@ class file_source : public datasource { [[nodiscard]] bool is_device_read_preferred(size_t size) const override { - if (size < _gds_read_preferred_threshold) { return false; } - return supports_device_read(); + if (!supports_device_read()) { return false; } + + // Always prefer device reads if kvikio is enabled + if (!_kvikio_file.closed()) { return true; } + + return size >= _gds_read_preferred_threshold; } std::future device_read_async(size_t offset, @@ -109,27 +145,6 @@ class file_source : public datasource { static constexpr size_t _gds_read_preferred_threshold = 128 << 10; // 128KB }; -/** - * @brief Memoized pageableMemoryAccessUsesHostPageTables device property. - */ -[[nodiscard]] bool pageableMemoryAccessUsesHostPageTables() -{ - static std::unordered_map result_cache{}; - - int deviceId{}; - CUDF_CUDA_TRY(cudaGetDevice(&deviceId)); - - if (result_cache.find(deviceId) == result_cache.end()) { - cudaDeviceProp props{}; - CUDF_CUDA_TRY(cudaGetDeviceProperties(&props, deviceId)); - result_cache[deviceId] = (props.pageableMemoryAccessUsesHostPageTables == 1); - CUDF_LOG_INFO( - "Device {} pageableMemoryAccessUsesHostPageTables: {}", deviceId, result_cache[deviceId]); - } - - return result_cache[deviceId]; -} - /** * @brief Implementation class for reading from a file using memory mapped access. 
* @@ -138,40 +153,53 @@ class file_source : public datasource { */ class memory_mapped_source : public file_source { public: - explicit memory_mapped_source(char const* filepath, size_t offset, size_t size) + explicit memory_mapped_source(char const* filepath, size_t offset, size_t max_size_estimate) : file_source(filepath) { if (_file.size() != 0) { - map(_file.desc(), offset, size); - register_mmap_buffer(); + // Memory mapping is not exclusive, so we can include the whole region we expect to read + map(_file.desc(), offset, max_size_estimate); } } ~memory_mapped_source() override { - if (_map_addr != nullptr) { - munmap(_map_addr, _map_size); - unregister_mmap_buffer(); - } + if (_map_addr != nullptr) { unmap(); } } std::unique_ptr host_read(size_t offset, size_t size) override { - CUDF_EXPECTS(offset >= _map_offset, "Requested offset is outside mapping"); + // Clamp length to available data + auto const read_size = std::min(size, +_file.size() - offset); - // Clamp length to available data in the mapped region - auto const read_size = std::min(size, _map_size - (offset - _map_offset)); + // If the requested range is outside of the mapped region, read from the file + if (offset < _map_offset or offset + read_size > (_map_offset + _map_size)) { + return file_source::host_read(offset, read_size); + } + + // If the requested range is only partially within the registered region, copy to a new + // host buffer to make the data safe to copy to the device + if (_reg_addr != nullptr and + (offset < _reg_offset or offset + read_size > (_reg_offset + _reg_size))) { + auto const src = static_cast(_map_addr) + (offset - _map_offset); + + return std::make_unique>>( + std::vector(src, src + read_size)); + } return std::make_unique( - static_cast(_map_addr) + (offset - _map_offset), read_size); + static_cast(_map_addr) + offset - _map_offset, read_size); } size_t host_read(size_t offset, size_t size, uint8_t* dst) override { - CUDF_EXPECTS(offset >= _map_offset, "Requested offset is outside mapping"); + // Clamp length to available data + auto const read_size = std::min(size, +_file.size() - offset); - // Clamp length to available data in the mapped region - auto const read_size = std::min(size, _map_size - (offset - _map_offset)); + // If the requested range is outside of the mapped region, read from the file + if (offset < _map_offset or offset + read_size > (_map_offset + _map_size)) { + return file_source::host_read(offset, read_size, dst); + } auto const src = static_cast(_map_addr) + (offset - _map_offset); std::memcpy(dst, src, read_size); @@ -179,42 +207,6 @@ class memory_mapped_source : public file_source { } private: - /** - * @brief Page-locks (registers) the memory range of the mapped file. - * - * Fixes nvbugs/4215160 - */ - void register_mmap_buffer() - { - if (_map_addr == nullptr or _map_size == 0 or not pageableMemoryAccessUsesHostPageTables()) { - return; - } - - auto const result = cudaHostRegister(_map_addr, _map_size, cudaHostRegisterDefault); - if (result == cudaSuccess) { - _is_map_registered = true; - } else { - CUDF_LOG_WARN("cudaHostRegister failed with {} ({})", - static_cast(result), - cudaGetErrorString(result)); - } - } - - /** - * @brief Unregisters the memory range of the mapped file. 
- */ - void unregister_mmap_buffer() - { - if (not _is_map_registered) { return; } - - auto const result = cudaHostUnregister(_map_addr); - if (result != cudaSuccess) { - CUDF_LOG_WARN("cudaHostUnregister failed with {} ({})", - static_cast(result), - cudaGetErrorString(result)); - } - } - void map(int fd, size_t offset, size_t size) { CUDF_EXPECTS(offset < _file.size(), "Offset is past end of file", std::overflow_error); @@ -226,52 +218,30 @@ class memory_mapped_source : public file_source { // Size for `mmap()` needs to include the page padding _map_size = size + (offset - _map_offset); + if (_map_size == 0) { return; } // Check if accessing a region within already mapped area _map_addr = mmap(nullptr, _map_size, PROT_READ, MAP_PRIVATE, fd, _map_offset); CUDF_EXPECTS(_map_addr != MAP_FAILED, "Cannot create memory mapping"); } - private: - size_t _map_size = 0; - size_t _map_offset = 0; - void* _map_addr = nullptr; - bool _is_map_registered = false; -}; - -/** - * @brief Implementation class for reading from a file using `read` calls - * - * Potentially faster than `memory_mapped_source` when only a small portion of the file is read - * through the host. - */ -class direct_read_source : public file_source { - public: - explicit direct_read_source(char const* filepath) : file_source(filepath) {} - - std::unique_ptr host_read(size_t offset, size_t size) override + void unmap() { - lseek(_file.desc(), offset, SEEK_SET); - - // Clamp length to available data - ssize_t const read_size = std::min(size, _file.size() - offset); - - std::vector v(read_size); - CUDF_EXPECTS(read(_file.desc(), v.data(), read_size) == read_size, "read failed"); - return buffer::create(std::move(v)); + if (_map_addr != nullptr) { + auto const result = munmap(_map_addr, _map_size); + if (result != 0) { CUDF_LOG_WARN("munmap failed with {}", result); } + _map_addr = nullptr; + } } - size_t host_read(size_t offset, size_t size, uint8_t* dst) override - { - lseek(_file.desc(), offset, SEEK_SET); - - // Clamp length to available data - auto const read_size = std::min(size, _file.size() - offset); + private: + size_t _map_offset = 0; + size_t _map_size = 0; + void* _map_addr = nullptr; - CUDF_EXPECTS(read(_file.desc(), dst, read_size) == static_cast(read_size), - "read failed"); - return read_size; - } + size_t _reg_offset = 0; + size_t _reg_size = 0; + void* _reg_addr = nullptr; }; /** @@ -286,17 +256,18 @@ class device_buffer_source final : public datasource { size_t host_read(size_t offset, size_t size, uint8_t* dst) override { auto const count = std::min(size, this->size() - offset); - auto const stream = cudf::get_default_stream(); - CUDF_CUDA_TRY( - cudaMemcpyAsync(dst, _d_buffer.data() + offset, count, cudaMemcpyDefault, stream.value())); - stream.synchronize(); + auto const stream = cudf::detail::global_cuda_stream_pool().get_stream(); + cudf::detail::cuda_memcpy(host_span{dst, count}, + device_span{ + reinterpret_cast(_d_buffer.data() + offset), count}, + stream); return count; } std::unique_ptr host_read(size_t offset, size_t size) override { auto const count = std::min(size, this->size() - offset); - auto const stream = cudf::get_default_stream(); + auto const stream = cudf::detail::global_cuda_stream_pool().get_stream(); auto h_data = cudf::detail::make_host_vector_async( cudf::device_span{_d_buffer.data() + offset, count}, stream); stream.synchronize(); @@ -427,20 +398,118 @@ class user_datasource_wrapper : public datasource { datasource* const source; ///< A non-owning pointer to the user-implemented 
datasource }; +#ifdef CUDF_KVIKIO_REMOTE_IO +/** + * @brief Remote file source backed by KvikIO, which handles S3 filepaths seamlessly. + */ +class remote_file_source : public datasource { + static std::unique_ptr create_s3_endpoint(char const* filepath) + { + auto [bucket_name, bucket_object] = kvikio::S3Endpoint::parse_s3_url(filepath); + return std::make_unique(bucket_name, bucket_object); + } + + public: + explicit remote_file_source(char const* filepath) : _kvikio_file{create_s3_endpoint(filepath)} {} + + ~remote_file_source() override = default; + + [[nodiscard]] bool supports_device_read() const override { return true; } + + [[nodiscard]] bool is_device_read_preferred(size_t size) const override { return true; } + + [[nodiscard]] size_t size() const override { return _kvikio_file.nbytes(); } + + std::future device_read_async(size_t offset, + size_t size, + uint8_t* dst, + rmm::cuda_stream_view stream) override + { + CUDF_EXPECTS(supports_device_read(), "Device reads are not supported for this file."); + + auto const read_size = std::min(size, this->size() - offset); + return _kvikio_file.pread(dst, read_size, offset); + } + + size_t device_read(size_t offset, + size_t size, + uint8_t* dst, + rmm::cuda_stream_view stream) override + { + return device_read_async(offset, size, dst, stream).get(); + } + + std::unique_ptr device_read(size_t offset, + size_t size, + rmm::cuda_stream_view stream) override + { + rmm::device_buffer out_data(size, stream); + size_t read = device_read(offset, size, reinterpret_cast(out_data.data()), stream); + out_data.resize(read, stream); + return datasource::buffer::create(std::move(out_data)); + } + + size_t host_read(size_t offset, size_t size, uint8_t* dst) override + { + auto const read_size = std::min(size, this->size() - offset); + return _kvikio_file.pread(dst, read_size, offset).get(); + } + + std::unique_ptr host_read(size_t offset, size_t size) override + { + auto const count = std::min(size, this->size() - offset); + std::vector h_data(count); + this->host_read(offset, count, h_data.data()); + return datasource::buffer::create(std::move(h_data)); + } + + /** + * @brief Is `url` referring to a remote file supported by KvikIO? + * + * For now, only S3 urls (urls starting with "s3://") are supported. + */ + static bool is_supported_remote_url(std::string const& url) + { + // Regular expression to match "s3://" + static std::regex pattern{R"(^s3://)", std::regex_constants::icase}; + return std::regex_search(url, pattern); + } + + private: + kvikio::RemoteHandle _kvikio_file; +}; +#else +/** + * @brief When KvikIO remote IO is disabled, `is_supported_remote_url()` return false always. 
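The `#ifdef CUDF_KVIKIO_REMOTE_IO` class above and the `#else` fallback declared just below form a deliberate idiom: both branches define `remote_file_source` with the same static predicate, and the disabled branch makes it `constexpr` false, so `datasource::create()` needs no preprocessor checks and the compiler can drop the remote path entirely. The same idiom in miniature (hypothetical `FEATURE_REMOTE_IO` standing in for `CUDF_KVIKIO_REMOTE_IO`):

```cpp
#include <string>

#ifdef FEATURE_REMOTE_IO
struct remote_source {
  static bool is_supported(std::string const& url) { return url.rfind("s3://", 0) == 0; }
};
#else
struct remote_source {
  // Disabled build: constexpr false, so the remote branch below is dead code.
  static constexpr bool is_supported(std::string const&) { return false; }
};
#endif

bool uses_remote_path(std::string const& path)
{
  return remote_source::is_supported(path);  // no #ifdef at the call site
}
```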
+ */ +class remote_file_source : public file_source { + public: + explicit remote_file_source(char const* filepath) : file_source(filepath) {} + static constexpr bool is_supported_remote_url(std::string const&) { return false; } +}; +#endif } // namespace std::unique_ptr datasource::create(std::string const& filepath, size_t offset, - size_t size) + size_t max_size_estimate) { -#ifdef CUFILE_FOUND - if (cufile_integration::is_always_enabled()) { - // avoid mmap as GDS is expected to be used for most reads - return std::make_unique(filepath.c_str()); + auto const use_memory_mapping = [] { + auto const policy = getenv_or("LIBCUDF_MMAP_ENABLED", std::string{"ON"}); + + if (policy == "ON") { return true; } + if (policy == "OFF") { return false; } + + CUDF_FAIL("Invalid LIBCUDF_MMAP_ENABLED value: " + policy); + }(); + if (remote_file_source::is_supported_remote_url(filepath)) { + return std::make_unique(filepath.c_str()); + } else if (use_memory_mapping) { + return std::make_unique(filepath.c_str(), offset, max_size_estimate); + } else { + // `file_source` reads the file directly, without memory mapping + return std::make_unique(filepath.c_str()); } -#endif - // Use our own memory mapping implementation for direct file reads - return std::make_unique(filepath.c_str(), offset, size); } std::unique_ptr datasource::create(host_buffer const& buffer) diff --git a/cpp/src/io/utilities/file_io_utilities.cpp b/cpp/src/io/utilities/file_io_utilities.cpp index d7b54399f8d..f9750e4a505 100644 --- a/cpp/src/io/utilities/file_io_utilities.cpp +++ b/cpp/src/io/utilities/file_io_utilities.cpp @@ -22,8 +22,6 @@ #include #include -#include - #include #include @@ -84,7 +82,7 @@ file_wrapper::file_wrapper(std::string const& filepath, int flags, mode_t mode) file_wrapper::~file_wrapper() { close(fd); } -#ifdef CUFILE_FOUND +#ifdef CUDF_CUFILE_FOUND /** * @brief Class that dynamically loads the cuFile library and manages the cuFile driver. @@ -110,7 +108,11 @@ class cufile_shim { ~cufile_shim() { - if (driver_close != nullptr) driver_close(); + // Explicit cuFile driver close should not be performed here to avoid segfault. However, in the + // absence of driver_close(), cuFile will implicitly do that, which in most cases causes + // segfault anyway. TODO: Revisit this conundrum once cuFile is fixed. + // https://github.com/rapidsai/cudf/issues/17121 + if (cf_lib != nullptr) dlclose(cf_lib); } @@ -239,7 +241,7 @@ std::vector> make_sliced_tasks( std::vector> slice_tasks; std::transform(slices.cbegin(), slices.cend(), std::back_inserter(slice_tasks), [&](auto& slice) { return pool.submit_task( - [&] { return function(ptr + slice.offset, slice.size, offset + slice.offset); }); + [=] { return function(ptr + slice.offset, slice.size, offset + slice.offset); }); }); return slice_tasks; } diff --git a/cpp/src/io/utilities/file_io_utilities.hpp b/cpp/src/io/utilities/file_io_utilities.hpp index 7e47b5b3d10..c844a596869 100644 --- a/cpp/src/io/utilities/file_io_utilities.hpp +++ b/cpp/src/io/utilities/file_io_utilities.hpp @@ -16,7 +16,7 @@ #pragma once -#ifdef CUFILE_FOUND +#ifdef CUDF_CUFILE_FOUND #include #include @@ -97,14 +97,14 @@ class cufile_output { virtual std::future write_async(void const* data, size_t offset, size_t size) = 0; }; -#ifdef CUFILE_FOUND +#ifdef CUDF_CUFILE_FOUND class cufile_shim; /** * @brief Class that provides RAII for cuFile file registration. 
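The one-character change in `make_sliced_tasks` above, `[&]` to `[=]`, fixes a lifetime bug worth spelling out: a task submitted to the pool may run on a worker thread after the `std::transform` step that created it has moved on, so capturing the transform lambda's `slice` parameter by reference leaves the task reading a dead object. A distilled reproduction of the hazard and the fix (the pool type and `submit_task` are assumed minimal stand-ins):

```cpp
#include <algorithm>
#include <cstddef>
#include <future>
#include <iterator>
#include <vector>

struct slice_t { std::size_t offset; std::size_t size; };

template <typename Pool, typename Fn>
std::vector<std::future<std::size_t>> make_sliced_tasks_sketch(
  Pool& pool, std::vector<slice_t> const& slices, Fn function)
{
  std::vector<std::future<std::size_t>> tasks;
  std::transform(slices.cbegin(), slices.cend(), std::back_inserter(tasks),
                 [&](slice_t const& slice) {
                   // BAD:  [&] -- `slice` is this lambda's parameter and is gone
                   //       by the time a pool thread runs the task.
                   // GOOD: [=] copies offset/size into the task object; slice_t
                   //       is two integers, so the copy is essentially free.
                   return pool.submit_task(
                     [=] { return function(slice.offset, slice.size); });
                 });
  return tasks;
}
```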
*/ -struct cufile_registered_file { +class cufile_registered_file { void register_handle(); public: diff --git a/cpp/src/io/utilities/hostdevice_span.hpp b/cpp/src/io/utilities/hostdevice_span.hpp index d9eac423901..a3ddef52dd8 100644 --- a/cpp/src/io/utilities/hostdevice_span.hpp +++ b/cpp/src/io/utilities/hostdevice_span.hpp @@ -16,6 +16,7 @@ #pragma once +#include #include #include @@ -33,31 +34,18 @@ class hostdevice_span { hostdevice_span(hostdevice_span const&) = default; ///< Copy constructor hostdevice_span(hostdevice_span&&) = default; ///< Move constructor - hostdevice_span(T* cpu_data, T* gpu_data, size_t size) - : _size(size), _device_data(gpu_data), _host_data(cpu_data) + hostdevice_span(host_span host_data, T* device_data) + : _host_data{host_data}, _device_data{device_data} { } - /// Constructor from container - /// @param in The container to construct the span from - template ().host_ptr())> (*)[], - T (*)[]>>* = nullptr> - constexpr hostdevice_span(C& in) : hostdevice_span(in.host_ptr(), in.device_ptr(), in.size()) - { - } - - /// Constructor from const container - /// @param in The container to construct the span from - template ().host_ptr())> (*)[], - T (*)[]>>* = nullptr> - constexpr hostdevice_span(C const& in) - : hostdevice_span(in.host_ptr(), in.device_ptr(), in.size()) + // Copy construction to support const conversion + /// @param other The span to copy + template , // NOLINT + void>* = nullptr> + constexpr hostdevice_span(hostdevice_span const& other) noexcept + : _host_data{host_span{other}}, _device_data{other.device_ptr()} { } @@ -74,15 +62,13 @@ class hostdevice_span { * @tparam T The device span type. * @return A typed device span of the hostdevice view's data. */ - [[nodiscard]] operator cudf::device_span() { return {_device_data, size()}; } - - /** - * @brief Converts a hostdevice view into a device span of const data. - * - * @tparam T The device span type. - * @return A const typed device span of the hostdevice view's data. - */ - [[nodiscard]] operator cudf::device_span() const { return {_device_data, size()}; } + template , // NOLINT + void>* = nullptr> + [[nodiscard]] operator cudf::device_span() const noexcept + { + return {_device_data, size()}; + } /** * @brief Returns the underlying device data. @@ -114,9 +100,12 @@ class hostdevice_span { * @tparam T The host span type. * @return A typed host span of the hostdevice_span's data. */ - [[nodiscard]] operator cudf::host_span() const noexcept + template , // NOLINT + void>* = nullptr> + [[nodiscard]] operator host_span() const noexcept { - return cudf::host_span(_host_data, size()); + return {_host_data}; } /** @@ -125,7 +114,7 @@ class hostdevice_span { * @tparam T The type to cast to * @return T* Typed pointer to underlying data */ - [[nodiscard]] T* host_ptr(size_t offset = 0) const noexcept { return _host_data + offset; } + [[nodiscard]] T* host_ptr(size_t offset = 0) const noexcept { return _host_data.data() + offset; } /** * @brief Return first element in host data. @@ -136,19 +125,19 @@ class hostdevice_span { [[nodiscard]] T* host_begin() const noexcept { return host_ptr(); } /** - * @brief Return one past the last elementin host data. + * @brief Return one past the last element in host data. 
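This hostdevice_span rewrite (continuing below) replaces the separate host-pointer/device-pointer/size triple with one `host_span` plus a device pointer, giving `size()` a single source of truth, and SFINAE-guards the conversion operators so only the `T` to `T const` direction compiles. A usage sketch against the refactored interface (these are cudf-internal headers under `io/utilities/`, so details may differ slightly):

```cpp
// Assumes the refactored cudf::detail::hostdevice_span / hostdevice_vector
// from the hunks above and below.
void fill_and_sync(cudf::detail::hostdevice_vector<int>& vec, rmm::cuda_stream_view stream)
{
  cudf::detail::hostdevice_span<int> span{vec};  // via the vector's conversion operator
  for (std::size_t i = 0; i < span.size(); ++i) {
    span[i] = static_cast<int>(i);               // operator[] indexes host memory
  }
  span.host_to_device_async(stream);             // span-based cuda_memcpy_async inside

  cudf::device_span<int const> d_const = span;   // allowed: int -> int const
  // cudf::device_span<int> d = hostdevice_span<int const>{...};  // rejected at compile time
  (void)d_const;
}
```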
* * @tparam T The desired type * @return T const* Pointer to one past the last element */ - [[nodiscard]] T* host_end() const noexcept { return host_begin() + size(); } + [[nodiscard]] T* host_end() const noexcept { return _host_data.end(); } /** * @brief Returns the number of elements in the view * * @return The number of elements in the view */ - [[nodiscard]] std::size_t size() const noexcept { return _size; } + [[nodiscard]] std::size_t size() const noexcept { return _host_data.size(); } /** * @brief Returns true if `size()` returns zero, or false otherwise @@ -159,12 +148,11 @@ class hostdevice_span { [[nodiscard]] size_t size_bytes() const noexcept { return sizeof(T) * size(); } - [[nodiscard]] T& operator[](size_t i) { return _host_data[i]; } - [[nodiscard]] T const& operator[](size_t i) const { return _host_data[i]; } + [[nodiscard]] T& operator[](size_t i) const { return _host_data[i]; } /** - * @brief Obtains a hostdevice_span that is a view over the `count` elements of this - * hostdevice_span starting at offset + * @brief Obtains a `hostdevice_span` that is a view over the `count` elements of this + * hostdevice_span starting at `offset` * * @param offset The offset of the first element in the subspan * @param count The number of elements in the subspan @@ -172,37 +160,37 @@ class hostdevice_span { */ [[nodiscard]] constexpr hostdevice_span subspan(size_t offset, size_t count) const noexcept { - return hostdevice_span(_host_data + offset, _device_data + offset, count); + return hostdevice_span(_host_data.subspan(offset, count), device_ptr(offset)); } - void host_to_device_async(rmm::cuda_stream_view stream) + void host_to_device_async(rmm::cuda_stream_view stream) const { - CUDF_CUDA_TRY( - cudaMemcpyAsync(device_ptr(), host_ptr(), size_bytes(), cudaMemcpyDefault, stream.value())); + static_assert(not std::is_const_v, "Cannot copy to const device memory"); + cudf::detail::cuda_memcpy_async(device_span{device_ptr(), size()}, _host_data, stream); } - void host_to_device_sync(rmm::cuda_stream_view stream) + void host_to_device_sync(rmm::cuda_stream_view stream) const { host_to_device_async(stream); stream.synchronize(); } - void device_to_host_async(rmm::cuda_stream_view stream) + void device_to_host_async(rmm::cuda_stream_view stream) const { - CUDF_CUDA_TRY( - cudaMemcpyAsync(host_ptr(), device_ptr(), size_bytes(), cudaMemcpyDefault, stream.value())); + static_assert(not std::is_const_v, "Cannot copy to const host memory"); + cudf::detail::cuda_memcpy_async( + _host_data, device_span{device_ptr(), size()}, stream); } - void device_to_host_sync(rmm::cuda_stream_view stream) + void device_to_host_sync(rmm::cuda_stream_view stream) const { device_to_host_async(stream); stream.synchronize(); } private: - size_t _size{}; ///< Number of elements - T* _device_data{}; ///< Pointer to device memory containing elements - T* _host_data{}; ///< Pointer to host memory containing elements + host_span _host_data; + T* _device_data{nullptr}; }; } // namespace cudf::detail diff --git a/cpp/src/io/utilities/hostdevice_vector.hpp b/cpp/src/io/utilities/hostdevice_vector.hpp index aed745c42dd..f969b45727b 100644 --- a/cpp/src/io/utilities/hostdevice_vector.hpp +++ b/cpp/src/io/utilities/hostdevice_vector.hpp @@ -117,55 +117,39 @@ class hostdevice_vector { return d_data.element(element_index, stream); } - operator cudf::host_span() { return {host_ptr(), size()}; } - operator cudf::host_span() const { return {host_ptr(), size()}; } + operator cudf::host_span() { return host_span{h_data}.subspan(0, 
size()); } + operator cudf::host_span() const + { + return host_span{h_data}.subspan(0, size()); + } operator cudf::device_span() { return {device_ptr(), size()}; } operator cudf::device_span() const { return {device_ptr(), size()}; } void host_to_device_async(rmm::cuda_stream_view stream) { - cuda_memcpy_async(device_ptr(), host_ptr(), size_bytes(), host_memory_kind::PINNED, stream); + cuda_memcpy_async(d_data, h_data, stream); } - void host_to_device_sync(rmm::cuda_stream_view stream) - { - cuda_memcpy(device_ptr(), host_ptr(), size_bytes(), host_memory_kind::PINNED, stream); - } + void host_to_device_sync(rmm::cuda_stream_view stream) { cuda_memcpy(d_data, h_data, stream); } void device_to_host_async(rmm::cuda_stream_view stream) { - cuda_memcpy_async(host_ptr(), device_ptr(), size_bytes(), host_memory_kind::PINNED, stream); + cuda_memcpy_async(h_data, d_data, stream); } - void device_to_host_sync(rmm::cuda_stream_view stream) - { - cuda_memcpy(host_ptr(), device_ptr(), size_bytes(), host_memory_kind::PINNED, stream); - } + void device_to_host_sync(rmm::cuda_stream_view stream) { cuda_memcpy(h_data, d_data, stream); } /** * @brief Converts a hostdevice_vector into a hostdevice_span. * * @return A typed hostdevice_span of the hostdevice_vector's data */ - [[nodiscard]] operator hostdevice_span() - { - return hostdevice_span{h_data.data(), d_data.data(), size()}; - } + [[nodiscard]] operator hostdevice_span() { return {host_span{h_data}, device_ptr()}; } - /** - * @brief Converts a part of a hostdevice_vector into a hostdevice_span. - * - * @param offset The offset of the first element in the subspan - * @param count The number of elements in the subspan - * @return A typed hostdevice_span of the hostdevice_vector's data - */ - [[nodiscard]] hostdevice_span subspan(size_t offset, size_t count) + [[nodiscard]] operator hostdevice_span() const { - CUDF_EXPECTS(offset < d_data.size(), "Offset is out of bounds."); - CUDF_EXPECTS(count <= d_data.size() - offset, - "The span with given offset and count is out of bounds."); - return hostdevice_span{h_data.data() + offset, d_data.data() + offset, count}; + return {host_span{h_data}, device_ptr()}; } private: @@ -188,38 +172,47 @@ class hostdevice_2dvector { { } - operator device_2dspan() { return {_data.device_ptr(), _size}; } - operator device_2dspan() const { return {_data.device_ptr(), _size}; } + operator device_2dspan() { return {device_span{_data}, _size.second}; } + operator device_2dspan() const { return {device_span{_data}, _size.second}; } device_2dspan device_view() { return static_cast>(*this); } - device_2dspan device_view() const { return static_cast>(*this); } + [[nodiscard]] device_2dspan device_view() const + { + return static_cast>(*this); + } - operator host_2dspan() { return {_data.host_ptr(), _size}; } - operator host_2dspan() const { return {_data.host_ptr(), _size}; } + operator host_2dspan() { return {host_span{_data}, _size.second}; } + operator host_2dspan() const { return {host_span{_data}, _size.second}; } host_2dspan host_view() { return static_cast>(*this); } - host_2dspan host_view() const { return static_cast>(*this); } + [[nodiscard]] host_2dspan host_view() const + { + return static_cast>(*this); + } host_span operator[](size_t row) { - return {_data.host_ptr() + host_2dspan::flatten_index(row, 0, _size), _size.second}; + return host_span{_data}.subspan(row * _size.second, _size.second); } host_span operator[](size_t row) const { - return {_data.host_ptr() + host_2dspan::flatten_index(row, 0, _size), 
_size.second}; + return host_span{_data}.subspan(row * _size.second, _size.second); } - auto size() const noexcept { return _size; } - auto count() const noexcept { return _size.first * _size.second; } - auto is_empty() const noexcept { return count() == 0; } + [[nodiscard]] auto size() const noexcept { return _size; } + [[nodiscard]] auto count() const noexcept { return _size.first * _size.second; } + [[nodiscard]] auto is_empty() const noexcept { return count() == 0; } T* base_host_ptr(size_t offset = 0) { return _data.host_ptr(offset); } T* base_device_ptr(size_t offset = 0) { return _data.device_ptr(offset); } - T const* base_host_ptr(size_t offset = 0) const { return _data.host_ptr(offset); } + [[nodiscard]] T const* base_host_ptr(size_t offset = 0) const { return _data.host_ptr(offset); } - T const* base_device_ptr(size_t offset = 0) const { return _data.device_ptr(offset); } + [[nodiscard]] T const* base_device_ptr(size_t offset = 0) const + { + return _data.device_ptr(offset); + } [[nodiscard]] size_t size_bytes() const noexcept { return _data.size_bytes(); } diff --git a/cpp/src/io/utilities/output_builder.cuh b/cpp/src/io/utilities/output_builder.cuh index f7e6de03354..8183a66f4f0 100644 --- a/cpp/src/io/utilities/output_builder.cuh +++ b/cpp/src/io/utilities/output_builder.cuh @@ -307,8 +307,8 @@ class output_builder { * @param mr The memory resource used to allocate the output vector. * @return The output vector. */ - rmm::device_uvector gather(rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) const + [[nodiscard]] rmm::device_uvector gather(rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) const { rmm::device_uvector output{size(), stream, mr}; auto output_it = output.begin(); diff --git a/cpp/src/io/utilities/row_selection.cpp b/cpp/src/io/utilities/row_selection.cpp index c0bbca39167..cf252fe63af 100644 --- a/cpp/src/io/utilities/row_selection.cpp +++ b/cpp/src/io/utilities/row_selection.cpp @@ -16,10 +16,7 @@ #include "io/utilities/row_selection.hpp" -#include - #include -#include namespace cudf::io::detail { diff --git a/cpp/src/io/utilities/row_selection.hpp b/cpp/src/io/utilities/row_selection.hpp index 7c607099cdc..e826feff201 100644 --- a/cpp/src/io/utilities/row_selection.hpp +++ b/cpp/src/io/utilities/row_selection.hpp @@ -15,7 +15,7 @@ */ #pragma once -#include +#include #include #include diff --git a/cpp/src/io/utilities/type_inference.cu b/cpp/src/io/utilities/type_inference.cu index 43dc38c4ac6..af32b207d20 100644 --- a/cpp/src/io/utilities/type_inference.cu +++ b/cpp/src/io/utilities/type_inference.cu @@ -18,11 +18,10 @@ #include "io/utilities/string_parsing.hpp" #include "io/utilities/trie.cuh" +#include #include #include -#include - #include #include @@ -242,7 +241,7 @@ cudf::io::column_type_histogram infer_column_type(OptionsView const& options, constexpr int block_size = 128; auto const grid_size = (size + block_size - 1) / block_size; - auto d_column_info = rmm::device_scalar(stream); + auto d_column_info = cudf::detail::device_scalar(stream); CUDF_CUDA_TRY(cudaMemsetAsync( d_column_info.data(), 0, sizeof(cudf::io::column_type_histogram), stream.value())); diff --git a/cpp/src/jit/cache.cpp b/cpp/src/jit/cache.cpp index 89c47d246d0..34a0bdce124 100644 --- a/cpp/src/jit/cache.cpp +++ b/cpp/src/jit/cache.cpp @@ -16,11 +16,8 @@ #include -#include - #include -#include #include namespace cudf { diff --git a/cpp/src/jit/parser.cpp b/cpp/src/jit/parser.cpp index 398c36821cc..519ac2d1a2e 100644 --- a/cpp/src/jit/parser.cpp +++ 
b/cpp/src/jit/parser.cpp @@ -19,8 +19,6 @@ #include #include -#include -#include #include #include #include @@ -28,7 +26,7 @@ namespace cudf { namespace jit { -constexpr char percent_escape[] = "_"; +constexpr char percent_escape[] = "_"; // NOLINT inline bool is_white(char const c) { return c == ' ' || c == '\n' || c == '\r' || c == '\t'; } diff --git a/cpp/src/jit/util.cpp b/cpp/src/jit/util.cpp index 0585e02a031..d9a29203133 100644 --- a/cpp/src/jit/util.cpp +++ b/cpp/src/jit/util.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,8 +19,6 @@ #include #include -#include - namespace cudf { namespace jit { struct get_data_ptr_functor { diff --git a/cpp/src/join/conditional_join.cu b/cpp/src/join/conditional_join.cu index 2ec23e0dc6d..781fda215fd 100644 --- a/cpp/src/join/conditional_join.cu +++ b/cpp/src/join/conditional_join.cu @@ -21,12 +21,14 @@ #include #include +#include #include #include #include #include #include #include +#include #include #include @@ -81,7 +83,7 @@ std::unique_ptr> conditional_join_anti_semi( join_size = *output_size; } else { // Allocate storage for the counter used to get the size of the join output - rmm::device_scalar size(0, stream, mr); + cudf::detail::device_scalar size(0, stream, mr); if (has_nulls) { compute_conditional_join_output_size <<>>( @@ -94,7 +96,7 @@ std::unique_ptr> conditional_join_anti_semi( join_size = size.value(stream); } - rmm::device_scalar write_index(0, stream); + cudf::detail::device_scalar write_index(0, stream); auto left_indices = std::make_unique>(join_size, stream, mr); @@ -177,7 +179,8 @@ conditional_join(table_view const& left, auto const parser = ast::detail::expression_parser{binary_predicate, left, right, has_nulls, stream, mr}; CUDF_EXPECTS(parser.output_type().id() == type_id::BOOL8, - "The expression must produce a boolean output."); + "The expression must produce a boolean output.", + cudf::data_type_error); auto left_table = table_device_view::create(left, stream); auto right_table = table_device_view::create(right, stream); @@ -197,7 +200,7 @@ conditional_join(table_view const& left, join_size = *output_size; } else { // Allocate storage for the counter used to get the size of the join output - rmm::device_scalar size(0, stream, mr); + cudf::detail::device_scalar size(0, stream, mr); if (has_nulls) { compute_conditional_join_output_size <<>>( @@ -231,7 +234,7 @@ conditional_join(table_view const& left, std::make_unique>(0, stream, mr)); } - rmm::device_scalar write_index(0, stream); + cudf::detail::device_scalar write_index(0, stream); auto left_indices = std::make_unique>(join_size, stream, mr); auto right_indices = std::make_unique>(join_size, stream, mr); @@ -329,7 +332,8 @@ std::size_t compute_conditional_join_output_size(table_view const& left, auto const parser = ast::detail::expression_parser{binary_predicate, left, right, has_nulls, stream, mr}; CUDF_EXPECTS(parser.output_type().id() == type_id::BOOL8, - "The expression must produce a boolean output."); + "The expression must produce a boolean output.", + cudf::data_type_error); auto left_table = table_device_view::create(left, stream); auto right_table = table_device_view::create(right, stream); @@ -342,7 +346,7 @@ std::size_t compute_conditional_join_output_size(table_view const& left, auto const shmem_size_per_block = parser.shmem_per_thread * 
config.num_threads_per_block; // Allocate storage for the counter used to get the size of the join output - rmm::device_scalar size(0, stream, mr); + cudf::detail::device_scalar size(0, stream, mr); // Determine number of output rows without actually building the output to simply // find what the size of the output will be. diff --git a/cpp/src/join/distinct_hash_join.cu b/cpp/src/join/distinct_hash_join.cu index c7294152982..ce4d2067b82 100644 --- a/cpp/src/join/distinct_hash_join.cu +++ b/cpp/src/join/distinct_hash_join.cu @@ -27,14 +27,15 @@ #include #include -#include #include #include #include #include #include +#include #include +#include #include #include @@ -80,14 +81,9 @@ class build_keys_fn { /** * @brief Device output transform functor to construct `size_type` with `cuco::pair` or `cuco::pair` + * rhs_index_type>` */ struct output_fn { - __device__ constexpr cudf::size_type operator()( - cuco::pair const& x) const - { - return static_cast(x.second); - } __device__ constexpr cudf::size_type operator()( cuco::pair const& x) const { @@ -177,15 +173,33 @@ distinct_hash_join::inner_join(rmm::cuda_stream_view stream, auto const iter = cudf::detail::make_counting_transform_iterator( 0, build_keys_fn{d_probe_hasher}); - auto const build_indices_begin = - thrust::make_transform_output_iterator(build_indices->begin(), output_fn{}); - auto const probe_indices_begin = - thrust::make_transform_output_iterator(probe_indices->begin(), output_fn{}); - - auto const [probe_indices_end, _] = this->_hash_table.retrieve( - iter, iter + probe_table_num_rows, probe_indices_begin, build_indices_begin, {stream.value()}); + auto found_indices = rmm::device_uvector(probe_table_num_rows, stream); + auto const found_begin = + thrust::make_transform_output_iterator(found_indices.begin(), output_fn{}); + + // TODO conditional find for nulls once `cuco::static_set::find_if` is added + // If `idx` is within the range `[0, probe_table_num_rows)` and `found_indices[idx]` is not equal + // to `JoinNoneValue`, then `idx` has a match in the hash set. 
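
For illustration only, the find-then-compact pattern described in the comment above can be sketched in isolation as follows. This is a minimal sketch, not cudf's actual API: compact_matches is a hypothetical helper, the -1 sentinel stands in for JoinNoneValue, and it assumes compilation with nvcc's extended device lambdas.

#include <thrust/copy.h>
#include <thrust/device_vector.h>
#include <thrust/distance.h>
#include <thrust/execution_policy.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/zip_iterator.h>

// Keep one (build_index, probe_index) pair per probe row whose lookup
// succeeded, i.e. whose found index is not the "no match" sentinel (-1 here).
void compact_matches(thrust::device_vector<int> const& found_indices,
                     thrust::device_vector<int>& build_out,
                     thrust::device_vector<int>& probe_out)
{
  auto const num_probe_rows = static_cast<int>(found_indices.size());
  auto const in  = thrust::make_zip_iterator(found_indices.begin(),
                                             thrust::make_counting_iterator(0));
  auto const out = thrust::make_zip_iterator(build_out.begin(), probe_out.begin());
  // copy_if with a stencil: the stencil is the found build index itself.
  auto const out_end = thrust::copy_if(
    thrust::device, in, in + num_probe_rows, found_indices.begin(), out,
    [] __device__(int build_idx) { return build_idx != -1; });
  auto const num_matches = thrust::distance(out, out_end);
  build_out.resize(num_matches);
  probe_out.resize(num_matches);
}
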
+ this->_hash_table.find_async(iter, iter + probe_table_num_rows, found_begin, stream.value()); + + auto const tuple_iter = cudf::detail::make_counting_transform_iterator( + 0, + cuda::proclaim_return_type>( + [found_iter = found_indices.begin()] __device__(size_type idx) { + return thrust::tuple{*(found_iter + idx), idx}; + })); + auto const output_begin = + thrust::make_zip_iterator(build_indices->begin(), probe_indices->begin()); + auto const output_end = + thrust::copy_if(rmm::exec_policy_nosync(stream), + tuple_iter, + tuple_iter + probe_table_num_rows, + found_indices.begin(), + output_begin, + cuda::proclaim_return_type( + [] __device__(size_type idx) { return idx != JoinNoneValue; })); + auto const actual_size = std::distance(output_begin, output_end); - auto const actual_size = std::distance(probe_indices_begin, probe_indices_end); build_indices->resize(actual_size, stream); probe_indices->resize(actual_size, stream); diff --git a/cpp/src/join/join_common_utils.hpp b/cpp/src/join/join_common_utils.hpp index 86402a0e7de..573101cefd9 100644 --- a/cpp/src/join/join_common_utils.hpp +++ b/cpp/src/join/join_common_utils.hpp @@ -22,7 +22,6 @@ #include #include -#include #include #include @@ -51,11 +50,6 @@ using mixed_multimap_type = cudf::detail::cuco_allocator, cuco::legacy::double_hashing<1, hash_type, hash_type>>; -using semi_map_type = cuco::legacy::static_map>; - using row_hash_legacy = cudf::row_hasher; diff --git a/cpp/src/join/mixed_join.cu b/cpp/src/join/mixed_join.cu index 820b81ee309..90b0d0a45ad 100644 --- a/cpp/src/join/mixed_join.cu +++ b/cpp/src/join/mixed_join.cu @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -115,7 +116,8 @@ mixed_join( auto const parser = ast::detail::expression_parser{ binary_predicate, left_conditional, right_conditional, has_nulls, stream, mr}; CUDF_EXPECTS(parser.output_type().id() == type_id::BOOL8, - "The expression must produce a boolean output."); + "The expression must produce a boolean output.", + cudf::data_type_error); // TODO: The non-conditional join impls start with a dictionary matching, // figure out what that is and what it's needed for (and if conditional joins @@ -381,7 +383,8 @@ compute_mixed_join_output_size(table_view const& left_equality, auto const parser = ast::detail::expression_parser{ binary_predicate, left_conditional, right_conditional, has_nulls, stream, mr}; CUDF_EXPECTS(parser.output_type().id() == type_id::BOOL8, - "The expression must produce a boolean output."); + "The expression must produce a boolean output.", + cudf::data_type_error); // TODO: The non-conditional join impls start with a dictionary matching, // figure out what that is and what it's needed for (and if conditional joins diff --git a/cpp/src/join/mixed_join_common_utils.cuh b/cpp/src/join/mixed_join_common_utils.cuh index 19701816867..4a52cfe098a 100644 --- a/cpp/src/join/mixed_join_common_utils.cuh +++ b/cpp/src/join/mixed_join_common_utils.cuh @@ -25,6 +25,7 @@ #include #include +#include namespace cudf { namespace detail { @@ -160,6 +161,39 @@ struct pair_expression_equality : public expression_equality { } }; +/** + * @brief Equality comparator that composes two row_equality comparators. 
+ */ +struct double_row_equality_comparator { + row_equality const equality_comparator; + row_equality const conditional_comparator; + + __device__ bool operator()(size_type lhs_row_index, size_type rhs_row_index) const noexcept + { + using experimental::row::lhs_index_type; + using experimental::row::rhs_index_type; + + return equality_comparator(lhs_index_type{lhs_row_index}, rhs_index_type{rhs_row_index}) && + conditional_comparator(lhs_index_type{lhs_row_index}, rhs_index_type{rhs_row_index}); + } +}; + +// A CUDA Cooperative Group of 1 thread for the hash set for mixed semi. +auto constexpr DEFAULT_MIXED_SEMI_JOIN_CG_SIZE = 1; + +// The hash set type used by mixed_semi_join with the build_table. +using hash_set_type = + cuco::static_set, + cuda::thread_scope_device, + double_row_equality_comparator, + cuco::linear_probing, + cudf::detail::cuco_allocator, + cuco::storage<1>>; + +// The hash_set_ref_type used by mixed_semi_join kernels for probing. +using hash_set_ref_type = hash_set_type::ref_type; + } // namespace detail } // namespace cudf diff --git a/cpp/src/join/mixed_join_kernels_semi.cu b/cpp/src/join/mixed_join_kernels_semi.cu index 7459ac3e99c..a4ec97af235 100644 --- a/cpp/src/join/mixed_join_kernels_semi.cu +++ b/cpp/src/join/mixed_join_kernels_semi.cu @@ -38,38 +38,48 @@ CUDF_KERNEL void __launch_bounds__(block_size) table_device_view right_table, table_device_view probe, table_device_view build, - row_hash const hash_probe, row_equality const equality_probe, - cudf::detail::semi_map_type::device_view hash_table_view, + hash_set_ref_type set_ref, cudf::device_span left_table_keep_mask, cudf::ast::detail::expression_device_view device_expression_data) { + auto constexpr cg_size = hash_set_ref_type::cg_size; + + auto const tile = cg::tiled_partition(cg::this_thread_block()); + // Normally the casting of a shared memory array is used to create multiple // arrays of different types from the shared memory buffer, but here it is // used to circumvent conflicts between arrays of different types between // different template instantiations due to the extern specifier. extern __shared__ char raw_intermediate_storage[]; - cudf::ast::detail::IntermediateDataType* intermediate_storage = + auto intermediate_storage = reinterpret_cast*>(raw_intermediate_storage); auto thread_intermediate_storage = - &intermediate_storage[threadIdx.x * device_expression_data.num_intermediates]; + intermediate_storage + (tile.meta_group_rank() * device_expression_data.num_intermediates); - cudf::size_type const left_num_rows = left_table.num_rows(); - cudf::size_type const right_num_rows = right_table.num_rows(); - auto const outer_num_rows = left_num_rows; + // Equality evaluator to use + auto const evaluator = cudf::ast::detail::expression_evaluator( + left_table, right_table, device_expression_data); - cudf::size_type outer_row_index = threadIdx.x + blockIdx.x * block_size; + // Make sure to swap_tables here as hash_set will use probe table as the left one + auto constexpr swap_tables = true; + auto const equality = single_expression_equality{ + evaluator, thread_intermediate_storage, swap_tables, equality_probe}; - auto evaluator = cudf::ast::detail::expression_evaluator( - left_table, right_table, device_expression_data); + // Create set ref with the new equality comparator + auto const set_ref_equality = set_ref.rebind_key_eq(equality); - if (outer_row_index < outer_num_rows) { - // Figure out the number of elements for this key.
- auto equality = single_expression_equality{ - evaluator, thread_intermediate_storage, false, equality_probe}; + // Total number of rows to query the set + auto const outer_num_rows = left_table.num_rows(); + // Grid stride for the tile + auto const cg_grid_stride = cudf::detail::grid_1d::grid_stride() / cg_size; - left_table_keep_mask[outer_row_index] = - hash_table_view.contains(outer_row_index, hash_probe, equality); + // Find all the rows in the left table that are in the hash table + for (auto outer_row_index = cudf::detail::grid_1d::global_thread_id() / cg_size; + outer_row_index < outer_num_rows; + outer_row_index += cg_grid_stride) { + auto const result = set_ref_equality.contains(tile, outer_row_index); + if (tile.thread_rank() == 0) { left_table_keep_mask[outer_row_index] = result; } } } @@ -78,9 +88,8 @@ void launch_mixed_join_semi(bool has_nulls, table_device_view right_table, table_device_view probe, table_device_view build, - row_hash const hash_probe, row_equality const equality_probe, - cudf::detail::semi_map_type::device_view hash_table_view, + hash_set_ref_type set_ref, cudf::device_span left_table_keep_mask, cudf::ast::detail::expression_device_view device_expression_data, detail::grid_1d const config, @@ -94,9 +103,8 @@ void launch_mixed_join_semi(bool has_nulls, right_table, probe, build, - hash_probe, equality_probe, - hash_table_view, + set_ref, left_table_keep_mask, device_expression_data); } else { @@ -106,9 +114,8 @@ void launch_mixed_join_semi(bool has_nulls, right_table, probe, build, - hash_probe, equality_probe, - hash_table_view, + set_ref, left_table_keep_mask, device_expression_data); } diff --git a/cpp/src/join/mixed_join_kernels_semi.cuh b/cpp/src/join/mixed_join_kernels_semi.cuh index 43714ffb36a..b08298e64e4 100644 --- a/cpp/src/join/mixed_join_kernels_semi.cuh +++ b/cpp/src/join/mixed_join_kernels_semi.cuh @@ -45,9 +45,8 @@ namespace detail { * @param[in] right_table The right table * @param[in] probe The table with which to probe the hash table for matches. * @param[in] build The table with which the hash table was built. - * @param[in] hash_probe The hasher used for the probe table. * @param[in] equality_probe The equality comparator used when probing the hash table. - * @param[in] hash_table_view The hash table built from `build`. + * @param[in] set_ref The hash table device view built from `build`. * @param[out] left_table_keep_mask The result of the join operation with "true" element indicating * the corresponding index from left table is present in output * @param[in] device_expression_data Container of device data required to evaluate the desired @@ -58,9 +57,8 @@ void launch_mixed_join_semi(bool has_nulls, table_device_view right_table, table_device_view probe, table_device_view build, - row_hash const hash_probe, row_equality const equality_probe, - cudf::detail::semi_map_type::device_view hash_table_view, + hash_set_ref_type set_ref, cudf::device_span left_table_keep_mask, cudf::ast::detail::expression_device_view device_expression_data, detail::grid_1d const config, diff --git a/cpp/src/join/mixed_join_semi.cu b/cpp/src/join/mixed_join_semi.cu index aa4fa281159..62ba558b0bd 100644 --- a/cpp/src/join/mixed_join_semi.cu +++ b/cpp/src/join/mixed_join_semi.cu @@ -45,45 +45,6 @@ namespace cudf { namespace detail { -namespace { -/** - * @brief Device functor to create a pair of hash value and index for a given row. 
- */ -struct make_pair_function_semi { - __device__ __forceinline__ cudf::detail::pair_type operator()(size_type i) const noexcept - { - // The value is irrelevant since we only ever use the hash map to check for - // membership of a particular row index. - return cuco::make_pair(static_cast(i), 0); - } -}; - -/** - * @brief Equality comparator that composes two row_equality comparators. - */ -class double_row_equality { - public: - double_row_equality(row_equality equality_comparator, row_equality conditional_comparator) - : _equality_comparator{equality_comparator}, _conditional_comparator{conditional_comparator} - { - } - - __device__ bool operator()(size_type lhs_row_index, size_type rhs_row_index) const noexcept - { - using experimental::row::lhs_index_type; - using experimental::row::rhs_index_type; - - return _equality_comparator(lhs_index_type{lhs_row_index}, rhs_index_type{rhs_row_index}) && - _conditional_comparator(lhs_index_type{lhs_row_index}, rhs_index_type{rhs_row_index}); - } - - private: - row_equality _equality_comparator; - row_equality _conditional_comparator; -}; - -} // namespace - std::unique_ptr> mixed_join_semi( table_view const& left_equality, table_view const& right_equality, @@ -95,7 +56,7 @@ std::unique_ptr> mixed_join_semi( rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - CUDF_EXPECTS((join_type != join_kind::INNER_JOIN) && (join_type != join_kind::LEFT_JOIN) && + CUDF_EXPECTS((join_type != join_kind::INNER_JOIN) and (join_type != join_kind::LEFT_JOIN) and (join_type != join_kind::FULL_JOIN), "Inner, left, and full joins should use mixed_join."); @@ -136,7 +97,7 @@ std::unique_ptr> mixed_join_semi( // output column and follow the null-supporting expression evaluation code // path. auto const has_nulls = cudf::nullate::DYNAMIC{ - cudf::has_nulls(left_equality) || cudf::has_nulls(right_equality) || + cudf::has_nulls(left_equality) or cudf::has_nulls(right_equality) or binary_predicate.may_evaluate_null(left_conditional, right_conditional, stream)}; auto const parser = ast::detail::expression_parser{ @@ -155,27 +116,20 @@ std::unique_ptr> mixed_join_semi( auto right_conditional_view = table_device_view::create(right_conditional, stream); auto const preprocessed_build = - experimental::row::equality::preprocessed_table::create(build, stream); + cudf::experimental::row::equality::preprocessed_table::create(build, stream); auto const preprocessed_probe = - experimental::row::equality::preprocessed_table::create(probe, stream); + cudf::experimental::row::equality::preprocessed_table::create(probe, stream); auto const row_comparator = - cudf::experimental::row::equality::two_table_comparator{preprocessed_probe, preprocessed_build}; + cudf::experimental::row::equality::two_table_comparator{preprocessed_build, preprocessed_probe}; auto const equality_probe = row_comparator.equal_to(has_nulls, compare_nulls); - semi_map_type hash_table{ - compute_hash_table_size(build.num_rows()), - cuco::empty_key{std::numeric_limits::max()}, - cuco::empty_value{cudf::detail::JoinNoneValue}, - cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream}, - stream.value()}; - // Create hash table containing all keys found in right table // TODO: To add support for nested columns we will need to flatten in many // places. However, this probably isn't worth adding any time soon since we // won't be able to support AST conditions for those types anyway. 
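
Conceptually, the hash set assembled here implements the semi-join membership test: every build-table key is inserted into a set, and a probe row is kept if and only if its key is found. A host-side analogue of that logic, with std::unordered_set standing in for the device-side cuco::static_set (the helper name and int keys are illustrative, not cudf API):

#include <cstddef>
#include <unordered_set>
#include <vector>

// Host-side analogue of the device-side semi join: a probe-table row is kept
// if and only if its key appears in the set built from the right (build) table.
std::vector<bool> semi_join_keep_mask(std::vector<int> const& build_keys,
                                      std::vector<int> const& probe_keys)
{
  std::unordered_set<int> const build_set(build_keys.begin(), build_keys.end());
  std::vector<bool> keep(probe_keys.size());
  for (std::size_t i = 0; i < probe_keys.size(); ++i) {
    keep[i] = build_set.count(probe_keys[i]) > 0;
  }
  return keep;
}
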
auto const build_nulls = cudf::nullate::DYNAMIC{cudf::has_nulls(build)}; auto const row_hash_build = cudf::experimental::row::hash::row_hasher{preprocessed_build}; - auto const hash_build = row_hash_build.device_hasher(build_nulls); + // Since we may see multiple rows that are identical in the equality tables // but differ in the conditional tables, the equality comparator used for // insertion must account for both sets of tables. An alternative solution @@ -190,20 +144,28 @@ std::unique_ptr> mixed_join_semi( auto const equality_build_equality = row_comparator_build.equal_to(build_nulls, compare_nulls); auto const preprocessed_build_condtional = - experimental::row::equality::preprocessed_table::create(right_conditional, stream); + cudf::experimental::row::equality::preprocessed_table::create(right_conditional, stream); auto const row_comparator_conditional_build = cudf::experimental::row::equality::two_table_comparator{preprocessed_build_condtional, preprocessed_build_condtional}; auto const equality_build_conditional = row_comparator_conditional_build.equal_to(build_nulls, compare_nulls); - double_row_equality equality_build{equality_build_equality, equality_build_conditional}; - make_pair_function_semi pair_func_build{}; - auto iter = cudf::detail::make_counting_transform_iterator(0, pair_func_build); + hash_set_type row_set{ + {compute_hash_table_size(build.num_rows())}, + cuco::empty_key{JoinNoneValue}, + {equality_build_equality, equality_build_conditional}, + {row_hash_build.device_hasher(build_nulls)}, + {}, + {}, + cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream}, + {stream.value()}}; + + auto iter = thrust::make_counting_iterator(0); // skip rows that are null here. if ((compare_nulls == null_equality::EQUAL) or (not nullable(build))) { - hash_table.insert(iter, iter + right_num_rows, hash_build, equality_build, stream.value()); + row_set.insert_async(iter, iter + right_num_rows, stream.value()); } else { thrust::counting_iterator stencil(0); auto const [row_bitmask, _] = @@ -211,18 +173,20 @@ std::unique_ptr> mixed_join_semi( row_is_valid pred{static_cast(row_bitmask.data())}; // insert valid rows - hash_table.insert_if( - iter, iter + right_num_rows, stencil, pred, hash_build, equality_build, stream.value()); + row_set.insert_if_async(iter, iter + right_num_rows, stencil, pred, stream.value()); } - auto hash_table_view = hash_table.get_device_view(); - - detail::grid_1d const config(outer_num_rows, DEFAULT_JOIN_BLOCK_SIZE); - auto const shmem_size_per_block = parser.shmem_per_thread * config.num_threads_per_block; + detail::grid_1d const config(outer_num_rows * hash_set_type::cg_size, DEFAULT_JOIN_BLOCK_SIZE); + auto const shmem_size_per_block = + parser.shmem_per_thread * + cuco::detail::int_div_ceil(config.num_threads_per_block, hash_set_type::cg_size); auto const row_hash = cudf::experimental::row::hash::row_hasher{preprocessed_probe}; auto const hash_probe = row_hash.device_hasher(has_nulls); + hash_set_ref_type const row_set_ref = + row_set.ref(cuco::contains).rebind_hash_function(hash_probe); + // Vector used to indicate indices from left/probe table which are present in output auto left_table_keep_mask = rmm::device_uvector(probe.num_rows(), stream); @@ -231,9 +195,8 @@ std::unique_ptr> mixed_join_semi( *right_conditional_view, *probe_view, *build_view, - hash_probe, equality_probe, - hash_table_view, + row_set_ref, cudf::device_span(left_table_keep_mask), parser.device_expression_data, config, diff --git a/cpp/src/join/mixed_join_size_kernel.cuh 
b/cpp/src/join/mixed_join_size_kernel.cuh index 84e9be45030..4049ccf35e1 100644 --- a/cpp/src/join/mixed_join_size_kernel.cuh +++ b/cpp/src/join/mixed_join_size_kernel.cuh @@ -20,6 +20,7 @@ #include #include +#include #include #include #include @@ -122,7 +123,7 @@ std::size_t launch_compute_mixed_join_output_size( rmm::device_async_resource_ref mr) { // Allocate storage for the counter used to get the size of the join output - rmm::device_scalar size(0, stream, mr); + cudf::detail::device_scalar size(0, stream, mr); compute_mixed_join_output_size <<>>( diff --git a/cpp/src/json/json_path.cu b/cpp/src/json/json_path.cu index 59fdbedf089..fb5cf66dd60 100644 --- a/cpp/src/json/json_path.cu +++ b/cpp/src/json/json_path.cu @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -1031,7 +1032,7 @@ std::unique_ptr get_json_object(cudf::strings_column_view const& c cudf::detail::create_null_mask(col.size(), mask_state::UNINITIALIZED, stream, mr); // compute results - rmm::device_scalar d_valid_count{0, stream}; + cudf::detail::device_scalar d_valid_count{0, stream}; get_json_object_kernel <<>>( diff --git a/cpp/src/partitioning/partitioning.cu b/cpp/src/partitioning/partitioning.cu index 17008e80e79..ebab3beb08f 100644 --- a/cpp/src/partitioning/partitioning.cu +++ b/cpp/src/partitioning/partitioning.cu @@ -834,10 +834,11 @@ std::pair, std::vector> partition( table_view const& t, column_view const& partition_map, size_type num_partitions, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::partition(t, partition_map, num_partitions, cudf::get_default_stream(), mr); + return detail::partition(t, partition_map, num_partitions, stream, mr); } } // namespace cudf diff --git a/cpp/src/partitioning/round_robin.cu b/cpp/src/partitioning/round_robin.cu index 5a4c90a67a5..ab6ab393878 100644 --- a/cpp/src/partitioning/round_robin.cu +++ b/cpp/src/partitioning/round_robin.cu @@ -273,11 +273,11 @@ std::pair, std::vector> round_robi table_view const& input, cudf::size_type num_partitions, cudf::size_type start_partition, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::round_robin_partition( - input, num_partitions, start_partition, cudf::get_default_stream(), mr); + return detail::round_robin_partition(input, num_partitions, start_partition, stream, mr); } } // namespace cudf diff --git a/cpp/src/quantiles/quantile.cu b/cpp/src/quantiles/quantile.cu index 80fd72a3088..21f6fe87a62 100644 --- a/cpp/src/quantiles/quantile.cu +++ b/cpp/src/quantiles/quantile.cu @@ -195,10 +195,11 @@ std::unique_ptr quantile(column_view const& input, interpolation interp, column_view const& ordered_indices, bool exact, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::quantile(input, q, interp, ordered_indices, exact, cudf::get_default_stream(), mr); + return detail::quantile(input, q, interp, ordered_indices, exact, stream, mr); } } // namespace cudf diff --git a/cpp/src/quantiles/quantiles.cu b/cpp/src/quantiles/quantiles.cu index 69421f3bfc4..a94fb9362b9 100644 --- a/cpp/src/quantiles/quantiles.cu +++ b/cpp/src/quantiles/quantiles.cu @@ -103,17 +103,12 @@ std::unique_ptr
quantiles(table_view const& input, cudf::sorted is_input_sorted, std::vector const& column_order, std::vector const& null_precedence, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::quantiles(input, - q, - interp, - is_input_sorted, - column_order, - null_precedence, - cudf::get_default_stream(), - mr); + return detail::quantiles( + input, q, interp, is_input_sorted, column_order, null_precedence, stream, mr); } } // namespace cudf diff --git a/cpp/src/quantiles/tdigest/tdigest.cu b/cpp/src/quantiles/tdigest/tdigest.cu index 0d017cf1f13..fb5aebb4b39 100644 --- a/cpp/src/quantiles/tdigest/tdigest.cu +++ b/cpp/src/quantiles/tdigest/tdigest.cu @@ -292,32 +292,33 @@ std::unique_ptr make_tdigest_column(size_type num_rows, return make_structs_column(num_rows, std::move(children), 0, {}, stream, mr); } -std::unique_ptr make_empty_tdigest_column(rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) +std::unique_ptr make_empty_tdigests_column(size_type num_rows, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) { auto offsets = cudf::make_fixed_width_column( - data_type(type_id::INT32), 2, mask_state::UNALLOCATED, stream, mr); + data_type(type_id::INT32), num_rows + 1, mask_state::UNALLOCATED, stream, mr); thrust::fill(rmm::exec_policy(stream), offsets->mutable_view().begin(), offsets->mutable_view().end(), 0); - auto min_col = - cudf::make_numeric_column(data_type(type_id::FLOAT64), 1, mask_state::UNALLOCATED, stream, mr); + auto min_col = cudf::make_numeric_column( + data_type(type_id::FLOAT64), num_rows, mask_state::UNALLOCATED, stream, mr); thrust::fill(rmm::exec_policy(stream), min_col->mutable_view().begin(), min_col->mutable_view().end(), 0); - auto max_col = - cudf::make_numeric_column(data_type(type_id::FLOAT64), 1, mask_state::UNALLOCATED, stream, mr); + auto max_col = cudf::make_numeric_column( + data_type(type_id::FLOAT64), num_rows, mask_state::UNALLOCATED, stream, mr); thrust::fill(rmm::exec_policy(stream), max_col->mutable_view().begin(), max_col->mutable_view().end(), 0); - return make_tdigest_column(1, - make_empty_column(type_id::FLOAT64), - make_empty_column(type_id::FLOAT64), + return make_tdigest_column(num_rows, + cudf::make_empty_column(type_id::FLOAT64), + cudf::make_empty_column(type_id::FLOAT64), std::move(offsets), std::move(min_col), std::move(max_col), @@ -338,7 +339,7 @@ std::unique_ptr make_empty_tdigest_column(rmm::cuda_stream_view stream, std::unique_ptr make_empty_tdigest_scalar(rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - auto contents = make_empty_tdigest_column(stream, mr)->release(); + auto contents = make_empty_tdigests_column(1, stream, mr)->release(); return std::make_unique( std::move(*std::make_unique
(std::move(contents.children))), true, stream, mr); } @@ -409,10 +410,11 @@ std::unique_ptr percentile_approx(tdigest_column_view const& input, std::unique_ptr percentile_approx(tdigest_column_view const& input, column_view const& percentiles, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return tdigest::percentile_approx(input, percentiles, cudf::get_default_stream(), mr); + return tdigest::percentile_approx(input, percentiles, stream, mr); } } // namespace cudf diff --git a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu index e1c1d2e3002..d27420658d6 100644 --- a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu +++ b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu @@ -169,19 +169,19 @@ struct nearest_value_scalar_weights { */ template struct nearest_value_centroid_weights { - double const* cumulative_weights; - GroupOffsetsIter outer_offsets; // groups - size_type const* inner_offsets; // tdigests within a group + double const* cumulative_weights; // cumulative weights of non-empty clusters + GroupOffsetsIter group_offsets; // groups + size_type const* tdigest_offsets; // tdigests within a group thrust::pair operator() __device__(double next_limit, size_type group_index) const { - auto const tdigest_begin = outer_offsets[group_index]; - auto const tdigest_end = outer_offsets[group_index + 1]; - auto const num_weights = inner_offsets[tdigest_end] - inner_offsets[tdigest_begin]; + auto const tdigest_begin = group_offsets[group_index]; + auto const tdigest_end = group_offsets[group_index + 1]; + auto const num_weights = tdigest_offsets[tdigest_end] - tdigest_offsets[tdigest_begin]; // NOTE: as it is today, this functor will never be called for any digests that are empty, but // I'll leave this check here for safety. if (num_weights == 0) { return thrust::pair{0, 0}; } - double const* group_cumulative_weights = cumulative_weights + inner_offsets[tdigest_begin]; + double const* group_cumulative_weights = cumulative_weights + tdigest_offsets[tdigest_begin]; auto const index = ((thrust::lower_bound(thrust::seq, group_cumulative_weights, @@ -235,21 +235,26 @@ struct cumulative_scalar_weight { */ template struct cumulative_centroid_weight { - double const* cumulative_weights; - GroupLabelsIter group_labels; - GroupOffsetsIter outer_offsets; // groups - cudf::device_span inner_offsets; // tdigests with a group - + double const* cumulative_weights; // cumulative weights of non-empty clusters + GroupLabelsIter group_labels; // group labels for each tdigest including empty ones + GroupOffsetsIter group_offsets; // groups + cudf::device_span tdigest_offsets; // tdigests within a group + + /** + * @brief Returns the cumulative weight for a given value index. The index `n` refers to the + * `n`-th non-empty cluster.
+ */ std::tuple operator() __device__(size_type value_index) const { auto const tdigest_index = static_cast( - thrust::upper_bound(thrust::seq, inner_offsets.begin(), inner_offsets.end(), value_index) - - inner_offsets.begin()) - + thrust::upper_bound( + thrust::seq, tdigest_offsets.begin(), tdigest_offsets.end(), value_index) - + tdigest_offsets.begin()) - 1; auto const group_index = group_labels[tdigest_index]; - auto const first_tdigest_index = outer_offsets[group_index]; - auto const first_weight_index = inner_offsets[first_tdigest_index]; + auto const first_tdigest_index = group_offsets[group_index]; + auto const first_weight_index = tdigest_offsets[first_tdigest_index]; auto const relative_value_index = value_index - first_weight_index; double const* group_cumulative_weights = cumulative_weights + first_weight_index; @@ -284,15 +289,15 @@ struct scalar_group_info { // retrieve group info of centroid inputs by group index template struct centroid_group_info { - double const* cumulative_weights; - GroupOffsetsIter outer_offsets; - size_type const* inner_offsets; + double const* cumulative_weights; // cumulative weights of non-empty clusters + GroupOffsetsIter group_offsets; + size_type const* tdigest_offsets; __device__ thrust::tuple operator()(size_type group_index) const { // if there's no weights in this group of digests at all, return 0. - auto const group_start = inner_offsets[outer_offsets[group_index]]; - auto const group_end = inner_offsets[outer_offsets[group_index + 1]]; + auto const group_start = tdigest_offsets[group_offsets[group_index]]; + auto const group_end = tdigest_offsets[group_offsets[group_index + 1]]; auto const num_weights = group_end - group_start; auto const last_weight_index = group_end - 1; return num_weights == 0 @@ -367,7 +372,6 @@ std::unique_ptr to_tdigest_scalar(std::unique_ptr&& tdigest, * @param group_num_clusters Output. The number of output clusters for each input group. * @param group_cluster_offsets Offsets per-group to the start of it's clusters * @param has_nulls Whether or not the input contains nulls - * */ template @@ -661,6 +665,10 @@ std::unique_ptr build_output_column(size_type num_rows, mr); } +/** + * @brief A functor which returns the cluster index within a group that the value at + * the given value index falls into. + */ template struct compute_tdigests_keys_fn { int const delta; @@ -706,8 +714,8 @@ struct compute_tdigests_keys_fn { * boundaries. * * @param delta tdigest compression level - * @param values_begin Beginning of the range of input values. - * @param values_end End of the range of input values. + * @param centroids_begin Beginning of the range of centroids. + * @param centroids_end End of the range of centroids. * @param cumulative_weight Functor which returns cumulative weight and group information for * an absolute input value index. * @param min_col Column containing the minimum value per group. @@ -750,7 +758,9 @@ std::unique_ptr compute_tdigests(int delta, // double // max // } // - if (total_clusters == 0) { return cudf::tdigest::detail::make_empty_tdigest_column(stream, mr); } + if (total_clusters == 0) { + return cudf::tdigest::detail::make_empty_tdigests_column(1, stream, mr); + } // each input group represents an individual tdigest. within each tdigest, we want the keys // to represent cluster indices (for example, if a tdigest had 100 clusters, the keys should fall @@ -983,38 +993,54 @@ struct typed_reduce_tdigest { } }; -// utility for merge_tdigests. 
+/** + * @brief Functor to compute the number of clusters in each group. + * + * Used in `merge_tdigests`. + */ template -struct group_num_weights_func { - GroupOffsetsIter outer_offsets; - size_type const* inner_offsets; +struct group_num_clusters_func { + GroupOffsetsIter group_offsets; + size_type const* tdigest_offsets; __device__ size_type operator()(size_type group_index) { - auto const tdigest_begin = outer_offsets[group_index]; - auto const tdigest_end = outer_offsets[group_index + 1]; - return inner_offsets[tdigest_end] - inner_offsets[tdigest_begin]; + auto const tdigest_begin = group_offsets[group_index]; + auto const tdigest_end = group_offsets[group_index + 1]; + return tdigest_offsets[tdigest_end] - tdigest_offsets[tdigest_begin]; } }; -// utility for merge_tdigests. +/** + * @brief Functor to determine if a group is empty. + * + * Used in `merge_tdigests`. + */ struct group_is_empty { __device__ bool operator()(size_type group_size) { return group_size == 0; } }; -// utility for merge_tdigests. +/** + * @brief Functor that returns the grouping key for each tdigest cluster. + * + * Used in `merge_tdigests`. + */ template struct group_key_func { GroupLabelsIter group_labels; - size_type const* inner_offsets; - size_type num_inner_offsets; + size_type const* tdigest_offsets; + size_type num_tdigest_offsets; + /** + * @brief Returns the group index for an absolute cluster index. The index `n` refers to the + * `n`-th non-empty cluster. + */ __device__ size_type operator()(size_type index) { // what -original- tdigest index this absolute index corresponds to - auto const iter = thrust::prev( - thrust::upper_bound(thrust::seq, inner_offsets, inner_offsets + num_inner_offsets, index)); - auto const tdigest_index = thrust::distance(inner_offsets, iter); + auto const iter = thrust::prev(thrust::upper_bound( + thrust::seq, tdigest_offsets, tdigest_offsets + num_tdigest_offsets, index)); + auto const tdigest_index = thrust::distance(tdigest_offsets, iter); // what group index the original tdigest belongs to return group_labels[tdigest_index]; @@ -1040,8 +1066,8 @@ std::pair, rmm::device_uvector> generate_mer // each group represents a collection of tdigest columns. each row is 1 tdigest. // within each group, we want to sort all the centroids within all the tdigests - // in that group, using the means as the key. the "outer offsets" represent the indices of the - // tdigests, and the "inner offsets" represents the list of centroids for a particular tdigest. + // in that group, using the means as the key. the "group offsets" represent the indices of the + // tdigests, and the "tdigest offsets" represent the list of centroids for a particular tdigest.
// // rows // ---- centroid 0 --------- // tdigest 0 centroid 1 // ---- centroid 2 // tdigest 1 centroid 3 // // tdigest 2 centroid 4 // ---- centroid 5 -------- // tdigest 3 centroid 7 // centroid 8 // ---- centroid 9 -------- - auto inner_offsets = tdv.centroids().offsets(); + auto tdigest_offsets = tdv.centroids().offsets(); auto centroid_offsets = cudf::detail::make_counting_transform_iterator( 0, cuda::proclaim_return_type( - [group_offsets, inner_offsets = tdv.centroids().offsets().begin()] __device__( size_type i) { return inner_offsets[group_offsets[i]]; })); + [group_offsets, tdigest_offsets = tdv.centroids().offsets().begin()] __device__( size_type i) { return tdigest_offsets[group_offsets[i]]; })); // perform the sort using the means as the key size_t temp_size; @@ -1091,9 +1117,29 @@ std::pair, rmm::device_uvector> generate_mer return {std::move(output_means), std::move(output_weights)}; } -template +/** + * @brief Perform a merge aggregation of tdigests. This function usually takes as input the + * outputs of multiple `typed_group_tdigest` calls and merges them. + * + * A tdigest can be empty in the input, which means that there was no valid input data to generate + * it. These empty tdigests will have no centroids (means or weights) and will have a `min` and + * `max` of 0. + * + * @param tdv input tdigests. The tdigests within this column are grouped by key. + * @param group_offsets a device iterator of the offsets to the start of each group. A group is + * counted as one even when all the clusters in it are empty. + * @param group_labels a device iterator of the group label for each tdigest cluster, including + * empty clusters. + * @param num_group_labels the number of unique group labels. + * @param num_groups the number of groups. + * @param max_centroids the maximum number of centroids (clusters) in the output (merged) tdigest. + * @param stream CUDA stream + * @param mr device memory resource + * + * @return A column containing the merged tdigests. + */ +template std::unique_ptr merge_tdigests(tdigest_column_view const& tdv, - HGroupOffsetIter h_outer_offsets, GroupOffsetIter group_offsets, GroupLabelIter group_labels, size_t num_group_labels, size_t num_groups, ... thrust::equal_to{}, // key equality check thrust::maximum{}); + auto tdigest_offsets = tdv.centroids().offsets(); + // for any empty groups, set the min and max to be 0. not technically necessary but it makes // testing simpler.
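
The empty-group fix-up described in the comment above is a stencil-based replace. A minimal sketch of the same pattern, assuming per-group cluster counts have already been computed (function and variable names are illustrative, and it assumes nvcc's extended device lambdas):

#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/replace.h>

// Any group whose cluster count is zero gets min = max = 0, mirroring the
// two replace_if calls that follow in the diff.
void zero_empty_group_extrema(thrust::device_vector<double>& group_min,
                              thrust::device_vector<double>& group_max,
                              thrust::device_vector<int> const& clusters_per_group)
{
  auto const is_empty = [] __device__(int num_clusters) { return num_clusters == 0; };
  thrust::replace_if(thrust::device, group_min.begin(), group_min.end(),
                     clusters_per_group.begin(), is_empty, 0.0);
  thrust::replace_if(thrust::device, group_max.begin(), group_max.end(),
                     clusters_per_group.begin(), is_empty, 0.0);
}
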
- auto group_num_weights = cudf::detail::make_counting_transform_iterator( + auto group_num_clusters = cudf::detail::make_counting_transform_iterator( 0, - group_num_weights_func{group_offsets, - tdv.centroids().offsets().begin()}); + group_num_clusters_func{group_offsets, + tdigest_offsets.begin()}); thrust::replace_if(rmm::exec_policy(stream), merged_min_col->mutable_view().begin(), merged_min_col->mutable_view().end(), - group_num_weights, + group_num_clusters, group_is_empty{}, 0); thrust::replace_if(rmm::exec_policy(stream), merged_max_col->mutable_view().begin(), merged_max_col->mutable_view().end(), - group_num_weights, + group_num_clusters, group_is_empty{}, 0); @@ -1166,14 +1214,13 @@ std::unique_ptr merge_tdigests(tdigest_column_view const& tdv, // generate group keys for all centroids in the entire column rmm::device_uvector group_keys(num_centroids, stream, temp_mr); - auto iter = thrust::make_counting_iterator(0); - auto inner_offsets = tdv.centroids().offsets(); + auto iter = thrust::make_counting_iterator(0); thrust::transform(rmm::exec_policy(stream), iter, iter + num_centroids, group_keys.begin(), group_key_func{ - group_labels, inner_offsets.begin(), inner_offsets.size()}); + group_labels, tdigest_offsets.begin(), tdigest_offsets.size()}); thrust::inclusive_scan_by_key(rmm::exec_policy(stream), group_keys.begin(), group_keys.begin() + num_centroids, @@ -1182,20 +1229,24 @@ std::unique_ptr merge_tdigests(tdigest_column_view const& tdv, auto const delta = max_centroids; + // TDigest merge takes the output of typed_group_tdigest as its input, which must not have + // any nulls. + auto const has_nulls = false; + // generate cluster info auto [group_cluster_wl, group_cluster_offsets, total_clusters] = generate_group_cluster_info( delta, num_groups, nearest_value_centroid_weights{ - cumulative_weights.begin(), group_offsets, inner_offsets.begin()}, + cumulative_weights.begin(), group_offsets, tdigest_offsets.begin()}, centroid_group_info{ - cumulative_weights.begin(), group_offsets, inner_offsets.begin()}, + cumulative_weights.begin(), group_offsets, tdigest_offsets.begin()}, cumulative_centroid_weight{ cumulative_weights.begin(), group_labels, group_offsets, - {inner_offsets.begin(), static_cast(inner_offsets.size())}}, - false, + {tdigest_offsets.begin(), static_cast(tdigest_offsets.size())}}, + has_nulls, stream, mr); @@ -1212,13 +1263,13 @@ std::unique_ptr merge_tdigests(tdigest_column_view const& tdv, cumulative_weights.begin(), group_labels, group_offsets, - {inner_offsets.begin(), static_cast(inner_offsets.size())}}, + {tdigest_offsets.begin(), static_cast(tdigest_offsets.size())}}, std::move(merged_min_col), std::move(merged_max_col), group_cluster_wl, std::move(group_cluster_offsets), total_clusters, - false, + has_nulls, stream, mr); } @@ -1257,21 +1308,13 @@ std::unique_ptr reduce_merge_tdigest(column_view const& input, if (input.size() == 0) { return cudf::tdigest::detail::make_empty_tdigest_scalar(stream, mr); } - auto group_offsets_ = group_offsets_fn{input.size()}; - auto h_group_offsets = cudf::detail::make_counting_transform_iterator(0, group_offsets_); - auto group_offsets = cudf::detail::make_counting_transform_iterator(0, group_offsets_); - auto group_labels = thrust::make_constant_iterator(0); - return to_tdigest_scalar(merge_tdigests(tdv, - h_group_offsets, - group_offsets, - group_labels, - input.size(), - 1, - max_centroids, - stream, - mr), - stream, - mr); + auto group_offsets_ = group_offsets_fn{input.size()}; + auto group_offsets = 
cudf::detail::make_counting_transform_iterator(0, group_offsets_); + auto group_labels = thrust::make_constant_iterator(0); + return to_tdigest_scalar( + merge_tdigests(tdv, group_offsets, group_labels, input.size(), 1, max_centroids, stream, mr), + stream, + mr); } std::unique_ptr group_tdigest(column_view const& col, @@ -1283,7 +1326,7 @@ std::unique_ptr group_tdigest(column_view const& col, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - if (col.size() == 0) { return cudf::tdigest::detail::make_empty_tdigest_column(stream, mr); } + if (col.size() == 0) { return cudf::tdigest::detail::make_empty_tdigests_column(1, stream, mr); } auto const delta = max_centroids; return cudf::type_dispatcher(col.type(), @@ -1309,19 +1352,18 @@ std::unique_ptr group_merge_tdigest(column_view const& input, tdigest_column_view tdv(input); if (num_groups == 0 || input.size() == 0) { - return cudf::tdigest::detail::make_empty_tdigest_column(stream, mr); + return cudf::tdigest::detail::make_empty_tdigests_column(1, stream, mr); } - // bring group offsets back to the host - std::vector h_group_offsets(group_offsets.size()); - cudaMemcpyAsync(h_group_offsets.data(), - group_offsets.begin(), - sizeof(size_type) * group_offsets.size(), - cudaMemcpyDefault, - stream); + if (tdv.means().size() == 0) { + // `group_merge_tdigest` takes the output of `typed_group_tdigest` as its input, which wipes + // out the means and weights for empty clusters. Thus, no mean here indicates that all clusters + // are empty in the input. Let's skip all complex computation in the below, but just return + // an empty tdigest per group. + return cudf::tdigest::detail::make_empty_tdigests_column(num_groups, stream, mr); + } return merge_tdigests(tdv, - h_group_offsets.begin(), group_offsets.data(), group_labels.data(), group_labels.size(), diff --git a/cpp/src/quantiles/tdigest/tdigest_column_view.cpp b/cpp/src/quantiles/tdigest/tdigest_column_view.cpp index a9f86ac1b5f..17844b6bb0a 100644 --- a/cpp/src/quantiles/tdigest/tdigest_column_view.cpp +++ b/cpp/src/quantiles/tdigest/tdigest_column_view.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,7 +14,6 @@ * limitations under the License. 
*/ -#include #include #include #include diff --git a/cpp/src/reductions/all.cu b/cpp/src/reductions/all.cu index 67ea29a2cb1..890625830a5 100644 --- a/cpp/src/reductions/all.cu +++ b/cpp/src/reductions/all.cu @@ -16,6 +16,7 @@ #include "simple.cuh" +#include #include #include #include @@ -65,7 +66,8 @@ struct all_fn { cudf::dictionary::detail::make_dictionary_pair_iterator(*d_dict, input.has_nulls()); return thrust::make_transform_iterator(pair_iter, null_iter); }(); - auto d_result = rmm::device_scalar(1, stream, cudf::get_current_device_resource_ref()); + auto d_result = + cudf::detail::device_scalar(1, stream, cudf::get_current_device_resource_ref()); thrust::for_each_n(rmm::exec_policy(stream), thrust::make_counting_iterator(0), input.size(), diff --git a/cpp/src/reductions/any.cu b/cpp/src/reductions/any.cu index 057f038c622..d70da369d72 100644 --- a/cpp/src/reductions/any.cu +++ b/cpp/src/reductions/any.cu @@ -16,6 +16,7 @@ #include "simple.cuh" +#include #include #include #include @@ -65,7 +66,8 @@ struct any_fn { cudf::dictionary::detail::make_dictionary_pair_iterator(*d_dict, input.has_nulls()); return thrust::make_transform_iterator(pair_iter, null_iter); }(); - auto d_result = rmm::device_scalar(0, stream, cudf::get_current_device_resource_ref()); + auto d_result = + cudf::detail::device_scalar(0, stream, cudf::get_current_device_resource_ref()); thrust::for_each_n(rmm::exec_policy(stream), thrust::make_counting_iterator(0), input.size(), diff --git a/cpp/src/reductions/compound.cuh b/cpp/src/reductions/compound.cuh index 6bc8b48832f..cd9fade164a 100644 --- a/cpp/src/reductions/compound.cuh +++ b/cpp/src/reductions/compound.cuh @@ -18,13 +18,18 @@ #include #include +#include #include +#include #include #include #include #include +#include +#include + namespace cudf { namespace reduction { namespace compound { @@ -53,9 +58,17 @@ std::unique_ptr compound_reduction(column_view const& col, { auto const valid_count = col.size() - col.null_count(); + // All null input produces all null output + if (valid_count == 0 || + // Only care about ddof for standard deviation and variance right now + valid_count <= ddof && (std::is_same_v || + std::is_same_v)) { + auto result = cudf::make_fixed_width_scalar(output_dtype, stream, mr); + result->set_valid_async(false, stream); + return result; + } // reduction by iterator auto dcol = cudf::column_device_view::create(col, stream); - std::unique_ptr result; Op compound_op{}; if (!cudf::is_dictionary(col.type())) { @@ -63,25 +76,21 @@ std::unique_ptr compound_reduction(column_view const& col, auto it = thrust::make_transform_iterator( dcol->pair_begin(), compound_op.template get_null_replacing_element_transformer()); - result = cudf::reduction::detail::reduce( + return cudf::reduction::detail::reduce( it, col.size(), compound_op, valid_count, ddof, stream, mr); } else { auto it = thrust::make_transform_iterator( dcol->begin(), compound_op.template get_element_transformer()); - result = cudf::reduction::detail::reduce( + return cudf::reduction::detail::reduce( it, col.size(), compound_op, valid_count, ddof, stream, mr); } } else { auto it = thrust::make_transform_iterator( cudf::dictionary::detail::make_dictionary_pair_iterator(*dcol, col.has_nulls()), compound_op.template get_null_replacing_element_transformer()); - result = cudf::reduction::detail::reduce( + return cudf::reduction::detail::reduce( it, col.size(), compound_op, valid_count, ddof, stream, mr); } - - // set scalar is valid - result->set_valid_async(col.null_count() < col.size(), 
stream); - return result; }; // @brief result type dispatcher for compound reduction (a.k.a. mean, var, std) @@ -137,6 +146,7 @@ struct element_type_dispatcher { rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { + CUDF_EXPECTS(ddof >= 0, "ddof must be non-negative", std::domain_error); return cudf::type_dispatcher( output_dtype, result_type_dispatcher(), col, output_dtype, ddof, stream, mr); } diff --git a/cpp/src/reductions/histogram.cu b/cpp/src/reductions/histogram.cu index 362b5f74c46..b40b2b6dd2e 100644 --- a/cpp/src/reductions/histogram.cu +++ b/cpp/src/reductions/histogram.cu @@ -15,18 +15,24 @@ */ #include +#include #include -#include #include #include #include +#include #include +#include + +#include +#include #include #include #include #include #include +#include #include @@ -34,61 +40,12 @@ namespace cudf::reduction::detail { namespace { +// A CUDA Cooperative Group of 1 thread for the hash set for histogram +auto constexpr DEFAULT_HISTOGRAM_CG_SIZE = 1; + // Always use 64-bit signed integer for storing count. using histogram_count_type = int64_t; -/** - * @brief The functor to accumulate the frequency of each distinct rows in the input table. - */ -template -struct reduce_fn : cudf::detail::reduce_by_row_fn_base { - CountType const* d_partial_output; - - reduce_fn(MapView const& d_map, - KeyHasher const& d_hasher, - KeyEqual const& d_equal, - CountType* const d_output, - CountType const* const d_partial_output) - : cudf::detail::reduce_by_row_fn_base{d_map, - d_hasher, - d_equal, - d_output}, - d_partial_output{d_partial_output} - { - } - - // Count the number of rows in each group of rows that are compared equal. - __device__ void operator()(size_type const idx) const - { - auto const increment = d_partial_output ? d_partial_output[idx] : CountType{1}; - auto const count = - cuda::atomic_ref(*this->get_output_ptr(idx)); - count.fetch_add(increment, cuda::std::memory_order_relaxed); - } -}; - -/** - * @brief The builder to construct an instance of `reduce_fn` functor. - */ -template -struct reduce_func_builder { - CountType const* const d_partial_output; - - reduce_func_builder(CountType const* const d_partial_output) : d_partial_output{d_partial_output} - { - } - - template - auto build(MapView const& d_map, - KeyHasher const& d_hasher, - KeyEqual const& d_equal, - CountType* const d_output) - { - return reduce_fn{ - d_map, d_hasher, d_equal, d_output, d_partial_output}; - } -}; - /** * @brief Specialized functor to check for not-zero of the second component of the input. 
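The new early exit in compound_reduction returns an invalid scalar when there are no valid rows, or when valid_count <= ddof for standard deviation and variance, where the denominator (valid_count - ddof) would be zero or negative. A self-contained illustration of why that guard is needed (host C++, not the library code; std::optional stands in for an invalid scalar):

#include <optional>
#include <vector>

// Sample variance with delta degrees of freedom (ddof).
std::optional<double> variance(std::vector<double> const& v, int ddof) {
  auto const n = static_cast<int>(v.size());
  if (n == 0 || n <= ddof) { return std::nullopt; }  // guard: denominator would be <= 0
  double mean = 0.0;
  for (auto x : v) { mean += x; }
  mean /= n;
  double ss = 0.0;
  for (auto x : v) { ss += (x - mean) * (x - mean); }
  return ss / (n - ddof);  // safe: n - ddof > 0 here
}

int main() {
  std::vector<double> v{1.0, 2.0};
  // ddof == n, so the result is invalid (exit code 0 confirms nullopt)
  return variance(v, 2).has_value() ? 1 : 0;
}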
*/ @@ -163,14 +120,6 @@ compute_row_frequencies(table_view const& input, "Nested types are not yet supported in histogram aggregation.", std::invalid_argument); - auto map = cudf::detail::hash_map_type{ - compute_hash_table_size(input.num_rows()), - cuco::empty_key{-1}, - cuco::empty_value{std::numeric_limits::min()}, - - cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream}, - stream.value()}; - auto const preprocessed_input = cudf::experimental::row::hash::preprocessed_table::create(input, stream); auto const has_nulls = nullate::DYNAMIC{cudf::has_nested_nulls(input)}; @@ -179,51 +128,68 @@ compute_row_frequencies(table_view const& input, auto const key_hasher = row_hasher.device_hasher(has_nulls); auto const row_comp = cudf::experimental::row::equality::self_comparator(preprocessed_input); - auto const pair_iter = cudf::detail::make_counting_transform_iterator( - size_type{0}, - cuda::proclaim_return_type>( - [] __device__(size_type const i) { return cuco::make_pair(i, i); })); - // Always compare NaNs as equal. using nan_equal_comparator = cudf::experimental::row::equality::nan_equal_physical_equality_comparator; auto const value_comp = nan_equal_comparator{}; + // Hard set the tparam `has_nested_columns` = false for now as we don't yet support nested columns + auto const key_equal = row_comp.equal_to(has_nulls, null_equality::EQUAL, value_comp); + + using row_hash = + cudf::experimental::row::hash::device_row_hasher; + + size_t const num_rows = input.num_rows(); + + // Construct a vector to store reduced counts and init to zero + rmm::device_uvector reduction_results(num_rows, stream, mr); + thrust::uninitialized_fill(rmm::exec_policy_nosync(stream), + reduction_results.begin(), + reduction_results.end(), + histogram_count_type{0}); + + // Construct a hash set + auto row_set = cuco::static_set{ + cuco::extent{num_rows}, + cudf::detail::CUCO_DESIRED_LOAD_FACTOR, + cuco::empty_key{-1}, + key_equal, + cuco::linear_probing{key_hasher}, + {}, // thread scope + {}, // storage + cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream}, + stream.value()}; - if (has_nested_columns) { - auto const key_equal = row_comp.equal_to(has_nulls, null_equality::EQUAL, value_comp); - map.insert(pair_iter, pair_iter + input.num_rows(), key_hasher, key_equal, stream.value()); - } else { - auto const key_equal = row_comp.equal_to(has_nulls, null_equality::EQUAL, value_comp); - map.insert(pair_iter, pair_iter + input.num_rows(), key_hasher, key_equal, stream.value()); - } - - // Gather the indices of distinct rows. - auto distinct_indices = std::make_unique>( - static_cast(map.get_size()), stream, mr); - - // Store the number of occurrences of each distinct row. - auto distinct_counts = make_numeric_column(data_type{type_to_id()}, - static_cast(map.get_size()), - mask_state::UNALLOCATED, - stream, - mr); + // Device-accessible reference to the hash set with `insert_and_find` operator + auto row_set_ref = row_set.ref(cuco::op::insert_and_find); // Compute frequencies (aka distinct counts) for the input rows. // Note that we consider null and NaNs as always equal. - auto const reduction_results = cudf::detail::hash_reduce_by_row( - map, - preprocessed_input, - input.num_rows(), - has_nulls, - has_nested_columns, - null_equality::EQUAL, - nan_equality::ALL_EQUAL, - reduce_func_builder{ - partial_counts ? 
partial_counts.value().begin() : nullptr}, - histogram_count_type{0}, - stream, - cudf::get_current_device_resource_ref()); - + thrust::for_each( + rmm::exec_policy_nosync(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_rows), + [set_ref = row_set_ref, + increments = + partial_counts.has_value() ? partial_counts.value().begin() : nullptr, + counts = reduction_results.begin()] __device__(auto const idx) mutable { + auto const [inserted_idx_ptr, _] = set_ref.insert_and_find(idx); + cuda::atomic_ref count_ref{ + counts[*inserted_idx_ptr]}; + auto const increment = increments ? increments[idx] : histogram_count_type{1}; + count_ref.fetch_add(increment, cuda::std::memory_order_relaxed); + }); + + // Set-size is the number of distinct (inserted) rows + auto const set_size = row_set.size(stream); + + // Vector of distinct indices + auto distinct_indices = std::make_unique>(set_size, stream, mr); + // Column of distinct counts + auto distinct_counts = make_numeric_column( + data_type{type_to_id()}, set_size, mask_state::UNALLOCATED, stream, mr); + + // Copy row indices and counts to the output if counts are non-zero auto const input_it = thrust::make_zip_iterator( thrust::make_tuple(thrust::make_counting_iterator(0), reduction_results.begin())); auto const output_it = thrust::make_zip_iterator(thrust::make_tuple( @@ -232,7 +198,7 @@ compute_row_frequencies(table_view const& input, // Reduction results above are either group sizes of equal rows, or `0`. // The final output is non-zero group sizes only. thrust::copy_if( - rmm::exec_policy(stream), input_it, input_it + input.num_rows(), output_it, is_not_zero{}); + rmm::exec_policy_nosync(stream), input_it, input_it + num_rows, output_it, is_not_zero{}); return {std::move(distinct_indices), std::move(distinct_counts)}; } diff --git a/cpp/src/reductions/minmax.cu b/cpp/src/reductions/minmax.cu index 139de068050..4f6eb23ce5b 100644 --- a/cpp/src/reductions/minmax.cu +++ b/cpp/src/reductions/minmax.cu @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -69,18 +70,18 @@ struct minmax_pair { * @param num_items number of items to reduce * @param binary_op binary operator used to reduce * @param stream CUDA stream to run kernels on. - * @return rmm::device_scalar + * @return cudf::detail::device_scalar */ template ::type> -rmm::device_scalar reduce_device(InputIterator d_in, - size_type num_items, - Op binary_op, - rmm::cuda_stream_view stream) +auto reduce_device(InputIterator d_in, + size_type num_items, + Op binary_op, + rmm::cuda_stream_view stream) { OutputType identity{}; - rmm::device_scalar result{identity, stream}; + cudf::detail::device_scalar result{identity, stream}; // Allocate temporary storage size_t storage_bytes = 0; diff --git a/cpp/src/reductions/reductions.cpp b/cpp/src/reductions/reductions.cpp index d187375b69f..75ebc078930 100644 --- a/cpp/src/reductions/reductions.cpp +++ b/cpp/src/reductions/reductions.cpp @@ -26,8 +26,6 @@ #include #include #include -#include -#include #include #include #include diff --git a/cpp/src/reductions/scan/scan.cpp b/cpp/src/reductions/scan/scan.cpp index d3c0b54f286..b91ae19b51a 100644 --- a/cpp/src/reductions/scan/scan.cpp +++ b/cpp/src/reductions/scan/scan.cpp @@ -14,13 +14,10 @@ * limitations under the License. 
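The histogram rewrite above replaces the hash_reduce_by_row helper with a cuco::static_set: each thread calls insert_and_find(idx), the first row inserted for each group of equal rows becomes the canonical slot, counts are accumulated there with an atomic fetch_add, and copy_if then compacts the non-zero entries. A host-side analogue of that insert-and-accumulate pattern, with std::unordered_map standing in for the GPU hash set:

#include <cstdio>
#include <string>
#include <unordered_map>
#include <vector>

int main() {
  std::vector<std::string> rows{"a", "b", "a", "c", "b", "a"};
  std::vector<long long> counts(rows.size(), 0);       // mirrors reduction_results
  std::unordered_map<std::string, size_t> first_seen;  // key -> canonical row index
  for (size_t idx = 0; idx < rows.size(); ++idx) {
    // try_emplace is the host stand-in for insert_and_find: it returns the
    // existing entry when the key was already inserted by an earlier row
    auto const it = first_seen.try_emplace(rows[idx], idx).first;
    counts[it->second] += 1;  // fetch_add on the canonical slot in the kernel
  }
  // non-zero entries are the distinct rows and their frequencies (the copy_if step)
  for (size_t idx = 0; idx < rows.size(); ++idx) {
    if (counts[idx] != 0) { std::printf("row %zu occurs %lld times\n", idx, counts[idx]); }
  }
  return 0;
}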
*/ -#include #include #include #include #include -#include -#include namespace cudf { diff --git a/cpp/src/reductions/segmented/reductions.cpp b/cpp/src/reductions/segmented/reductions.cpp index 40d1d8a0a53..c4f6c135dde 100644 --- a/cpp/src/reductions/segmented/reductions.cpp +++ b/cpp/src/reductions/segmented/reductions.cpp @@ -13,16 +13,12 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include -#include #include #include #include #include #include -#include #include -#include #include #include diff --git a/cpp/src/replace/nulls.cu b/cpp/src/replace/nulls.cu index 1df1549432f..d0e3358cc34 100644 --- a/cpp/src/replace/nulls.cu +++ b/cpp/src/replace/nulls.cu @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -137,7 +138,7 @@ struct replace_nulls_column_kernel_forwarder { auto device_out = cudf::mutable_column_device_view::create(output_view, stream); auto device_replacement = cudf::column_device_view::create(replacement, stream); - rmm::device_scalar valid_counter(0, stream); + cudf::detail::device_scalar valid_counter(0, stream); cudf::size_type* valid_count = valid_counter.data(); replace<<>>( diff --git a/cpp/src/replace/replace.cu b/cpp/src/replace/replace.cu index 86ec8cfc91e..0cc97ca05e0 100644 --- a/cpp/src/replace/replace.cu +++ b/cpp/src/replace/replace.cu @@ -37,6 +37,7 @@ #include #include #include +#include #include #include #include @@ -53,7 +54,6 @@ #include #include -#include #include #include @@ -182,7 +182,7 @@ struct replace_kernel_forwarder { rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - rmm::device_scalar valid_counter(0, stream); + cudf::detail::device_scalar valid_counter(0, stream); cudf::size_type* valid_count = valid_counter.data(); auto replace = [&] { diff --git a/cpp/src/rolling/detail/optimized_unbounded_window.cpp b/cpp/src/rolling/detail/optimized_unbounded_window.cpp index 72c23395a93..7cad31c0658 100644 --- a/cpp/src/rolling/detail/optimized_unbounded_window.cpp +++ b/cpp/src/rolling/detail/optimized_unbounded_window.cpp @@ -18,13 +18,10 @@ #include #include #include -#include #include #include #include #include -#include -#include #include namespace cudf::detail { diff --git a/cpp/src/rolling/detail/range_window_bounds.hpp b/cpp/src/rolling/detail/range_window_bounds.hpp index 8a53e937f98..77cb2a8c7f5 100644 --- a/cpp/src/rolling/detail/range_window_bounds.hpp +++ b/cpp/src/rolling/detail/range_window_bounds.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -16,10 +16,7 @@ #pragma once #include -#include #include -#include -#include namespace cudf { namespace detail { diff --git a/cpp/src/rolling/detail/rolling.cuh b/cpp/src/rolling/detail/rolling.cuh index 528700137bf..bc0ee2eb519 100644 --- a/cpp/src/rolling/detail/rolling.cuh +++ b/cpp/src/rolling/detail/rolling.cuh @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -49,7 +50,6 @@ #include #include -#include #include #include @@ -1105,7 +1105,7 @@ struct rolling_window_launcher { auto const d_inp_ptr = column_device_view::create(input, stream); auto const d_default_out_ptr = column_device_view::create(default_outputs, stream); auto const d_out_ptr = mutable_column_device_view::create(output->mutable_view(), stream); - auto d_valid_count = rmm::device_scalar{0, stream}; + auto d_valid_count = cudf::detail::device_scalar{0, stream}; auto constexpr block_size = 256; auto const grid = cudf::detail::grid_1d(input.size(), block_size); @@ -1271,7 +1271,7 @@ std::unique_ptr rolling_window_udf(column_view const& input, udf_agg._output_type, input.size(), cudf::mask_state::UNINITIALIZED, stream, mr); auto output_view = output->mutable_view(); - rmm::device_scalar device_valid_count{0, stream}; + cudf::detail::device_scalar device_valid_count{0, stream}; std::string kernel_name = jitify2::reflection::Template("cudf::rolling::jit::gpu_rolling_new") // diff --git a/cpp/src/rolling/range_window_bounds.cpp b/cpp/src/rolling/range_window_bounds.cpp index 69792136c64..7f698dfcd6b 100644 --- a/cpp/src/rolling/range_window_bounds.cpp +++ b/cpp/src/rolling/range_window_bounds.cpp @@ -19,7 +19,6 @@ #include #include #include -#include namespace cudf { namespace { diff --git a/cpp/src/round/round.cu b/cpp/src/round/round.cu index 8988d73fb02..332c440aea9 100644 --- a/cpp/src/round/round.cu +++ b/cpp/src/round/round.cu @@ -358,10 +358,11 @@ std::unique_ptr round(column_view const& input, std::unique_ptr round(column_view const& input, int32_t decimal_places, rounding_method method, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::round(input, decimal_places, method, cudf::get_default_stream(), mr); + return detail::round(input, decimal_places, method, stream, mr); } } // namespace cudf diff --git a/cpp/src/scalar/scalar.cpp b/cpp/src/scalar/scalar.cpp index 31535198c58..4ec2174a96f 100644 --- a/cpp/src/scalar/scalar.cpp +++ b/cpp/src/scalar/scalar.cpp @@ -26,8 +26,6 @@ #include #include -#include - #include namespace cudf { diff --git a/cpp/src/scalar/scalar_factories.cpp b/cpp/src/scalar/scalar_factories.cpp index 656fe61fbbe..9f242bdffe0 100644 --- a/cpp/src/scalar/scalar_factories.cpp +++ b/cpp/src/scalar/scalar_factories.cpp @@ -16,10 +16,8 @@ #include #include -#include #include #include -#include #include #include diff --git a/cpp/src/strings/case.cu b/cpp/src/strings/case.cu index 4c015f3cbed..6a7c8ea45e9 100644 --- a/cpp/src/strings/case.cu +++ b/cpp/src/strings/case.cu @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -348,7 +349,7 @@ std::unique_ptr convert_case(strings_column_view const& input, // This check incurs ~20% performance hit for smaller strings and so we only use it // after the threshold check above. The check makes very little impact for long strings // but results in a large performance gain when the input contains only single-byte characters. 
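The convert_case change above gates the single-byte fast path on a count of high-bit bytes: in UTF-8, every byte of a multi-byte sequence has its top bit set, so a zero count proves the input is pure ASCII (the kernel exploits this further by sampling every other byte). A minimal host version of the check:

#include <cstdio>
#include <string>

// Count bytes belonging to multi-byte UTF-8 sequences: any byte with the high
// bit set. Zero means the buffer is pure single-byte (ASCII) data.
int count_multibyte(std::string const& s) {
  int count = 0;
  for (unsigned char c : s) { count += (c & 0x80) != 0; }
  return count;
}

int main() {
  // "héllo" carries one 2-byte character, so both of its bytes are counted
  std::printf("%d %d\n", count_multibyte("hello"), count_multibyte("h\xC3\xA9llo"));  // 0 2
  return 0;
}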
- rmm::device_scalar mb_count(0, stream); + cudf::detail::device_scalar mb_count(0, stream); // cudf::detail::grid_1d is limited to size_type elements auto const num_blocks = util::div_rounding_up_safe(chars_size / bytes_per_thread, block_size); // we only need to check every other byte since either will contain high bit diff --git a/cpp/src/strings/convert/convert_durations.cu b/cpp/src/strings/convert/convert_durations.cu index 0db1adf1223..f5d052c6657 100644 --- a/cpp/src/strings/convert/convert_durations.cu +++ b/cpp/src/strings/convert/convert_durations.cu @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -152,12 +153,8 @@ struct format_compiler { } // create program in device memory - d_items.resize(items.size(), stream); - CUDF_CUDA_TRY(cudaMemcpyAsync(d_items.data(), - items.data(), - items.size() * sizeof(items[0]), - cudaMemcpyDefault, - stream.value())); + d_items = cudf::detail::make_device_uvector_sync( + items, stream, cudf::get_current_device_resource_ref()); } format_item const* compiled_format_items() { return d_items.data(); } diff --git a/cpp/src/strings/copying/concatenate.cu b/cpp/src/strings/copying/concatenate.cu index 1d9d12686eb..9e4ef47ff79 100644 --- a/cpp/src/strings/copying/concatenate.cu +++ b/cpp/src/strings/copying/concatenate.cu @@ -16,6 +16,7 @@ #include #include +#include #include #include #include @@ -27,7 +28,6 @@ #include #include -#include #include #include @@ -242,7 +242,7 @@ std::unique_ptr concatenate(host_span columns, } { // Copy offsets columns with single kernel launch - rmm::device_scalar d_valid_count(0, stream); + cudf::detail::device_scalar d_valid_count(0, stream); constexpr size_type block_size{256}; cudf::detail::grid_1d config(offsets_count, block_size); diff --git a/cpp/src/strings/extract/extract.cu b/cpp/src/strings/extract/extract.cu index 7323918dcff..8683a9bdfbe 100644 --- a/cpp/src/strings/extract/extract.cu +++ b/cpp/src/strings/extract/extract.cu @@ -100,9 +100,8 @@ std::unique_ptr
extract(strings_column_view const& input, auto const groups = d_prog->group_counts(); CUDF_EXPECTS(groups > 0, "Group indicators not found in regex pattern"); - auto indices = rmm::device_uvector(input.size() * groups, stream); - auto d_indices = - cudf::detail::device_2dspan(indices.data(), input.size(), groups); + auto indices = rmm::device_uvector(input.size() * groups, stream); + auto d_indices = cudf::detail::device_2dspan(indices, groups); auto const d_strings = column_device_view::create(input.parent(), stream); diff --git a/cpp/src/strings/regex/regcomp.cpp b/cpp/src/strings/regex/regcomp.cpp index 7c4c89bd3fb..b923a301f84 100644 --- a/cpp/src/strings/regex/regcomp.cpp +++ b/cpp/src/strings/regex/regcomp.cpp @@ -35,7 +35,7 @@ namespace strings { namespace detail { namespace { // Bitmask of all operators -#define OPERATOR_MASK 0200 +enum { OPERATOR_MASK = 0200 }; enum OperatorType : int32_t { START = 0200, // Start, used for marker on stack LBRA_NC = 0203, // non-capturing group @@ -50,7 +50,7 @@ enum OperatorType : int32_t { COUNTED_LAZY = 0215, NOP = 0302, // No operation, internal use only }; -#define ITEM_MASK 0300 +enum { ITEM_MASK = 0300 }; static reclass cclass_w(CCLASS_W); // \w static reclass cclass_s(CCLASS_S); // \s @@ -710,19 +710,17 @@ class regex_parser { std::stack lbra_stack; auto repeat_start_index = -1; - for (std::size_t index = 0; index < in.size(); index++) { - auto const item = in[index]; - + for (auto const item : in) { if (item.type != COUNTED && item.type != COUNTED_LAZY) { out.push_back(item); if (item.type == LBRA || item.type == LBRA_NC) { - lbra_stack.push(index); + lbra_stack.push(out.size() - 1); repeat_start_index = -1; } else if (item.type == RBRA) { repeat_start_index = lbra_stack.top(); lbra_stack.pop(); } else if ((item.type & ITEM_MASK) != OPERATOR_MASK) { - repeat_start_index = index; + repeat_start_index = out.size() - 1; } } else { // item is of type COUNTED or COUNTED_LAZY @@ -731,26 +729,39 @@ class regex_parser { CUDF_EXPECTS(repeat_start_index >= 0, "regex: invalid counted quantifier location"); // range of affected item(s) to repeat - auto const begin = in.begin() + repeat_start_index; - auto const end = in.begin() + index; + auto const begin = out.begin() + repeat_start_index; + auto const end = out.end(); + // count range values auto const n = item.d.count.n; // minimum count auto const m = item.d.count.m; // maximum count - assert(n >= 0 && "invalid repeat count value n"); // zero-repeat edge-case: need to erase the previous items - if (n == 0) { out.erase(out.end() - (index - repeat_start_index), out.end()); } - - // minimum repeats (n) - for (int j = 1; j < n; j++) { - out.insert(out.end(), begin, end); + if (n == 0) { out.erase(begin, end); } + + std::vector repeat_copy(begin, end); + // special handling for quantified capture groups + if ((n > 1) && (*begin).type == LBRA) { + (*begin).type = LBRA_NC; // change first one to non-capture + // add intermediate groups as non-capture + std::vector ncg_copy(begin, end); + for (int j = 1; j < (n - 1); j++) { + out.insert(out.end(), ncg_copy.begin(), ncg_copy.end()); + } + // add the last entry as a regular capture-group + out.insert(out.end(), repeat_copy.begin(), repeat_copy.end()); + } else { + // minimum repeats (n) + for (int j = 1; j < n; j++) { + out.insert(out.end(), repeat_copy.begin(), repeat_copy.end()); + } } // optional maximum repeats (m) if (m >= 0) { for (int j = n; j < m; j++) { out.emplace_back(LBRA_NC, 0); - out.insert(out.end(), begin, end); + out.insert(out.end(), 
repeat_copy.begin(), repeat_copy.end()); } for (int j = n; j < m; j++) { out.emplace_back(RBRA, 0); @@ -760,8 +771,9 @@ class regex_parser { // infinite repeats if (n > 0) { // append '+' after last repetition out.emplace_back(item.type == COUNTED ? PLUS : PLUS_LAZY, 0); - } else { // copy it once then append '*' - out.insert(out.end(), begin, end); + } else { + // copy it once then append '*' + out.insert(out.end(), repeat_copy.begin(), repeat_copy.end()); out.emplace_back(item.type == COUNTED ? STAR : STAR_LAZY, 0); } } diff --git a/cpp/src/strings/regex/regexec.cpp b/cpp/src/strings/regex/regexec.cpp index d1990733e81..60ad714dfec 100644 --- a/cpp/src/strings/regex/regexec.cpp +++ b/cpp/src/strings/regex/regexec.cpp @@ -24,7 +24,6 @@ #include #include -#include #include #include diff --git a/cpp/src/strings/replace/find_replace.cu b/cpp/src/strings/replace/find_replace.cu index 8a8001dd81a..957075017ba 100644 --- a/cpp/src/strings/replace/find_replace.cu +++ b/cpp/src/strings/replace/find_replace.cu @@ -14,6 +14,7 @@ * limitations under the License. */ #include +#include #include #include #include @@ -21,7 +22,6 @@ #include #include -#include #include #include diff --git a/cpp/src/strings/replace/multi.cu b/cpp/src/strings/replace/multi.cu index 352d883bdc5..88f343926c9 100644 --- a/cpp/src/strings/replace/multi.cu +++ b/cpp/src/strings/replace/multi.cu @@ -334,7 +334,7 @@ std::unique_ptr replace_character_parallel(strings_column_view const& in // Count the number of targets in the entire column. // Note this may over-count in the case where a target spans adjacent strings. - rmm::device_scalar d_count(0, stream); + cudf::detail::device_scalar d_count(0, stream); auto const num_blocks = util::div_rounding_up_safe( util::div_rounding_up_safe(chars_bytes, static_cast(bytes_per_thread)), block_size); count_targets<<>>(fn, chars_bytes, d_count.data()); diff --git a/cpp/src/strings/replace/replace.cu b/cpp/src/strings/replace/replace.cu index 16df0dbabdf..52ddef76c1a 100644 --- a/cpp/src/strings/replace/replace.cu +++ b/cpp/src/strings/replace/replace.cu @@ -285,7 +285,7 @@ std::unique_ptr replace_character_parallel(strings_column_view const& in // Count the number of targets in the entire column. // Note this may over-count in the case where a target spans adjacent strings. - rmm::device_scalar d_target_count(0, stream); + cudf::detail::device_scalar d_target_count(0, stream); constexpr int64_t block_size = 512; constexpr size_type bytes_per_thread = 4; auto const num_blocks = util::div_rounding_up_safe( diff --git a/cpp/src/strings/search/contains_multiple.cu b/cpp/src/strings/search/contains_multiple.cu new file mode 100644 index 00000000000..1183e3e4038 --- /dev/null +++ b/cpp/src/strings/search/contains_multiple.cu @@ -0,0 +1,316 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
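The new contains_multiple.cu that begins below narrows the per-byte search with three arrays: the sorted, de-duplicated first bytes of the targets, run offsets into an index array, and the index array mapping back to target ids; the file's doc comment walks through the same worked example. A host-side analogue of the lookup, with std::lower_bound in place of thrust::lower_bound:

#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
  // d_targets = {"foo", "hello", "world", "hi"}
  std::vector<unsigned char> first_bytes{'f', 'h', 'w'};  // sorted + unique
  std::vector<int> offsets{0, 1, 3};     // start of each byte's run in indices
  std::vector<int> indices{0, 3, 1, 2};  // maps runs back to target ids
  unsigned char const chr = 'h';         // byte observed in the input string
  auto const it = std::lower_bound(first_bytes.begin(), first_bytes.end(), chr);
  if (it != first_bytes.end() && *it == chr) {
    auto const pos = static_cast<int>(it - first_bytes.begin());
    auto const last = pos + 1 < static_cast<int>(offsets.size())
                        ? offsets[pos + 1]
                        : static_cast<int>(indices.size());
    for (int m = offsets[pos]; m < last; ++m) {
      std::printf("candidate target %d\n", indices[m]);  // prints 3 ('hi') then 1 ('hello')
    }
  }
  return 0;
}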
*/ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace cudf { +namespace strings { +namespace detail { +namespace { + +/** + * @brief Threshold to decide on using string or warp parallel functions. + * + * If the average byte length of a string in a column exceeds this value then + * a warp-parallel function is used. + */ +constexpr size_type AVG_CHAR_BYTES_THRESHOLD = 64; + +/** + * @brief Kernel for finding multiple targets in each row of input strings + * + * The d_first_bytes is sorted and unique so the d_indices and d_offsets + * are used to map the corresponding character to its d_targets entry. + * + * Example + * d_targets = ["foo", "hello", "world", "hi"] + * - sorted first-chars: ['f','h','h','w'] + * d_indices = [0, 3, 1, 2] + * d_first_bytes = ['f', 'h', 'w'] (unique) + * d_offsets = [0, 1, 3] + * unique_count = 3 + * + * If 'h' is found, lower_bound produces pos=1 in d_first_bytes. + * This corresponds to d_offsets[1]==1 which has two values: + * - (d_offsets[2] - d_offsets[1]) = (3 - 1) = 2. + * Set map_idx = d_offsets[1] = 1 and the two targets to check are sequential + * in the d_indices array: + * - tgt1_idx = d_indices[map_idx] = 3 --> d_targets[3] == 'hi' + * - tgt2_idx = d_indices[map_idx+1] = 1 --> d_targets[1] == 'hello' + * The logic now only needs to check for either of these 2 targets. + * + * This kernel works in either thread-per-string or warp-per-string depending + * on the template parameter. If tile_size==1, then this kernel executes as + * a thread-per-string. If tile_size==32, then it executes as a warp-per-string. + * No other options are supported for now. + * + * @tparam tile_size Number of threads per string + * @param d_strings Input strings + * @param d_targets Target strings to search within input strings + * @param d_first_bytes Sorted, unique list of first bytes of the target strings + * @param d_indices Indices to map sorted d_first_bytes to d_targets + * @param d_offsets Offsets to map d_indices to d_targets + * @param unique_count Number of unique values in d_first_bytes (and d_offsets) + * @param working_memory Global memory to use if shared-memory is too small + * @param d_results Bool results for each target within each string row + */ +template <int32_t tile_size> +CUDF_KERNEL void multi_contains_kernel(column_device_view const d_strings, + column_device_view const d_targets, + u_char const* d_first_bytes, + size_type const* d_indices, + size_type const* d_offsets, + size_type unique_count, + bool* working_memory, + cudf::device_span<bool*> d_results) +{ + auto const idx = cudf::detail::grid_1d::global_thread_id(); + auto const str_idx = idx / tile_size; + if (str_idx >= d_strings.size()) { return; } + if (d_strings.is_null(str_idx)) { return; } + + // get the string for this tile + auto const d_str = d_strings.element<string_view>(str_idx); + + namespace cg = cooperative_groups; + auto const tile = cg::tiled_partition<tile_size>(cg::this_thread_block()); + auto const lane_idx = tile.thread_rank(); + auto const num_targets = d_targets.size(); + + // size of shared_bools = num_targets * block_size + // each thread uses num_targets bools + extern __shared__ bool shared_bools[]; + // bools for the current string + auto bools = working_memory == nullptr + ?
(shared_bools + (tile.meta_group_rank() * tile_size * num_targets)) + : (working_memory + (str_idx * tile_size * num_targets)); + + // initialize result: set true if target is empty, false otherwise + for (auto target_idx = lane_idx; target_idx < num_targets; target_idx += tile_size) { + auto const d_target = d_targets.element(target_idx); + if constexpr (tile_size == 1) { + d_results[target_idx][str_idx] = d_target.empty(); + } else { + auto const begin = bools + (target_idx * tile_size); + thrust::uninitialized_fill(thrust::seq, begin, begin + tile_size, d_target.empty()); + } + } + tile.sync(); + + auto const last_ptr = d_first_bytes + unique_count; + for (size_type str_byte_idx = lane_idx; str_byte_idx < d_str.size_bytes(); + str_byte_idx += tile_size) { + // search for byte in first_bytes array + auto const sptr = d_str.data() + str_byte_idx; + auto const chr = static_cast(*sptr); + auto const byte_ptr = thrust::lower_bound(thrust::seq, d_first_bytes, last_ptr, chr); + // if not found, continue to next byte + if ((byte_ptr == last_ptr) || (*byte_ptr != chr)) { continue; } + // compute index of matched byte + auto const offset_idx = static_cast(thrust::distance(d_first_bytes, byte_ptr)); + auto map_idx = d_offsets[offset_idx]; + auto const last_idx = (offset_idx + 1) < unique_count ? d_offsets[offset_idx + 1] : num_targets; + // check for targets that begin with chr + while (map_idx < last_idx) { + auto const target_idx = d_indices[map_idx++]; + auto const bool_idx = (target_idx * tile_size) + lane_idx; + auto const found = tile_size == 1 ? d_results[target_idx][str_idx] : bools[bool_idx]; + if (!found) { // not found before + auto const d_target = d_targets.element(target_idx); + if ((d_str.size_bytes() - str_byte_idx) >= d_target.size_bytes()) { + // first char already checked, so just check the [1, end) chars match + auto const tp = d_target.data(); + if (thrust::equal(thrust::seq, tp + 1, tp + d_target.size_bytes(), sptr + 1)) { + if constexpr (tile_size == 1) { + d_results[target_idx][str_idx] = true; + } else { + bools[bool_idx] = true; + } + } + } + } + } + } + + if constexpr (tile_size > 1) { + tile.sync(); + // reduce the bools for each target to store in the result + for (auto target_idx = lane_idx; target_idx < num_targets; target_idx += tile_size) { + auto const begin = bools + (target_idx * tile_size); + d_results[target_idx][str_idx] = + thrust::any_of(thrust::seq, begin, begin + tile_size, thrust::identity{}); + // cooperative_group any() implementation was almost 3x slower than this parallel reduce + } + } +} +} // namespace + +std::unique_ptr
<table> contains_multiple(strings_column_view const& input, + strings_column_view const& targets, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_EXPECTS( + not targets.is_empty(), "Must specify at least one target string.", std::invalid_argument); + CUDF_EXPECTS(not targets.has_nulls(), "Target strings cannot be null", std::invalid_argument); + + auto const d_strings = column_device_view::create(input.parent(), stream); + auto const d_targets = column_device_view::create(targets.parent(), stream); + + // copy the first byte of each target and sort them + auto first_bytes = rmm::device_uvector<u_char>(targets.size(), stream); + auto indices = rmm::device_uvector<size_type>(targets.size(), stream); + { + auto tgt_itr = thrust::make_transform_iterator( + d_targets->begin<string_view>(), + cuda::proclaim_return_type<u_char>([] __device__(auto const& d_tgt) -> u_char { + return d_tgt.empty() ? u_char{0} : static_cast<u_char>(d_tgt.data()[0]); + })); + auto count_itr = thrust::make_counting_iterator(0); + auto keys_out = first_bytes.begin(); + auto vals_out = indices.begin(); + auto num_items = targets.size(); + auto cmp_op = thrust::less<u_char>(); + auto sv = stream.value(); + + std::size_t tmp_bytes = 0; + cub::DeviceMergeSort::SortPairsCopy( + nullptr, tmp_bytes, tgt_itr, count_itr, keys_out, vals_out, num_items, cmp_op, sv); + auto tmp_stg = rmm::device_buffer(tmp_bytes, stream); + cub::DeviceMergeSort::SortPairsCopy( + tmp_stg.data(), tmp_bytes, tgt_itr, count_itr, keys_out, vals_out, num_items, cmp_op, sv); + } + + // remove duplicates to help speed up lower_bound + auto offsets = rmm::device_uvector<size_type>(targets.size(), stream); + thrust::sequence(rmm::exec_policy_nosync(stream), offsets.begin(), offsets.end()); + auto const end = thrust::unique_by_key( + rmm::exec_policy_nosync(stream), first_bytes.begin(), first_bytes.end(), offsets.begin()); + auto const unique_count = + static_cast<size_type>(thrust::distance(first_bytes.begin(), end.first)); + + // create output columns + auto const results_iter = cudf::detail::make_counting_transform_iterator(0, [&](int i) { + return make_numeric_column(data_type{type_id::BOOL8}, + input.size(), + cudf::detail::copy_bitmask(input.parent(), stream, mr), + input.null_count(), + stream, + mr); + }); + auto results = std::vector<std::unique_ptr<column>>(results_iter, results_iter + targets.size()); + auto d_results = [&] { + auto host_results_pointer_iter = + thrust::make_transform_iterator(results.begin(), [](auto const& results_column) { + return results_column->mutable_view().template data<bool>(); + }); + auto host_results_pointers = + std::vector<bool*>(host_results_pointer_iter, host_results_pointer_iter + results.size()); + return cudf::detail::make_device_uvector_async(host_results_pointers, stream, mr); + }(); + + constexpr cudf::thread_index_type block_size = 256; + // calculated (benchmarked) for efficient use of shared-memory + constexpr size_type targets_threshold = 32; + + auto d_first_bytes = first_bytes.data(); + auto d_indices = indices.data(); + auto d_offsets = offsets.data(); + + bool const row_parallel = ((input.null_count() == input.size()) || + ((input.chars_size(stream) / (input.size() - input.null_count())) <= + AVG_CHAR_BYTES_THRESHOLD)); + + if (row_parallel) { + // Smaller strings perform better with a row per string + cudf::detail::grid_1d grid{static_cast<cudf::thread_index_type>(input.size()), block_size}; + multi_contains_kernel<1> + <<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(*d_strings, + *d_targets, + d_first_bytes, + d_indices, + d_offsets, + unique_count, + nullptr, + d_results); + } else { + constexpr cudf::thread_index_type tile_size =
cudf::detail::warp_size; + + auto const shared_mem_size = + (targets.size() <= targets_threshold) ? (block_size * targets.size()) : 0; + auto const work_mem_size = + (targets.size() <= targets_threshold) ? 0 : tile_size * targets.size() * input.size(); + auto working_memory = rmm::device_uvector(work_mem_size, stream); + + cudf::detail::grid_1d grid{static_cast(input.size()) * tile_size, + block_size}; + multi_contains_kernel + <<>>( + *d_strings, + *d_targets, + d_first_bytes, + d_indices, + d_offsets, + unique_count, + working_memory.data(), + d_results); + } + + return std::make_unique
<table>(std::move(results)); +} + +} // namespace detail + +std::unique_ptr<table>
contains_multiple(strings_column_view const& strings, + strings_column_view const& targets, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::contains_multiple(strings, targets, stream, mr); +} + +} // namespace strings +} // namespace cudf diff --git a/cpp/src/strings/search/find_multiple.cu b/cpp/src/strings/search/find_multiple.cu index ec7015878dd..67226b259d4 100644 --- a/cpp/src/strings/search/find_multiple.cu +++ b/cpp/src/strings/search/find_multiple.cu @@ -42,8 +42,9 @@ std::unique_ptr find_multiple(strings_column_view const& input, { auto const strings_count = input.size(); auto const targets_count = targets.size(); - CUDF_EXPECTS(targets_count > 0, "Must include at least one search target"); - CUDF_EXPECTS(!targets.has_nulls(), "Search targets cannot contain null strings"); + CUDF_EXPECTS(targets_count > 0, "Must include at least one search target", std::invalid_argument); + CUDF_EXPECTS( + !targets.has_nulls(), "Search targets cannot contain null strings", std::invalid_argument); auto strings_column = column_device_view::create(input.parent(), stream); auto d_strings = *strings_column; diff --git a/cpp/src/strings/search/findall.cu b/cpp/src/strings/search/findall.cu index 067a513af96..21708e48a25 100644 --- a/cpp/src/strings/search/findall.cu +++ b/cpp/src/strings/search/findall.cu @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -97,8 +98,11 @@ std::unique_ptr findall(strings_column_view const& input, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - auto const strings_count = input.size(); - auto const d_strings = column_device_view::create(input.parent(), stream); + if (input.is_empty()) { + return cudf::lists::detail::make_empty_lists_column(input.parent().type(), stream, mr); + } + + auto const d_strings = column_device_view::create(input.parent(), stream); // create device object from regex_program auto d_prog = regex_device_builder::create_prog_device(prog, stream); @@ -113,7 +117,7 @@ std::unique_ptr findall(strings_column_view const& input, auto strings_output = findall_util(*d_strings, *d_prog, total_matches, d_offsets, stream, mr); // Build the lists column from the offsets and the strings - return make_lists_column(strings_count, + return make_lists_column(input.size(), std::move(offsets), std::move(strings_output), input.null_count(), @@ -122,6 +126,43 @@ std::unique_ptr findall(strings_column_view const& input, mr); } +namespace { +struct find_re_fn { + column_device_view d_strings; + + __device__ size_type operator()(size_type const idx, + reprog_device const prog, + int32_t const thread_idx) const + { + if (d_strings.is_null(idx)) { return 0; } + auto const d_str = d_strings.element(idx); + + auto const result = prog.find(thread_idx, d_str, d_str.begin()); + return result.has_value() ? 
result.value().first : -1; + } +}; +} // namespace + +std::unique_ptr find_re(strings_column_view const& input, + regex_program const& prog, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + auto results = make_numeric_column(data_type{type_to_id()}, + input.size(), + cudf::detail::copy_bitmask(input.parent(), stream, mr), + input.null_count(), + stream, + mr); + if (input.is_empty()) { return results; } + + auto d_results = results->mutable_view().data(); + auto d_prog = regex_device_builder::create_prog_device(prog, stream); + auto const d_strings = column_device_view::create(input.parent(), stream); + launch_transform_kernel(find_re_fn{*d_strings}, *d_prog, d_results, input.size(), stream); + + return results; +} } // namespace detail // external API @@ -135,5 +176,14 @@ std::unique_ptr findall(strings_column_view const& input, return detail::findall(input, prog, stream, mr); } +std::unique_ptr find_re(strings_column_view const& input, + regex_program const& prog, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + return detail::find_re(input, prog, stream, mr); +} + } // namespace strings } // namespace cudf diff --git a/cpp/src/strings/split/split.cuh b/cpp/src/strings/split/split.cuh index 81aca001d53..4b777be9d5b 100644 --- a/cpp/src/strings/split/split.cuh +++ b/cpp/src/strings/split/split.cuh @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -361,7 +362,7 @@ std::pair, rmm::device_uvector> split cudf::detail::offsetalator_factory::make_input_iterator(input.offsets(), input.offset()); // count the number of delimiters in the entire column - rmm::device_scalar d_count(0, stream); + cudf::detail::device_scalar d_count(0, stream); if (chars_bytes > 0) { constexpr int64_t block_size = 512; constexpr size_type bytes_per_thread = 4; diff --git a/cpp/src/strings/strings_column_factories.cu b/cpp/src/strings/strings_column_factories.cu index 07516f91dcf..8e00a29f8e9 100644 --- a/cpp/src/strings/strings_column_factories.cu +++ b/cpp/src/strings/strings_column_factories.cu @@ -16,36 +16,171 @@ #include #include +#include #include +#include +#include #include -#include -#include #include -#include #include #include +#include #include +#include #include #include +#include +#include namespace cudf { +namespace strings::detail { + namespace { -struct string_view_to_pair { - string_view null_placeholder; - string_view_to_pair(string_view n) : null_placeholder(n) {} - __device__ thrust::pair operator()(string_view const& i) - { - return (i.data() == null_placeholder.data()) - ? thrust::pair{nullptr, 0} - : thrust::pair{i.data(), i.size_bytes()}; + +using column_string_pairs = cudf::device_span; + +template +std::pair>, rmm::device_uvector> +make_offsets_child_column_batch_async(std::vector const& input, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + auto const num_columns = input.size(); + std::vector> offsets_columns(num_columns); + rmm::device_uvector chars_sizes(num_columns, stream); + for (std::size_t idx = 0; idx < num_columns; ++idx) { + auto const string_pairs = input[idx]; + auto const string_count = static_cast(string_pairs.size()); + auto offsets = make_numeric_column( + data_type{type_to_id()}, string_count + 1, mask_state::UNALLOCATED, stream, mr); + + auto const offsets_transformer = cuda::proclaim_return_type( + [string_count, string_pairs = string_pairs.data()] __device__(size_type idx) -> size_type { + return idx < string_count ? 
string_pairs[idx].second : size_type{0}; + }); + auto const input_it = cudf::detail::make_counting_transform_iterator(0, offsets_transformer); + auto const d_offsets = offsets->mutable_view().template data(); + auto const output_it = cudf::detail::make_sizes_to_offsets_iterator( + d_offsets, d_offsets + string_count + 1, chars_sizes.data() + idx); + thrust::exclusive_scan(rmm::exec_policy_nosync(stream), + input_it, + input_it + string_count + 1, + output_it, + int64_t{0}); + offsets_columns[idx] = std::move(offsets); } -}; + + return {std::move(offsets_columns), std::move(chars_sizes)}; +} } // namespace +std::vector> make_strings_column_batch( + std::vector const& input, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + auto const num_columns = input.size(); + + auto [offsets_cols, d_chars_sizes] = + make_offsets_child_column_batch_async(input, stream, mr); + + std::vector null_masks; + null_masks.reserve(num_columns); + + rmm::device_uvector d_valid_counts(num_columns, stream, mr); + thrust::uninitialized_fill( + rmm::exec_policy_nosync(stream), d_valid_counts.begin(), d_valid_counts.end(), 0); + + for (std::size_t idx = 0; idx < num_columns; ++idx) { + auto const& string_pairs = input[idx]; + auto const string_count = static_cast(string_pairs.size()); + null_masks.emplace_back( + cudf::create_null_mask(string_count, mask_state::UNINITIALIZED, stream, mr)); + + if (string_count == 0) { continue; } + + constexpr size_type block_size{256}; + auto const grid = + cudf::detail::grid_1d{static_cast(string_count), block_size}; + cudf::detail::valid_if_kernel + <<>>( + reinterpret_cast(null_masks.back().data()), + string_pairs.data(), + string_count, + [] __device__(string_index_pair const pair) -> bool { return pair.first != nullptr; }, + d_valid_counts.data() + idx); + } + + auto const chars_sizes = cudf::detail::make_std_vector_async(d_chars_sizes, stream); + auto const valid_counts = cudf::detail::make_std_vector_async(d_valid_counts, stream); + + // Except for other stream syncs in `CUB` that we cannot control, + // this should be the only stream sync we need in the entire API. + stream.synchronize(); + + auto const threshold = cudf::strings::get_offset64_threshold(); + auto const overflow_count = + std::count_if(chars_sizes.begin(), chars_sizes.end(), [threshold](auto const chars_size) { + return chars_size >= threshold; + }); + CUDF_EXPECTS(cudf::strings::is_large_strings_enabled() || overflow_count == 0, + "Size of output exceeds the column size limit", + std::overflow_error); + + if (overflow_count > 0) { + std::vector long_string_input; + std::vector long_string_col_idx; + long_string_input.reserve(overflow_count); + long_string_col_idx.reserve(overflow_count); + for (std::size_t idx = 0; idx < num_columns; ++idx) { + if (chars_sizes[idx] >= threshold) { + long_string_input.push_back(input[idx]); + long_string_col_idx.push_back(idx); + } + } + + [[maybe_unused]] auto [new_offsets_cols, d_new_chars_sizes] = + make_offsets_child_column_batch_async(long_string_input, stream, mr); + + // Update the new offsets columns. + // The new chars sizes should be the same as before, thus we don't need to update them. 
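make_offsets_child_column_batch_async above converts per-string byte sizes into an offsets column with an exclusive scan whose final slot carries the total chars size (captured per column into chars_sizes). The same shape on the host, using a zero sentinel so the scan emits the grand total as the last offset:

#include <cstdio>
#include <numeric>
#include <vector>

int main() {
  std::vector<long long> sizes{3, 0, 5, 2};  // bytes per string
  std::vector<long long> offsets(sizes.size() + 1);
  sizes.push_back(0);  // sentinel, mirrors the transformer returning 0 past the last row
  std::exclusive_scan(sizes.begin(), sizes.end(), offsets.begin(), 0LL);
  for (auto o : offsets) { std::printf("%lld ", o); }  // 0 3 3 8 10
  std::printf("\ntotal chars: %lld\n", offsets.back());
  return 0;
}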
+ for (std::size_t idx = 0; idx < long_string_col_idx.size(); ++idx) { + offsets_cols[long_string_col_idx[idx]] = std::move(new_offsets_cols[idx]); + } + } + + std::vector> output(num_columns); + for (std::size_t idx = 0; idx < num_columns; ++idx) { + auto const strings_count = static_cast(input[idx].size()); + if (strings_count == 0) { + output[idx] = make_empty_column(type_id::STRING); + continue; + } + + auto const chars_size = chars_sizes[idx]; + auto const valid_count = valid_counts[idx]; + + auto chars_data = make_chars_buffer( + offsets_cols[idx]->view(), chars_size, input[idx].data(), strings_count, stream, mr); + + auto const null_count = strings_count - valid_count; + output[idx] = make_strings_column( + strings_count, + std::move(offsets_cols[idx]), + chars_data.release(), + null_count, + null_count ? std::move(null_masks[idx]) : rmm::device_buffer{0, stream, mr}); + } + + return output; +} + +} // namespace strings::detail + // Create a strings-type column from vector of pointer/size pairs std::unique_ptr make_strings_column( device_span const> strings, @@ -53,10 +188,32 @@ std::unique_ptr make_strings_column( rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return cudf::strings::detail::make_strings_column(strings.begin(), strings.end(), stream, mr); } +std::vector> make_strings_column_batch( + std::vector const>> const& input, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + return cudf::strings::detail::make_strings_column_batch(input, stream, mr); +} + +namespace { +struct string_view_to_pair { + string_view null_placeholder; + string_view_to_pair(string_view n) : null_placeholder(n) {} + __device__ thrust::pair operator()(string_view const& i) + { + return (i.data() == null_placeholder.data()) + ? thrust::pair{nullptr, 0} + : thrust::pair{i.data(), i.size_bytes()}; + } +}; + +} // namespace + std::unique_ptr make_strings_column(device_span string_views, string_view null_placeholder, rmm::cuda_stream_view stream, diff --git a/cpp/src/strings/strings_scalar_factories.cpp b/cpp/src/strings/strings_scalar_factories.cpp index 219d1174d42..1cc405234b2 100644 --- a/cpp/src/strings/strings_scalar_factories.cpp +++ b/cpp/src/strings/strings_scalar_factories.cpp @@ -16,7 +16,6 @@ #include #include -#include #include diff --git a/cpp/src/structs/structs_column_view.cpp b/cpp/src/structs/structs_column_view.cpp index b0284e9cb96..e14142a9ad1 100644 --- a/cpp/src/structs/structs_column_view.cpp +++ b/cpp/src/structs/structs_column_view.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,7 +14,6 @@ * limitations under the License. 
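The batch factory above builds each null mask with valid_if_kernel: bit i is set when pair i carries a non-null data pointer, valid bits are tallied per column, and null_count = strings_count - valid_count. A host sketch of that predicate-to-bitmask step (layout simplified to one 32-bit word per 32 rows):

#include <cstdint>
#include <cstdio>
#include <utility>
#include <vector>

int main() {
  // string_index_pair stand-in: (pointer, size); a null pointer marks a null row
  std::vector<std::pair<char const*, int>> pairs{{"ab", 2}, {nullptr, 0}, {"c", 1}};
  std::vector<uint32_t> mask((pairs.size() + 31) / 32, 0);
  size_t valid_count = 0;
  for (size_t i = 0; i < pairs.size(); ++i) {
    if (pairs[i].first != nullptr) {      // the predicate used by the kernel
      mask[i / 32] |= (1u << (i % 32));   // set the valid bit for row i
      ++valid_count;
    }
  }
  std::printf("null_count = %zu\n", pairs.size() - valid_count);  // 1
  return 0;
}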
*/ -#include #include #include #include diff --git a/cpp/src/structs/utilities.cpp b/cpp/src/structs/utilities.cpp index 5df9943303d..4012ee3d21c 100644 --- a/cpp/src/structs/utilities.cpp +++ b/cpp/src/structs/utilities.cpp @@ -21,13 +21,10 @@ #include #include #include -#include #include #include #include -#include #include -#include #include #include diff --git a/cpp/src/table/table.cpp b/cpp/src/table/table.cpp index cb707c94288..41c64c6decb 100644 --- a/cpp/src/table/table.cpp +++ b/cpp/src/table/table.cpp @@ -18,7 +18,6 @@ #include #include #include -#include #include diff --git a/cpp/src/table/table_view.cpp b/cpp/src/table/table_view.cpp index 8a5340dc20d..659beb749af 100644 --- a/cpp/src/table/table_view.cpp +++ b/cpp/src/table/table_view.cpp @@ -20,10 +20,7 @@ #include #include -#include - #include -#include #include namespace cudf { diff --git a/cpp/src/text/generate_ngrams.cu b/cpp/src/text/generate_ngrams.cu index a87ecb81b9d..997b0278fe2 100644 --- a/cpp/src/text/generate_ngrams.cu +++ b/cpp/src/text/generate_ngrams.cu @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -48,6 +49,9 @@ namespace nvtext { namespace detail { namespace { +// long strings threshold found with benchmarking +constexpr cudf::size_type AVG_CHAR_BYTES_THRESHOLD = 64; + /** * @brief Generate ngrams from strings column. * @@ -173,33 +177,39 @@ constexpr cudf::thread_index_type bytes_per_thread = 4; /** * @brief Counts the number of ngrams in each row of the given strings column * - * Each warp processes a single string. + * Each warp/thread processes a single string. * Formula is `count = max(0,str.length() - ngrams + 1)` * If a string has less than ngrams characters, its count is 0. */ CUDF_KERNEL void count_char_ngrams_kernel(cudf::column_device_view const d_strings, cudf::size_type ngrams, + cudf::size_type tile_size, cudf::size_type* d_counts) { auto const idx = cudf::detail::grid_1d::global_thread_id(); - auto const str_idx = idx / cudf::detail::warp_size; + auto const str_idx = idx / tile_size; if (str_idx >= d_strings.size()) { return; } if (d_strings.is_null(str_idx)) { d_counts[str_idx] = 0; return; } + auto const d_str = d_strings.element(str_idx); + if (tile_size == 1) { + d_counts[str_idx] = cuda::std::max(0, (d_str.length() + 1 - ngrams)); + return; + } + namespace cg = cooperative_groups; auto const warp = cg::tiled_partition(cg::this_thread_block()); - auto const d_str = d_strings.element(str_idx); - auto const end = d_str.data() + d_str.size_bytes(); + auto const end = d_str.data() + d_str.size_bytes(); auto const lane_idx = warp.thread_rank(); cudf::size_type count = 0; for (auto itr = d_str.data() + (lane_idx * bytes_per_thread); itr < end; - itr += cudf::detail::warp_size * bytes_per_thread) { + itr += tile_size * bytes_per_thread) { for (auto s = itr; (s < (itr + bytes_per_thread)) && (s < end); ++s) { count += static_cast(cudf::strings::detail::is_begin_utf8_char(*s)); } @@ -256,19 +266,27 @@ std::unique_ptr generate_character_ngrams(cudf::strings_column_vie "Parameter ngrams should be an integer value of 2 or greater", std::invalid_argument); - auto const strings_count = input.size(); - if (strings_count == 0) { // if no strings, return an empty column - return cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}); + if (input.is_empty()) { // if no strings, return an empty column + return cudf::lists::detail::make_empty_lists_column( + cudf::data_type{cudf::type_id::STRING}, stream, mr); + } + if (input.size() == input.null_count()) { + 
return cudf::lists::detail::make_all_nulls_lists_column( + input.size(), cudf::data_type{cudf::type_id::STRING}, stream, mr); } auto const d_strings = cudf::column_device_view::create(input.parent(), stream); auto [offsets, total_ngrams] = [&] { - auto counts = rmm::device_uvector(input.size(), stream); - auto const num_blocks = cudf::util::div_rounding_up_safe( - static_cast(input.size()) * cudf::detail::warp_size, block_size); - count_char_ngrams_kernel<<>>( - *d_strings, ngrams, counts.data()); + auto counts = rmm::device_uvector(input.size(), stream); + auto const avg_char_bytes = (input.chars_size(stream) / (input.size() - input.null_count())); + auto const tile_size = (avg_char_bytes < AVG_CHAR_BYTES_THRESHOLD) + ? 1 // thread per row + : cudf::detail::warp_size; // warp per row + auto const grid = cudf::detail::grid_1d( + static_cast(input.size()) * tile_size, block_size); + count_char_ngrams_kernel<<>>( + *d_strings, ngrams, tile_size, counts.data()); return cudf::detail::make_offsets_child_column(counts.begin(), counts.end(), stream, mr); }(); auto d_offsets = offsets->view().data(); @@ -277,8 +295,8 @@ std::unique_ptr generate_character_ngrams(cudf::strings_column_vie "Insufficient number of characters in each string to generate ngrams"); character_ngram_generator_fn generator{*d_strings, ngrams, d_offsets}; - auto [offsets_column, chars] = cudf::strings::detail::make_strings_children( - generator, strings_count, total_ngrams, stream, mr); + auto [offsets_column, chars] = + cudf::strings::detail::make_strings_children(generator, input.size(), total_ngrams, stream, mr); auto output = cudf::make_strings_column( total_ngrams, std::move(offsets_column), chars.release(), 0, rmm::device_buffer{}); @@ -368,7 +386,7 @@ std::unique_ptr hash_character_ngrams(cudf::strings_column_view co auto [offsets, total_ngrams] = [&] { auto counts = rmm::device_uvector(input.size(), stream); count_char_ngrams_kernel<<>>( - *d_strings, ngrams, counts.data()); + *d_strings, ngrams, cudf::detail::warp_size, counts.data()); return cudf::detail::make_offsets_child_column(counts.begin(), counts.end(), stream, mr); }(); auto d_offsets = offsets->view().data(); diff --git a/cpp/src/text/minhash.cu b/cpp/src/text/minhash.cu index a03a34f5fa7..aee83ab35ed 100644 --- a/cpp/src/text/minhash.cu +++ b/cpp/src/text/minhash.cu @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -37,9 +38,13 @@ #include #include +#include #include +#include #include #include +#include +#include #include @@ -162,6 +167,339 @@ std::unique_ptr minhash_fn(cudf::strings_column_view const& input, return hashes; } +constexpr cudf::thread_index_type block_size = 256; +// for potentially tuning minhash_seed_kernel independently from block_size +constexpr cudf::thread_index_type tile_size = block_size; + +// Number of a/b parameter values to process per thread. +// The intermediate values are stored in shared-memory and therefore limits this count. +// This value was found to be an efficient size for both uint32 and uint64 +// hash types based on benchmarks. +constexpr cuda::std::size_t params_per_thread = 16; + +// Separate kernels are used to process strings above and below this value (in bytes). +constexpr cudf::size_type wide_string_threshold = 1 << 18; // 256K +// The number of blocks per string for the above-threshold kernel processing. 
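The minhash kernels that follow apply a datatrove-style permutation to every seed hash, ((hv * a + b) mod (2^61 - 1)) masked to the hash width, and keep a running minimum per (a, b) parameter pair. The arithmetic in isolation (host C++; the hash and parameter values here are arbitrary placeholders):

#include <cstdint>
#include <cstdio>
#include <limits>

int main() {
  constexpr uint64_t mersenne_prime = (1UL << 61) - 1;
  constexpr uint32_t hash_max = std::numeric_limits<uint32_t>::max();
  uint32_t const hv = 0x9e3779b9u;  // one seed hash (zero is reserved as a skip sentinel)
  uint64_t const a = 0x5bd1e995u;   // one entry from parameter_a
  uint64_t const b = 0xc2b2ae35u;   // matching entry from parameter_b
  // permute in 64-bit arithmetic, then mask back down to the hash width
  uint32_t const permuted = ((hv * a + b) % mersenne_prime) & hash_max;
  std::printf("permuted hash: %u\n", permuted);
  return 0;
}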
+constexpr cudf::size_type blocks_per_string = 64; +// The above values were determined using the redpajama and books_sample datasets + +/** + * @brief Hashing kernel launched as a thread per tile-size (block or warp) + * + * This kernel computes the hashes for each string using the seed and the specified + * hash function. The width is used to compute rolling substrings to hash over. + * The hashes are stored in d_hashes to be used in the minhash_permuted_kernel. + * + * This kernel also counts the number of strings above the wide_string_threshold + * and proactively initializes the output values for those strings. + * + * @tparam HashFunction The hash function to use for this kernel + * @tparam hash_value_type Derived from HashFunction result_type + * + * @param d_strings The input strings to hash + * @param seed The seed used for the hash function + * @param width Width in characters used for determining substrings to hash + * @param d_hashes The resulting hash values are stored here + * @param threshold_count Stores the number of strings above wide_string_threshold + * @param param_count Number of parameters (used for the proactive initialize) + * @param d_results Final results vector (used for the proactive initialize) + */ +template +CUDF_KERNEL void minhash_seed_kernel(cudf::column_device_view const d_strings, + hash_value_type seed, + cudf::size_type width, + hash_value_type* d_hashes, + cudf::size_type* threshold_count, + cudf::size_type param_count, + hash_value_type* d_results) +{ + auto const tid = cudf::detail::grid_1d::global_thread_id(); + auto const str_idx = tid / tile_size; + if (str_idx >= d_strings.size()) { return; } + if (d_strings.is_null(str_idx)) { return; } + + // retrieve this string's offset to locate the output position in d_hashes + auto const offsets = d_strings.child(cudf::strings_column_view::offsets_column_index); + auto const offsets_itr = + cudf::detail::input_offsetalator(offsets.head(), offsets.type(), d_strings.offset()); + auto const offset = offsets_itr[str_idx]; + auto const size_bytes = static_cast(offsets_itr[str_idx + 1] - offset); + if (size_bytes == 0) { return; } + + auto const d_str = cudf::string_view(d_strings.head() + offset, size_bytes); + auto const lane_idx = tid % tile_size; + + // hashes for this string/thread are stored here + auto seed_hashes = d_hashes + offset - offsets_itr[0] + lane_idx; + + auto const begin = d_str.data() + lane_idx; + auto const end = d_str.data() + d_str.size_bytes(); + auto const hasher = HashFunction(seed); + + for (auto itr = begin; itr < end; itr += tile_size, seed_hashes += tile_size) { + if (cudf::strings::detail::is_utf8_continuation_char(*itr)) { + *seed_hashes = 0; + continue; + } + auto const check_str = // used for counting 'width' characters + cudf::string_view(itr, static_cast(thrust::distance(itr, end))); + auto const [bytes, left] = cudf::strings::detail::bytes_to_character_position(check_str, width); + if ((itr != d_str.data()) && (left > 0)) { + // true itr+width is past the end of the string + *seed_hashes = 0; + continue; + } + + auto const hash_str = cudf::string_view(itr, bytes); + hash_value_type hv; + if constexpr (std::is_same_v) { + hv = hasher(hash_str); + } else { + hv = thrust::get<0>(hasher(hash_str)); + } + // disallowing hash to zero case + *seed_hashes = cuda::std::max(hv, hash_value_type{1}); + } + + // logic appended here so an extra kernel is not required + if (size_bytes >= wide_string_threshold) { + if (lane_idx == 0) { + // count the number of wide strings + 
+
+/**
+ * @brief Permutation calculation kernel
+ *
+ * This kernel uses the hashes from the minhash_seed_kernel and the parameter_a and
+ * parameter_b values to compute the final output results.
+ * The output is the number of input rows (N) by the number of parameter values (M).
+ * Each output row i holds the M results calculated from parameter_a/b[0:M].
+ *
+ * This kernel is launched with one block per string for strings below the
+ * wide_string_threshold, and with blocks_per_string blocks per string for
+ * strings above that threshold.
+ *
+ * @tparam hash_value_type Derived from HashFunction result_type
+ * @tparam blocks_per_string Number of blocks used to process each string
+ *
+ * @param d_strings The input strings to hash
+ * @param indices The indices of the strings in d_strings to process
+ * @param parameter_a 1st set of parameters for the calculation result
+ * @param parameter_b 2nd set of parameters for the calculation result
+ * @param width Used for calculating the number of available hashes in each string
+ * @param d_hashes The hash values computed in minhash_seed_kernel
+ * @param d_results Final results vector of calculated values
+ */
+template <typename hash_value_type, int blocks_per_string>
+CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_strings,
+                                         cudf::device_span<cudf::size_type const> indices,
+                                         cudf::device_span<hash_value_type const> parameter_a,
+                                         cudf::device_span<hash_value_type const> parameter_b,
+                                         cudf::size_type width,
+                                         hash_value_type const* d_hashes,
+                                         hash_value_type* d_results)
+{
+  auto const tid = cudf::detail::grid_1d::global_thread_id();
+  auto const idx = (tid / blocks_per_string) / block_size;
+  if (idx >= indices.size()) { return; }
+  auto const str_idx = indices[idx];
+  if (d_strings.is_null(str_idx)) { return; }
+
+  auto const block      = cooperative_groups::this_thread_block();
+  int const section_idx = block.group_index().x % blocks_per_string;
+
+  auto const offsets = d_strings.child(cudf::strings_column_view::offsets_column_index);
+  auto const offsets_itr =
+    cudf::detail::input_offsetalator(offsets.head(), offsets.type(), d_strings.offset());
+  auto const offset     = offsets_itr[str_idx];
+  auto const size_bytes = static_cast<cudf::size_type>(offsets_itr[str_idx + 1] - offset);
+
+  // number of items to process in this block;
+  // the last block also includes any remainder values from the
+  // size_bytes/blocks_per_string truncation
+  // example:
+  //  each section_size for a string with size 588090 and blocks_per_string=64 is 9188,
+  //  except the last section which is 9188 + (588090 % 64) = 9246
+  auto const section_size =
+    (size_bytes / blocks_per_string) +
+    (section_idx < (blocks_per_string - 1) ? 0 : size_bytes % blocks_per_string);
+  auto const section_offset = section_idx * (size_bytes / blocks_per_string);
+
+  // hash values for this block/section
+  auto const seed_hashes = d_hashes + offset - offsets_itr[0] + section_offset;
+  // width used here as a max value since a string's char-count <= byte-count
+  auto const hashes_size =
+    section_idx < (blocks_per_string - 1)
+      ? section_size
+      : cuda::std::max(static_cast<cudf::size_type>(size_bytes > 0), section_size - width + 1);
+
+  auto const init     = size_bytes == 0 ? 0 : std::numeric_limits<hash_value_type>::max();
+  auto const lane_idx = block.thread_rank();
+  auto const d_output = d_results + (str_idx * parameter_a.size());
+
+  auto const begin = seed_hashes + lane_idx;
+  auto const end   = seed_hashes + hashes_size;
+
+  // constants used in the permutation calculations
+  constexpr uint64_t mersenne_prime  = (1UL << 61) - 1;
+  constexpr hash_value_type hash_max = std::numeric_limits<hash_value_type>::max();
+
+  // found to be an efficient shared memory size for both hash types
+  __shared__ hash_value_type block_values[block_size * params_per_thread];
+
+  for (std::size_t i = 0; i < parameter_a.size(); i += params_per_thread) {
+    // initialize this block's chunk of shared memory;
+    // each thread handles params_per_thread of the values
+    auto const chunk_values = block_values + (lane_idx * params_per_thread);
+    thrust::uninitialized_fill(thrust::seq, chunk_values, chunk_values + params_per_thread, init);
+    block.sync();
+
+    auto const param_count =
+      cuda::std::min(static_cast<std::size_t>(params_per_thread), parameter_a.size() - i);
+
+    // each lane accumulates min hashes in its shared memory
+    for (auto itr = begin; itr < end; itr += block_size) {
+      auto const hv = *itr;
+      // 0 is used as a skip sentinel for UTF-8 continuation bytes and trailing bytes
+      if (hv == 0) { continue; }
+
+      for (std::size_t param_idx = i; param_idx < (i + param_count); ++param_idx) {
+        // permutation formula used by datatrove
+        hash_value_type const v =
+          ((hv * parameter_a[param_idx] + parameter_b[param_idx]) % mersenne_prime) & hash_max;
+        auto const block_idx    = ((param_idx % params_per_thread) * block_size) + lane_idx;
+        block_values[block_idx] = cuda::std::min(v, block_values[block_idx]);
+      }
+    }
+    block.sync();
+
+    // reduce each parameter's vector of values to a single min value;
+    // assumes that block_size > params_per_thread;
+    // each thread reduces a block_size of parameter values (thread per parameter)
+    if (lane_idx < param_count) {
+      auto const values = block_values + (lane_idx * block_size);
+      // cooperative groups does not have a min function and cub::BlockReduce was slower
+      auto const minv = thrust::reduce(
+        thrust::seq, values, values + block_size, init, thrust::minimum<hash_value_type>{});
+      if constexpr (blocks_per_string > 1) {
+        // accumulates mins for each block into d_output
+        cuda::atomic_ref<hash_value_type, cuda::thread_scope_device> ref{d_output[lane_idx + i]};
+        ref.fetch_min(minv, cuda::std::memory_order_relaxed);
+      } else {
+        d_output[lane_idx + i] = minv;
+      }
+    }
+    block.sync();
+  }
+}
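The inner loop above evaluates ((h * a + b) mod M61) masked to the hash width and keeps the running minimum per (a, b) pair. A sequential host-side sketch of that reduction for the 32-bit case (illustrative only; the kernel accumulates these minimums in shared memory across a thread block):

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <limits>
#include <vector>

std::vector<uint32_t> signature(std::vector<uint32_t> const& substring_hashes,
                                std::vector<uint32_t> const& a,
                                std::vector<uint32_t> const& b)
{
  constexpr uint64_t mersenne_prime = (1UL << 61) - 1;
  std::vector<uint32_t> out(a.size(), std::numeric_limits<uint32_t>::max());
  for (auto hv : substring_hashes) {
    if (hv == 0) { continue; }  // skip sentinel slots
    for (std::size_t p = 0; p < a.size(); ++p) {
      // the kernel's "& hash_max" step is the uint32_t truncation here
      auto const v =
        static_cast<uint32_t>((static_cast<uint64_t>(hv) * a[p] + b[p]) % mersenne_prime);
      out[p] = std::min(out[p], v);
    }
  }
  return out;
}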
+
+template <typename HashFunction, typename hash_value_type>
+std::unique_ptr<cudf::column> minhash_fn(cudf::strings_column_view const& input,
+                                         hash_value_type seed,
+                                         cudf::device_span<hash_value_type const> parameter_a,
+                                         cudf::device_span<hash_value_type const> parameter_b,
+                                         cudf::size_type width,
+                                         rmm::cuda_stream_view stream,
+                                         rmm::device_async_resource_ref mr)
+{
+  CUDF_EXPECTS(width >= 2,
+               "Parameter width should be an integer value of 2 or greater",
+               std::invalid_argument);
+  CUDF_EXPECTS(!parameter_a.empty(), "Parameters A and B cannot be empty", std::invalid_argument);
+  CUDF_EXPECTS(parameter_a.size() == parameter_b.size(),
+               "Parameters A and B should have the same number of elements",
+               std::invalid_argument);
+  CUDF_EXPECTS(
+    (static_cast<std::size_t>(input.size()) * parameter_a.size()) <
+      static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max()),
+    "The number of parameters times the number of input rows exceeds the column size limit",
+    std::overflow_error);
+
+  auto const output_type = cudf::data_type{cudf::type_to_id<hash_value_type>()};
+  if (input.is_empty()) { return cudf::make_empty_column(output_type); }
+
+  auto const d_strings = cudf::column_device_view::create(input.parent(), stream);
+
+  auto results =
+    cudf::make_numeric_column(output_type,
+                              input.size() * static_cast<cudf::size_type>(parameter_a.size()),
+                              cudf::mask_state::UNALLOCATED,
+                              stream,
+                              mr);
+  auto d_results = results->mutable_view().data<hash_value_type>();
+
+  cudf::detail::grid_1d grid{static_cast<cudf::thread_index_type>(input.size()) * block_size,
+                             block_size};
+  auto const hashes_size = input.chars_size(stream);
+  auto d_hashes          = rmm::device_uvector<hash_value_type>(hashes_size, stream);
+  auto d_threshold_count = cudf::detail::device_scalar<cudf::size_type>(0, stream);
+
+  minhash_seed_kernel<HashFunction>
+    <<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(*d_strings,
+                                                                         seed,
+                                                                         width,
+                                                                         d_hashes.data(),
+                                                                         d_threshold_count.data(),
+                                                                         parameter_a.size(),
+                                                                         d_results);
+  auto const threshold_count = d_threshold_count.value(stream);
+
+  auto indices = rmm::device_uvector<cudf::size_type>(input.size(), stream);
+  thrust::sequence(rmm::exec_policy(stream), indices.begin(), indices.end());
+  cudf::size_type threshold_index = threshold_count < input.size() ? input.size() : 0;
+
+  // if we counted a split of above/below threshold then
+  // compute partitions based on the size of each string
+  if ((threshold_count > 0) && (threshold_count < input.size())) {
+    auto sizes = rmm::device_uvector<cudf::size_type>(input.size(), stream);
+    thrust::transform(rmm::exec_policy_nosync(stream),
+                      thrust::counting_iterator<cudf::size_type>(0),
+                      thrust::counting_iterator<cudf::size_type>(input.size()),
+                      sizes.data(),
+                      cuda::proclaim_return_type<cudf::size_type>(
+                        [d_strings = *d_strings] __device__(auto idx) -> cudf::size_type {
+                          if (d_strings.is_null(idx)) { return 0; }
+                          return d_strings.element<cudf::string_view>(idx).size_bytes();
+                        }));
+    thrust::sort_by_key(
+      rmm::exec_policy_nosync(stream), sizes.begin(), sizes.end(), indices.begin());
+    auto const lb = thrust::lower_bound(
+      rmm::exec_policy_nosync(stream), sizes.begin(), sizes.end(), wide_string_threshold);
+    threshold_index = static_cast<cudf::size_type>(thrust::distance(sizes.begin(), lb));
+  }
+
+  // handle the strings below the threshold width
+  if (threshold_index > 0) {
+    auto d_indices = cudf::device_span<cudf::size_type const>(indices.data(), threshold_index);
+    cudf::detail::grid_1d grid{
+      static_cast<cudf::thread_index_type>(d_indices.size()) * block_size, block_size};
+    minhash_permuted_kernel<hash_value_type, 1>
+      <<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(
+        *d_strings, d_indices, parameter_a, parameter_b, width, d_hashes.data(), d_results);
+  }
+
+  // handle the strings above the threshold width
+  if (threshold_index < input.size()) {
+    auto const count = static_cast<cudf::thread_index_type>(input.size() - threshold_index);
+    auto d_indices =
+      cudf::device_span<cudf::size_type const>(indices.data() + threshold_index, count);
+    cudf::detail::grid_1d grid{count * block_size * blocks_per_string, block_size};
+    minhash_permuted_kernel<hash_value_type, blocks_per_string>
+      <<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(
+        *d_strings, d_indices, parameter_a, parameter_b, width, d_hashes.data(), d_results);
+  }
+
+  return results;
+}
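minhash_fn sorts row indices by string size and splits them at wide_string_threshold so each group gets a differently shaped kernel launch. The same partitioning on the host, with std::stable_sort and std::lower_bound standing in for the thrust calls:

#include <algorithm>
#include <cstdint>
#include <iterator>
#include <numeric>
#include <utility>
#include <vector>

// Returns indices ordered by size plus the split point: [0, split) are the
// below-threshold rows, [split, end) are the wide rows.
std::pair<std::vector<int32_t>, std::size_t> partition_by_size(
  std::vector<int32_t> const& sizes, int32_t threshold)
{
  std::vector<int32_t> indices(sizes.size());
  std::iota(indices.begin(), indices.end(), 0);
  std::stable_sort(indices.begin(), indices.end(), [&](int32_t lhs, int32_t rhs) {
    return sizes[lhs] < sizes[rhs];
  });
  auto const split = std::distance(
    indices.begin(),
    std::lower_bound(indices.begin(), indices.end(), threshold,
                     [&](int32_t idx, int32_t value) { return sizes[idx] < value; }));
  return {std::move(indices), static_cast<std::size_t>(split)};
}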
+
 /**
  * @brief Compute the minhash of each list row of strings for each seed
  *
@@ -309,6 +647,20 @@ std::unique_ptr<cudf::column> minhash(cudf::strings_column_view const& input,
   return build_list_result(input.parent(), std::move(hashes), seeds.size(), stream, mr);
 }
 
+std::unique_ptr<cudf::column> minhash(cudf::strings_column_view const& input,
+                                      uint32_t seed,
+                                      cudf::device_span<uint32_t const> parameter_a,
+                                      cudf::device_span<uint32_t const> parameter_b,
+                                      cudf::size_type width,
+                                      rmm::cuda_stream_view stream,
+                                      rmm::device_async_resource_ref mr)
+{
+  using HashFunction = cudf::hashing::detail::MurmurHash3_x86_32;
+  auto hashes =
+    detail::minhash_fn<HashFunction>(input, seed, parameter_a, parameter_b, width, stream, mr);
+  return build_list_result(input.parent(), std::move(hashes), parameter_a.size(), stream, mr);
+}
+
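Unlike the seeds-based overloads, these new entry points take caller-supplied (a, b) parameter vectors. A hedged sketch of how a caller might generate them on the host before copying to device spans; the RNG choice and value ranges here are assumptions, only the span element types come from this diff:

#include <cstddef>
#include <cstdint>
#include <limits>
#include <random>
#include <utility>
#include <vector>

std::pair<std::vector<uint32_t>, std::vector<uint32_t>> make_parameters(std::size_t count,
                                                                        uint64_t rng_seed)
{
  std::mt19937_64 rng{rng_seed};
  std::uniform_int_distribution<uint32_t> dist{1, std::numeric_limits<uint32_t>::max()};
  std::vector<uint32_t> a(count);
  std::vector<uint32_t> b(count);
  for (std::size_t i = 0; i < count; ++i) {
    a[i] = dist(rng);  // a non-zero multiplier keeps the permutation valid
    b[i] = dist(rng);
  }
  return {a, b};
}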
 std::unique_ptr<cudf::column> minhash64(cudf::strings_column_view const& input,
                                         cudf::numeric_scalar<uint64_t> const& seed,
                                         cudf::size_type width,
@@ -333,6 +685,20 @@ std::unique_ptr<cudf::column> minhash64(cudf::strings_column_view const& input,
   return build_list_result(input.parent(), std::move(hashes), seeds.size(), stream, mr);
 }
 
+std::unique_ptr<cudf::column> minhash64(cudf::strings_column_view const& input,
+                                        uint64_t seed,
+                                        cudf::device_span<uint64_t const> parameter_a,
+                                        cudf::device_span<uint64_t const> parameter_b,
+                                        cudf::size_type width,
+                                        rmm::cuda_stream_view stream,
+                                        rmm::device_async_resource_ref mr)
+{
+  using HashFunction = cudf::hashing::detail::MurmurHash3_x64_128;
+  auto hashes =
+    detail::minhash_fn<HashFunction>(input, seed, parameter_a, parameter_b, width, stream, mr);
+  return build_list_result(input.parent(), std::move(hashes), parameter_a.size(), stream, mr);
+}
+
 std::unique_ptr<cudf::column> word_minhash(cudf::lists_column_view const& input,
                                            cudf::device_span<uint32_t const> seeds,
                                            rmm::cuda_stream_view stream,
@@ -374,6 +740,18 @@ std::unique_ptr<cudf::column> minhash(cudf::strings_column_view const& input,
   return detail::minhash(input, seeds, width, stream, mr);
 }
 
+std::unique_ptr<cudf::column> minhash_permuted(cudf::strings_column_view const& input,
+                                               uint32_t seed,
+                                               cudf::device_span<uint32_t const> parameter_a,
+                                               cudf::device_span<uint32_t const> parameter_b,
+                                               cudf::size_type width,
+                                               rmm::cuda_stream_view stream,
+                                               rmm::device_async_resource_ref mr)
+{
+  CUDF_FUNC_RANGE();
+  return detail::minhash(input, seed, parameter_a, parameter_b, width, stream, mr);
+}
+
 std::unique_ptr<cudf::column> minhash64(cudf::strings_column_view const& input,
                                         cudf::numeric_scalar<uint64_t> seed,
                                         cudf::size_type width,
@@ -394,6 +772,18 @@ std::unique_ptr<cudf::column> minhash64(cudf::strings_column_view const& input,
   return detail::minhash64(input, seeds, width, stream, mr);
 }
 
+std::unique_ptr<cudf::column> minhash64_permuted(cudf::strings_column_view const& input,
+                                                 uint64_t seed,
+                                                 cudf::device_span<uint64_t const> parameter_a,
+                                                 cudf::device_span<uint64_t const> parameter_b,
+                                                 cudf::size_type width,
+                                                 rmm::cuda_stream_view stream,
+                                                 rmm::device_async_resource_ref mr)
+{
+  CUDF_FUNC_RANGE();
+  return detail::minhash64(input, seed, parameter_a, parameter_b, width, stream, mr);
+}
+
 std::unique_ptr<cudf::column> word_minhash(cudf::lists_column_view const& input,
                                            cudf::device_span<uint32_t const> seeds,
                                            rmm::cuda_stream_view stream,
diff --git a/cpp/src/text/subword/load_hash_file.cu b/cpp/src/text/subword/load_hash_file.cu
index eca703e2604..b13ad0a7de8 100644
--- a/cpp/src/text/subword/load_hash_file.cu
+++ b/cpp/src/text/subword/load_hash_file.cu
@@ -289,10 +289,12 @@ std::unique_ptr<hashed_vocabulary> load_vocabulary_file(
 }  // namespace detail
 
 std::unique_ptr<hashed_vocabulary> load_vocabulary_file(
-  std::string const& filename_hashed_vocabulary, rmm::device_async_resource_ref mr)
+  std::string const& filename_hashed_vocabulary,
+  rmm::cuda_stream_view stream,
+  rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::load_vocabulary_file(filename_hashed_vocabulary, cudf::get_default_stream(), mr);
+  return detail::load_vocabulary_file(filename_hashed_vocabulary, stream, mr);
 }
 
 }  // namespace nvtext
diff --git a/cpp/src/text/subword/subword_tokenize.cu b/cpp/src/text/subword/subword_tokenize.cu
index d7e04a0c208..dee589d6daf 100644
--- a/cpp/src/text/subword/subword_tokenize.cu
+++ b/cpp/src/text/subword/subword_tokenize.cu
@@ -293,17 +293,12 @@ tokenizer_result subword_tokenize(cudf::strings_column_view const& strings,
                                   uint32_t stride,
                                   bool do_lower_case,
                                   bool do_truncate,
+                                  rmm::cuda_stream_view stream,
                                   rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::subword_tokenize(strings,
-                                  vocabulary_table,
-                                  max_sequence_length,
-                                  stride,
-                                  do_lower_case,
-                                  do_truncate,
-                                  cudf::get_default_stream(),
-                                  mr);
+  return detail::subword_tokenize(
+    strings, vocabulary_table, max_sequence_length, stride, do_lower_case, do_truncate,
stream, mr); } } // namespace nvtext diff --git a/cpp/src/text/tokenize.cu b/cpp/src/text/tokenize.cu index df25950e6d5..89ca8a089d6 100644 --- a/cpp/src/text/tokenize.cu +++ b/cpp/src/text/tokenize.cu @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -221,7 +222,7 @@ std::unique_ptr character_tokenize(cudf::strings_column_view const // To minimize memory, count the number of characters so we can // build the output offsets without an intermediate buffer. // In the worst case each byte is a character so the output is 4x the input. - rmm::device_scalar d_count(0, stream); + cudf::detail::device_scalar d_count(0, stream); auto const num_blocks = cudf::util::div_rounding_up_safe( cudf::util::div_rounding_up_safe(chars_bytes, static_cast(bytes_per_thread)), block_size); diff --git a/cpp/src/transform/transform.cpp b/cpp/src/transform/transform.cpp index 52b96bc9039..b919ac16956 100644 --- a/cpp/src/transform/transform.cpp +++ b/cpp/src/transform/transform.cpp @@ -23,8 +23,6 @@ #include #include #include -#include -#include #include #include diff --git a/cpp/src/transpose/transpose.cu b/cpp/src/transpose/transpose.cu index 810fd8afd73..c9872d3ef09 100644 --- a/cpp/src/transpose/transpose.cu +++ b/cpp/src/transpose/transpose.cu @@ -61,10 +61,11 @@ std::pair, table_view> transpose(table_view const& input } // namespace detail std::pair, table_view> transpose(table_view const& input, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::transpose(input, cudf::get_default_stream(), mr); + return detail::transpose(input, stream, mr); } } // namespace cudf diff --git a/cpp/src/utilities/cuda.cpp b/cpp/src/utilities/cuda.cpp index 53ca0608170..d979bda41d0 100644 --- a/cpp/src/utilities/cuda.cpp +++ b/cpp/src/utilities/cuda.cpp @@ -18,8 +18,6 @@ #include #include -#include - namespace cudf::detail { cudf::size_type num_multiprocessors() diff --git a/cpp/src/utilities/cuda_memcpy.cu b/cpp/src/utilities/cuda_memcpy.cu index 0efb881eb3e..c0af27a1748 100644 --- a/cpp/src/utilities/cuda_memcpy.cu +++ b/cpp/src/utilities/cuda_memcpy.cu @@ -30,7 +30,7 @@ namespace cudf::detail { namespace { // Simple kernel to copy between device buffers -CUDF_KERNEL void copy_kernel(char const* src, char* dst, size_t n) +CUDF_KERNEL void copy_kernel(char const* __restrict__ src, char* __restrict__ dst, size_t n) { auto const idx = cudf::detail::grid_1d::global_thread_id(); if (idx < n) { dst[idx] = src[idx]; } @@ -61,7 +61,7 @@ void copy_pageable(void* dst, void const* src, std::size_t size, rmm::cuda_strea }; // namespace -void cuda_memcpy_async( +void cuda_memcpy_async_impl( void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream) { if (kind == host_memory_kind::PINNED) { @@ -73,11 +73,4 @@ void cuda_memcpy_async( } } -void cuda_memcpy( - void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream) -{ - cuda_memcpy_async(dst, src, size, kind, stream); - stream.synchronize(); -} - } // namespace cudf::detail diff --git a/cpp/src/utilities/host_memory.cpp b/cpp/src/utilities/host_memory.cpp index 125b98c4a67..e30806a5011 100644 --- a/cpp/src/utilities/host_memory.cpp +++ b/cpp/src/utilities/host_memory.cpp @@ -18,7 +18,6 @@ #include #include #include -#include #include #include @@ -115,12 +114,19 @@ class fixed_pinned_pool_memory_resource { return !operator==(other); } - friend void get_property(fixed_pinned_pool_memory_resource const&, + // clang-tidy will complain about 
this function because it is completely
+  // unused at runtime and only exists for tag introspection by CCCL, so we
+  // ignore linting. This masks a real issue if we ever want to compile with
+  // clang, though, which is that the function will actually be compiled out by
+  // clang. If cudf were ever to try to support clang as a compiler we would
+  // need to force the compiler to emit this symbol. The same goes for the
+  // other get_property definitions in this file.
+  friend void get_property(fixed_pinned_pool_memory_resource const&,  // NOLINT
                            cuda::mr::device_accessible) noexcept
   {
   }
 
-  friend void get_property(fixed_pinned_pool_memory_resource const&,
+  friend void get_property(fixed_pinned_pool_memory_resource const&,  // NOLINT
                            cuda::mr::host_accessible) noexcept
   {
   }
@@ -235,7 +241,9 @@ class new_delete_memory_resource {
 
   bool operator!=(new_delete_memory_resource const& other) const { return !operator==(other); }
 
+  // NOLINTBEGIN
   friend void get_property(new_delete_memory_resource const&, cuda::mr::host_accessible) noexcept {}
+  // NOLINTEND
 };
 
 static_assert(cuda::mr::resource_with<new_delete_memory_resource, cuda::mr::host_accessible>,
diff --git a/cpp/src/utilities/logger.cpp b/cpp/src/utilities/logger.cpp
index d54f5677c4c..e52fffbd8c6 100644
--- a/cpp/src/utilities/logger.cpp
+++ b/cpp/src/utilities/logger.cpp
@@ -74,8 +74,10 @@ struct logger_wrapper {
 
 }  // namespace
 
-spdlog::logger& cudf::logger()
+spdlog::logger& cudf::detail::logger()
 {
   static logger_wrapper wrapped{};
   return wrapped.logger_;
 }
+
+spdlog::logger& cudf::logger() { return cudf::detail::logger(); }
diff --git a/cpp/src/utilities/prefetch.cpp b/cpp/src/utilities/prefetch.cpp
index 58971552758..000526723c4 100644
--- a/cpp/src/utilities/prefetch.cpp
+++ b/cpp/src/utilities/prefetch.cpp
@@ -14,7 +14,6 @@
  * limitations under the License.
*/ -#include #include #include diff --git a/cpp/src/utilities/stream_pool.cpp b/cpp/src/utilities/stream_pool.cpp index 9824c472b20..7069b59be26 100644 --- a/cpp/src/utilities/stream_pool.cpp +++ b/cpp/src/utilities/stream_pool.cpp @@ -23,7 +23,6 @@ #include #include -#include #include #include @@ -82,7 +81,7 @@ class rmm_cuda_stream_pool : public cuda_stream_pool { return streams; } - std::size_t get_stream_pool_size() const override { return STREAM_POOL_SIZE; } + [[nodiscard]] std::size_t get_stream_pool_size() const override { return STREAM_POOL_SIZE; } }; /** diff --git a/cpp/src/utilities/traits.cpp b/cpp/src/utilities/traits.cpp index a68dc84e340..41ee4e960b6 100644 --- a/cpp/src/utilities/traits.cpp +++ b/cpp/src/utilities/traits.cpp @@ -19,8 +19,6 @@ #include #include -#include - namespace cudf { namespace { @@ -171,6 +169,19 @@ bool is_integral_not_bool(data_type type) return cudf::type_dispatcher(type, is_integral_not_bool_impl{}); } +struct is_numeric_not_bool_impl { + template + constexpr bool operator()() + { + return is_numeric_not_bool(); + } +}; + +bool is_numeric_not_bool(data_type type) +{ + return cudf::type_dispatcher(type, is_numeric_not_bool_impl{}); +} + struct is_floating_point_impl { template constexpr bool operator()() diff --git a/cpp/src/utilities/type_checks.cpp b/cpp/src/utilities/type_checks.cpp index 3095b342748..84c8529641d 100644 --- a/cpp/src/utilities/type_checks.cpp +++ b/cpp/src/utilities/type_checks.cpp @@ -21,8 +21,6 @@ #include #include -#include - #include namespace cudf { diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index b67d922d377..666a7d4ba4b 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -56,8 +56,15 @@ function(ConfigureTest CMAKE_TEST_NAME) target_link_libraries( ${CMAKE_TEST_NAME} - PRIVATE cudftestutil GTest::gmock GTest::gmock_main GTest::gtest GTest::gtest_main - nvtx3::nvtx3-cpp $ "${_CUDF_TEST_EXTRA_LIBS}" + PRIVATE cudf::cudftestutil + cudf::cudftestutil_impl + GTest::gmock + GTest::gmock_main + GTest::gtest + GTest::gtest_main + nvtx3::nvtx3-cpp + $ + "${_CUDF_TEST_EXTRA_LIBS}" ) rapids_cuda_set_runtime(${CMAKE_TEST_NAME} USE_STATIC ${CUDA_STATIC_RUNTIME}) rapids_test_add( @@ -385,6 +392,8 @@ ConfigureTest( # * utilities tests ------------------------------------------------------------------------------- ConfigureTest( UTILITIES_TEST + utilities_tests/batched_memcpy_tests.cu + utilities_tests/batched_memset_tests.cu utilities_tests/column_debug_tests.cpp utilities_tests/column_utilities_tests.cpp utilities_tests/column_wrapper_tests.cpp @@ -395,7 +404,6 @@ ConfigureTest( utilities_tests/pinned_memory_tests.cpp utilities_tests/type_check_tests.cpp utilities_tests/type_list_tests.cpp - utilities_tests/batched_memset_tests.cu ) # ################################################################################################## @@ -642,7 +650,7 @@ ConfigureTest(ENCODE_TEST encode/encode_tests.cpp) # ################################################################################################## # * ast tests ------------------------------------------------------------------------------------- -ConfigureTest(AST_TEST ast/transform_tests.cpp) +ConfigureTest(AST_TEST ast/transform_tests.cpp ast/ast_tree_tests.cpp) # ################################################################################################## # * lists tests ---------------------------------------------------------------------------------- @@ -702,11 +710,14 @@ ConfigureTest(STREAM_MULTIBYTE_SPLIT_TEST 
streams/io/multibyte_split_test.cpp ST ConfigureTest(STREAM_NULL_MASK_TEST streams/null_mask_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_ORCIO_TEST streams/io/orc_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_PARQUETIO_TEST streams/io/parquet_test.cpp STREAM_MODE testing) +ConfigureTest(STREAM_PARTITIONING_TEST streams/partitioning_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_POOL_TEST streams/pool_test.cu STREAM_MODE testing) +ConfigureTest(STREAM_QUANTILE_TEST streams/quantile_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_REDUCTION_TEST streams/reduction_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_REPLACE_TEST streams/replace_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_RESHAPE_TEST streams/reshape_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_ROLLING_TEST streams/rolling_test.cpp STREAM_MODE testing) +ConfigureTest(STREAM_ROUND_TEST streams/round_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_SEARCH_TEST streams/search_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_SORTING_TEST streams/sorting_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_STREAM_COMPACTION_TEST streams/stream_compaction_test.cpp STREAM_MODE testing) @@ -717,6 +728,7 @@ ConfigureTest( streams/strings/contains_test.cpp streams/strings/convert_test.cpp streams/strings/extract_test.cpp + streams/strings/factory_test.cpp streams/strings/filter_test.cpp streams/strings/find_test.cpp streams/strings/replace_test.cpp @@ -732,11 +744,13 @@ ConfigureTest( streams/text/ngrams_test.cpp streams/text/replace_test.cpp streams/text/stemmer_test.cpp + streams/text/subword_tokenize_test.cpp streams/text/tokenize_test.cpp STREAM_MODE testing ) ConfigureTest(STREAM_TRANSFORM_TEST streams/transform_test.cpp STREAM_MODE testing) +ConfigureTest(STREAM_TRANSPOSE_TEST streams/transpose_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_UNARY_TEST streams/unary_test.cpp STREAM_MODE testing) # ################################################################################################## diff --git a/cpp/tests/ast/ast_tree_tests.cpp b/cpp/tests/ast/ast_tree_tests.cpp new file mode 100644 index 00000000000..1a960c68e23 --- /dev/null +++ b/cpp/tests/ast/ast_tree_tests.cpp @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include +#include +#include +#include +#include + +template +using column_wrapper = cudf::test::fixed_width_column_wrapper; + +TEST(AstTreeTest, ExpressionTree) +{ + namespace ast = cudf::ast; + using op = ast::ast_operator; + using operation = ast::operation; + + // computes (y = mx + c)... 
and linearly interpolates them using interpolator t + auto m0_col = column_wrapper{10, 20, 50, 100}; + auto x0_col = column_wrapper{10, 5, 2, 1}; + auto c0_col = column_wrapper{100, 100, 100, 100}; + + auto m1_col = column_wrapper{10, 20, 50, 100}; + auto x1_col = column_wrapper{20, 10, 4, 2}; + auto c1_col = column_wrapper{200, 200, 200, 200}; + + auto one_scalar = cudf::numeric_scalar{1}; + auto t_scalar = cudf::numeric_scalar{0.5F}; + + auto table = cudf::table_view{{m0_col, x0_col, c0_col, m1_col, x1_col, c1_col}}; + + ast::tree tree{}; + + auto const& one = tree.push(ast::literal{one_scalar}); + auto const& t = tree.push(ast::literal{t_scalar}); + auto const& m0 = tree.push(ast::column_reference(0)); + auto const& x0 = tree.push(ast::column_reference(1)); + auto const& c0 = tree.push(ast::column_reference(2)); + auto const& m1 = tree.push(ast::column_reference(3)); + auto const& x1 = tree.push(ast::column_reference(4)); + auto const& c1 = tree.push(ast::column_reference(5)); + + // compute: y0 = m0 x0 + c0 + auto const& y0 = tree.push(operation{op::ADD, tree.push(operation{op::MUL, m0, x0}), c0}); + + // compute: y1 = m1 x1 + c1 + auto const& y1 = tree.push(operation{op::ADD, tree.push(operation{op::MUL, m1, x1}), c1}); + + // compute weighted: (1 - t) * y0 + auto const& y0_w = tree.push(operation{op::MUL, tree.push(operation{op::SUB, one, t}), y0}); + + // compute weighted: y = t * y1 + auto const& y1_w = tree.push(operation{op::MUL, t, y1}); + + // add weighted: result = lerp(y0, y1, t) = (1 - t) * y0 + t * y1 + auto result = cudf::compute_column(table, tree.push(operation{op::ADD, y0_w, y1_w})); + + auto expected = column_wrapper{300, 300, 300, 300}; + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); +} diff --git a/cpp/tests/ast/transform_tests.cpp b/cpp/tests/ast/transform_tests.cpp index 6b350c137d0..995f7748167 100644 --- a/cpp/tests/ast/transform_tests.cpp +++ b/cpp/tests/ast/transform_tests.cpp @@ -18,7 +18,6 @@ #include #include #include -#include #include #include @@ -26,14 +25,8 @@ #include #include #include -#include -#include -#include #include #include -#include - -#include #include @@ -41,7 +34,6 @@ #include #include #include -#include #include template @@ -368,7 +360,7 @@ TEST_F(TransformTest, DeeplyNestedArithmeticLogicalExpression) constexpr int64_t right_depth_level = 75; auto generate_ast_expr = [](int64_t depth_level, - cudf::ast::column_reference col_ref, + cudf::ast::column_reference const& col_ref, cudf::ast::ast_operator root_operator, cudf::ast::ast_operator arithmetic_operator, bool nested_left_tree) { @@ -378,7 +370,7 @@ TEST_F(TransformTest, DeeplyNestedArithmeticLogicalExpression) auto expressions = std::list(); auto op = arithmetic_operator; - expressions.push_back(cudf::ast::operation(op, col_ref, col_ref)); + expressions.emplace_back(op, col_ref, col_ref); for (int64_t i = 0; i < depth_level - 1; i++) { if (i == depth_level - 2) { @@ -387,9 +379,9 @@ TEST_F(TransformTest, DeeplyNestedArithmeticLogicalExpression) op = arithmetic_operator; } if (nested_left_tree) { - expressions.push_back(cudf::ast::operation(op, expressions.back(), col_ref)); + expressions.emplace_back(op, expressions.back(), col_ref); } else { - expressions.push_back(cudf::ast::operation(op, col_ref, expressions.back())); + expressions.emplace_back(op, col_ref, expressions.back()); } } return expressions; @@ -538,9 +530,10 @@ TEST_F(TransformTest, UnaryTrigonometry) TEST_F(TransformTest, ArityCheckFailure) { auto col_ref_0 = cudf::ast::column_reference(0); - 
EXPECT_THROW(cudf::ast::operation(cudf::ast::ast_operator::ADD, col_ref_0), cudf::logic_error); + EXPECT_THROW(cudf::ast::operation(cudf::ast::ast_operator::ADD, col_ref_0), + std::invalid_argument); EXPECT_THROW(cudf::ast::operation(cudf::ast::ast_operator::ABS, col_ref_0, col_ref_0), - cudf::logic_error); + std::invalid_argument); } TEST_F(TransformTest, StringComparison) diff --git a/cpp/tests/binaryop/binop-compiled-test.cpp b/cpp/tests/binaryop/binop-compiled-test.cpp index 06e0d193d80..7cdce1ff735 100644 --- a/cpp/tests/binaryop/binop-compiled-test.cpp +++ b/cpp/tests/binaryop/binop-compiled-test.cpp @@ -23,12 +23,12 @@ #include #include +#include #include #include #include -#include +#include #include -#include #include @@ -557,7 +557,11 @@ auto NullOp_Result(cudf::column_view lhs, cudf::column_view rhs) std::transform(thrust::make_counting_iterator(0), thrust::make_counting_iterator(lhs.size()), result.begin(), - [&lhs_data, &lhs_mask, &rhs_data, &rhs_mask, &result_mask](auto i) -> TypeOut { + [&lhs_data = lhs_data, + &lhs_mask = lhs_mask, + &rhs_data = rhs_data, + &rhs_mask = rhs_mask, + &result_mask = result_mask](auto i) -> TypeOut { auto lhs_valid = lhs_mask.data() and cudf::bit_is_set(lhs_mask.data(), i); auto rhs_valid = rhs_mask.data() and cudf::bit_is_set(rhs_mask.data(), i); bool output_valid = lhs_valid or rhs_valid; @@ -818,4 +822,24 @@ TEST_F(BinaryOperationCompiledTest_NullOpsString, NullMin_Vector_Vector) CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, result->view()); } +TEST(BinaryOperationCompiledTest, LargeColumnNoOverflow) +{ + cudf::size_type num_rows{1'799'989'091}; + auto big = cudf::make_column_from_scalar( + cudf::numeric_scalar>{10, true}, num_rows); + auto small = cudf::make_column_from_scalar( + cudf::numeric_scalar>{1, true}, num_rows); + + auto mask = cudf::binary_operation(big->view(), + small->view(), + cudf::binary_operator::GREATER, + cudf::data_type{cudf::type_id::BOOL8}); + + auto agg = cudf::make_sum_aggregation(); + auto result = + cudf::reduce(mask->view(), *agg, cudf::data_type{cudf::type_to_id()}); + auto got = static_cast*>(result.get())->value(); + EXPECT_EQ(num_rows, got); +} + CUDF_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/binaryop/binop-generic-ptx-test.cpp b/cpp/tests/binaryop/binop-generic-ptx-test.cpp index 03cc87a1968..e9a2761db4a 100644 --- a/cpp/tests/binaryop/binop-generic-ptx-test.cpp +++ b/cpp/tests/binaryop/binop-generic-ptx-test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Copyright 2018-2019 BlazingDB, Inc. 
* Copyright 2018 Christian Noboa Mardini @@ -19,7 +19,6 @@ #include #include -#include #include #include diff --git a/cpp/tests/binaryop/util/operation.h b/cpp/tests/binaryop/util/operation.h index c900c4c558c..ef1ccfccab5 100644 --- a/cpp/tests/binaryop/util/operation.h +++ b/cpp/tests/binaryop/util/operation.h @@ -48,7 +48,7 @@ struct Add { void>* = nullptr> OutT operator()(TypeLhs lhs, TypeRhs rhs) const { - using TypeCommon = typename std::common_type::type; + using TypeCommon = std::common_type_t; return static_cast(static_cast(lhs) + static_cast(rhs)); } }; @@ -72,7 +72,7 @@ struct Sub { void>* = nullptr> OutT operator()(TypeLhs lhs, TypeRhs rhs) const { - using TypeCommon = typename std::common_type::type; + using TypeCommon = std::common_type_t; return static_cast(static_cast(lhs) - static_cast(rhs)); } }; @@ -83,7 +83,7 @@ struct Mul { std::enable_if_t::value, void>* = nullptr> TypeOut operator()(TypeLhs lhs, TypeRhs rhs) const { - using TypeCommon = typename std::common_type::type; + using TypeCommon = std::common_type_t; return static_cast(static_cast(lhs) * static_cast(rhs)); } @@ -100,7 +100,7 @@ struct Mul { std::enable_if_t<(cudf::is_duration_t::value && std::is_integral_v) || (cudf::is_duration_t::value && std::is_integral_v), void>* = nullptr> - OutT DurationProduct(LhsT x, RhsT y) const + [[nodiscard]] OutT DurationProduct(LhsT x, RhsT y) const { return x * y; } @@ -112,7 +112,7 @@ struct Div { std::enable_if_t::value, void>* = nullptr> TypeOut operator()(TypeLhs lhs, TypeRhs rhs) { - using TypeCommon = typename std::common_type::type; + using TypeCommon = std::common_type_t; return static_cast(static_cast(lhs) / static_cast(rhs)); } @@ -128,7 +128,7 @@ struct Div { typename LhsT, typename RhsT, std::enable_if_t<(std::is_integral_v || cudf::is_duration()), void>* = nullptr> - OutT DurationDivide(LhsT x, RhsT y) const + [[nodiscard]] OutT DurationDivide(LhsT x, RhsT y) const { return x / y; } @@ -191,33 +191,31 @@ struct FloorDiv { template struct Mod { - template ::type>)>* = nullptr> + template >)>* = nullptr> TypeOut operator()(TypeLhs lhs, TypeRhs rhs) { - using TypeCommon = typename std::common_type::type; + using TypeCommon = std::common_type_t; return static_cast(static_cast(lhs) % static_cast(rhs)); } - template ::type, float>)>* = nullptr> + template < + typename OutT = TypeOut, + typename LhsT = TypeLhs, + typename RhsT = TypeRhs, + std::enable_if_t<(std::is_same_v, float>)>* = nullptr> TypeOut operator()(TypeLhs lhs, TypeRhs rhs) { return static_cast(fmod(static_cast(lhs), static_cast(rhs))); } template < - typename OutT = TypeOut, - typename LhsT = TypeLhs, - typename RhsT = TypeRhs, - std::enable_if_t<(std::is_same_v::type, double>)>* = - nullptr> + typename OutT = TypeOut, + typename LhsT = TypeLhs, + typename RhsT = TypeRhs, + std::enable_if_t<(std::is_same_v, double>)>* = nullptr> TypeOut operator()(TypeLhs lhs, TypeRhs rhs) { return static_cast(fmod(static_cast(lhs), static_cast(rhs))); @@ -326,7 +324,7 @@ struct LogBase { template struct PMod { - using CommonArgsT = typename std::common_type::type; + using CommonArgsT = std::common_type_t; TypeOut operator()(TypeLhs x, TypeRhs y) const { @@ -351,8 +349,8 @@ struct PyMod { TypeOut operator()(TypeLhs x, TypeRhs y) const { if constexpr (std::is_floating_point_v or std::is_floating_point_v) { - double x1 = static_cast(x); - double y1 = static_cast(y); + auto x1 = static_cast(x); + auto y1 = static_cast(y); return fmod(fmod(x1, y1) + y1, y1); } else { return ((x % y) + y) % y; diff --git 
a/cpp/tests/bitmask/bitmask_tests.cpp b/cpp/tests/bitmask/bitmask_tests.cpp index fe221fb1c48..799bf646e52 100644 --- a/cpp/tests/bitmask/bitmask_tests.cpp +++ b/cpp/tests/bitmask/bitmask_tests.cpp @@ -16,7 +16,6 @@ #include #include #include -#include #include #include diff --git a/cpp/tests/column/bit_cast_test.cpp b/cpp/tests/column/bit_cast_test.cpp index ab230ab036e..5570a7d498c 100644 --- a/cpp/tests/column/bit_cast_test.cpp +++ b/cpp/tests/column/bit_cast_test.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include @@ -26,8 +25,6 @@ #include -#include - template struct rep_type_impl { using type = void; diff --git a/cpp/tests/column/column_test.cpp b/cpp/tests/column/column_test.cpp index 14b4197de71..d700adaebd5 100644 --- a/cpp/tests/column/column_test.cpp +++ b/cpp/tests/column/column_test.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include #include @@ -340,7 +339,7 @@ TYPED_TEST(TypedColumnTest, MoveConstructorNoMask) cudf::column moved_to{std::move(original)}; - EXPECT_EQ(0, original.size()); + EXPECT_EQ(0, original.size()); // NOLINT EXPECT_EQ(cudf::data_type{cudf::type_id::EMPTY}, original.type()); verify_column_views(moved_to); @@ -359,7 +358,7 @@ TYPED_TEST(TypedColumnTest, MoveConstructorWithMask) cudf::column moved_to{std::move(original)}; verify_column_views(moved_to); - EXPECT_EQ(0, original.size()); + EXPECT_EQ(0, original.size()); // NOLINT EXPECT_EQ(cudf::data_type{cudf::type_id::EMPTY}, original.type()); // Verify move diff --git a/cpp/tests/column/column_view_device_span_test.cpp b/cpp/tests/column/column_view_device_span_test.cpp index 6de9121158b..470437f4112 100644 --- a/cpp/tests/column/column_view_device_span_test.cpp +++ b/cpp/tests/column/column_view_device_span_test.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include diff --git a/cpp/tests/column/column_view_shallow_test.cpp b/cpp/tests/column/column_view_shallow_test.cpp index 37ab4b8f387..ad344476332 100644 --- a/cpp/tests/column/column_view_shallow_test.cpp +++ b/cpp/tests/column/column_view_shallow_test.cpp @@ -15,9 +15,7 @@ */ #include -#include #include -#include #include #include diff --git a/cpp/tests/column/factories_test.cpp b/cpp/tests/column/factories_test.cpp index 603187f0330..aa9d508b6aa 100644 --- a/cpp/tests/column/factories_test.cpp +++ b/cpp/tests/column/factories_test.cpp @@ -26,11 +26,8 @@ #include #include #include -#include #include -#include - #include class ColumnFactoryTest : public cudf::test::BaseFixture { diff --git a/cpp/tests/copying/concatenate_tests.cpp b/cpp/tests/copying/concatenate_tests.cpp index 18140c34abd..aedc498964a 100644 --- a/cpp/tests/copying/concatenate_tests.cpp +++ b/cpp/tests/copying/concatenate_tests.cpp @@ -34,8 +34,6 @@ #include #include -#include - #include #include #include diff --git a/cpp/tests/copying/copy_if_else_nested_tests.cpp b/cpp/tests/copying/copy_if_else_nested_tests.cpp index cfbd181f944..e1cdfe9beed 100644 --- a/cpp/tests/copying/copy_if_else_nested_tests.cpp +++ b/cpp/tests/copying/copy_if_else_nested_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -17,7 +17,6 @@ #include #include #include -#include #include #include diff --git a/cpp/tests/copying/copy_range_tests.cpp b/cpp/tests/copying/copy_range_tests.cpp index 25d93da277b..e2133a546e4 100644 --- a/cpp/tests/copying/copy_range_tests.cpp +++ b/cpp/tests/copying/copy_range_tests.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include diff --git a/cpp/tests/copying/copy_tests.cpp b/cpp/tests/copying/copy_tests.cpp index 7c8729b6a77..9c00725d5d2 100644 --- a/cpp/tests/copying/copy_tests.cpp +++ b/cpp/tests/copying/copy_tests.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include @@ -73,44 +72,45 @@ TYPED_TEST(CopyTest, CopyIfElseTestLong) using T = TypeParam; // make sure we span at least 2 warps - int num_els = 64; - - bool mask[] = {true, false, true, false, true, true, true, true, true, true, true, true, true, - true, true, true, true, true, true, false, false, false, false, true, true, true, - true, true, true, true, true, true, false, false, false, false, true, true, true, - true, true, true, true, true, true, true, true, true, true, true, true, true, - true, true, true, true, true, true, true, true, true, true, true, true}; - cudf::test::fixed_width_column_wrapper mask_w(mask, mask + num_els); - - bool lhs_v[] = {true, true, true, true, false, false, true, true, true, true, true, true, true, - true, true, true, true, true, true, true, true, true, true, true, true, true, - true, true, true, true, true, true, true, true, true, true, true, true, true, - true, true, true, true, true, true, true, true, true, true, true, true, true, - true, true, true, true, true, true, true, true, true, true, true, true}; - wrapper lhs_w({5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5}, - lhs_v); - - bool rhs_v[] = {true, true, true, true, true, true, false, false, true, true, true, true, true, - true, true, true, true, true, true, true, true, true, true, true, true, true, - true, true, true, true, true, true, true, true, true, true, true, true, true, - true, true, true, true, true, true, true, true, true, true, true, true, true, - true, true, true, true, true, true, true, true, true, true, true, true}; - wrapper rhs_w({6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, - 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, - 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6}, - rhs_v); - - bool exp_v[] = {true, true, true, true, false, false, true, true, true, true, true, true, true, - true, true, true, true, true, true, true, true, true, true, true, true, true, - true, true, true, true, true, true, true, true, true, true, true, true, true, - true, true, true, true, true, true, true, true, true, true, true, true, true, - true, true, true, true, true, true, true, true, true, true, true, true}; - wrapper expected_w({5, 6, 5, 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, - 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5}, - exp_v); + constexpr int num_els = 64; + + std::array mask{ + true, false, true, false, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, false, false, false, false, true, true, true, + true, true, true, true, true, true, false, false, false, false, true, true, true, + true, true, true, true, true, true, true, true, true, true, 
true, true, true, + true, true, true, true, true, true, true, true, true, true, true, true}; + cudf::test::fixed_width_column_wrapper mask_w(mask.begin(), mask.end()); + + wrapper lhs_w( + {5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5}, + {true, true, true, true, false, false, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true, true}); + + wrapper rhs_w( + {6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6}, + {true, true, true, true, true, true, false, false, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true, true}); + + wrapper expected_w( + {5, 6, 5, 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, + 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5}, + {true, true, true, true, false, false, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true, true}); auto out = cudf::copy_if_else(lhs_w, rhs_w, mask_w); CUDF_TEST_EXPECT_COLUMNS_EQUAL(out->view(), expected_w); @@ -318,19 +318,17 @@ TYPED_TEST(CopyTestNumeric, CopyIfElseTestScalarColumn) { using T = TypeParam; - int num_els = 4; - - bool mask[] = {true, false, false, true}; - cudf::test::fixed_width_column_wrapper mask_w(mask, mask + num_els); + std::array mask{true, false, false, true}; + cudf::test::fixed_width_column_wrapper mask_w(mask.begin(), mask.end()); cudf::numeric_scalar lhs_w(5); auto const rhs = cudf::test::make_type_param_vector({6, 6, 6, 6}); - bool rhs_v[] = {true, false, true, true}; - wrapper rhs_w(rhs.begin(), rhs.end(), rhs_v); + std::array rhs_v{true, false, true, true}; + wrapper rhs_w(rhs.begin(), rhs.end(), rhs_v.begin()); auto const expected = cudf::test::make_type_param_vector({5, 6, 6, 5}); - wrapper expected_w(expected.begin(), expected.end(), rhs_v); + wrapper expected_w(expected.begin(), expected.end(), rhs_v.begin()); auto out = cudf::copy_if_else(lhs_w, rhs_w, mask_w); CUDF_TEST_EXPECT_COLUMNS_EQUAL(out->view(), expected_w); @@ -340,20 +338,18 @@ TYPED_TEST(CopyTestNumeric, CopyIfElseTestColumnScalar) { using T = TypeParam; - int num_els = 4; - - bool mask[] = {true, false, false, true}; - bool mask_v[] = {true, true, true, false}; - cudf::test::fixed_width_column_wrapper mask_w(mask, mask + num_els, mask_v); + std::array mask{true, false, false, true}; + std::array mask_v{true, true, true, false}; + cudf::test::fixed_width_column_wrapper mask_w(mask.begin(), 
mask.end(), mask_v.begin()); auto const lhs = cudf::test::make_type_param_vector({5, 5, 5, 5}); - bool lhs_v[] = {false, true, true, true}; - wrapper lhs_w(lhs.begin(), lhs.end(), lhs_v); + std::array lhs_v{false, true, true, true}; + wrapper lhs_w(lhs.begin(), lhs.end(), lhs_v.begin()); cudf::numeric_scalar rhs_w(6); auto const expected = cudf::test::make_type_param_vector({5, 6, 6, 6}); - wrapper expected_w(expected.begin(), expected.end(), lhs_v); + wrapper expected_w(expected.begin(), expected.end(), lhs_v.begin()); auto out = cudf::copy_if_else(lhs_w, rhs_w, mask_w); CUDF_TEST_EXPECT_COLUMNS_EQUAL(out->view(), expected_w); @@ -363,16 +359,14 @@ TYPED_TEST(CopyTestNumeric, CopyIfElseTestScalarScalar) { using T = TypeParam; - int num_els = 4; - - bool mask[] = {true, false, false, true}; - cudf::test::fixed_width_column_wrapper mask_w(mask, mask + num_els); + std::array mask{true, false, false, true}; + cudf::test::fixed_width_column_wrapper mask_w(mask.begin(), mask.end()); cudf::numeric_scalar lhs_w(5); cudf::numeric_scalar rhs_w(6, false); auto const expected = cudf::test::make_type_param_vector({5, 6, 6, 5}); - wrapper expected_w(expected.begin(), expected.end(), mask); + wrapper expected_w(expected.begin(), expected.end(), mask.begin()); auto out = cudf::copy_if_else(lhs_w, rhs_w, mask_w); CUDF_TEST_EXPECT_COLUMNS_EQUAL(out->view(), expected_w); @@ -405,17 +399,15 @@ TYPED_TEST(CopyTestChrono, CopyIfElseTestScalarColumn) { using T = TypeParam; - int num_els = 4; - - bool mask[] = {true, false, false, true}; - cudf::test::fixed_width_column_wrapper mask_w(mask, mask + num_els); + std::array mask{true, false, false, true}; + cudf::test::fixed_width_column_wrapper mask_w(mask.begin(), mask.end()); auto lhs_w = create_chrono_scalar{}(cudf::test::make_type_param_scalar(5), true); - bool rhs_v[] = {true, false, true, true}; - wrapper rhs_w({6, 6, 6, 6}, rhs_v); + std::array rhs_v{true, false, true, true}; + wrapper rhs_w({6, 6, 6, 6}, rhs_v.begin()); - wrapper expected_w({5, 6, 6, 5}, rhs_v); + wrapper expected_w({5, 6, 6, 5}, rhs_v.begin()); auto out = cudf::copy_if_else(lhs_w, rhs_w, mask_w); CUDF_TEST_EXPECT_COLUMNS_EQUAL(out->view(), expected_w); @@ -425,17 +417,15 @@ TYPED_TEST(CopyTestChrono, CopyIfElseTestColumnScalar) { using T = TypeParam; - int num_els = 4; - - bool mask[] = {true, false, false, true}; - cudf::test::fixed_width_column_wrapper mask_w(mask, mask + num_els); + std::array mask{true, false, false, true}; + cudf::test::fixed_width_column_wrapper mask_w(mask.begin(), mask.end()); - bool lhs_v[] = {false, true, true, true}; - wrapper lhs_w({5, 5, 5, 5}, lhs_v); + std::array lhs_v{false, true, true, true}; + wrapper lhs_w({5, 5, 5, 5}, lhs_v.begin()); auto rhs_w = create_chrono_scalar{}(cudf::test::make_type_param_scalar(6), true); - wrapper expected_w({5, 6, 6, 5}, lhs_v); + wrapper expected_w({5, 6, 6, 5}, lhs_v.begin()); auto out = cudf::copy_if_else(lhs_w, rhs_w, mask_w); CUDF_TEST_EXPECT_COLUMNS_EQUAL(out->view(), expected_w); @@ -445,15 +435,13 @@ TYPED_TEST(CopyTestChrono, CopyIfElseTestScalarScalar) { using T = TypeParam; - int num_els = 4; - - bool mask[] = {true, false, false, true}; - cudf::test::fixed_width_column_wrapper mask_w(mask, mask + num_els); + std::array mask{true, false, false, true}; + cudf::test::fixed_width_column_wrapper mask_w(mask.begin(), mask.end()); auto lhs_w = create_chrono_scalar{}(cudf::test::make_type_param_scalar(5), true); auto rhs_w = create_chrono_scalar{}(cudf::test::make_type_param_scalar(6), false); - wrapper expected_w({5, 6, 
6, 5}, mask); + wrapper expected_w({5, 6, 6, 5}, mask.begin()); auto out = cudf::copy_if_else(lhs_w, rhs_w, mask_w); CUDF_TEST_EXPECT_COLUMNS_EQUAL(out->view(), expected_w); @@ -483,9 +471,9 @@ TEST_F(StringsCopyIfElseTest, CopyIfElse) std::vector h_strings2{"zz", "", "yyy", "w", "ééé", "ooo"}; cudf::test::strings_column_wrapper strings2(h_strings2.begin(), h_strings2.end(), valids); - bool mask[] = {true, true, false, true, false, true}; - bool mask_v[] = {true, true, true, true, true, false}; - cudf::test::fixed_width_column_wrapper mask_w(mask, mask + 6, mask_v); + std::array mask{true, true, false, true, false, true}; + std::array mask_v{true, true, true, true, true, false}; + cudf::test::fixed_width_column_wrapper mask_w(mask.begin(), mask.end(), mask_v.begin()); auto results = cudf::copy_if_else(strings1, strings2, mask_w); @@ -510,9 +498,9 @@ TEST_F(StringsCopyIfElseTest, CopyIfElseScalarColumn) std::vector h_strings2{"zz", "", "yyy", "w", "ééé", "ooo"}; cudf::test::strings_column_wrapper strings2(h_strings2.begin(), h_strings2.end(), valids); - bool mask[] = {true, false, true, false, true, false}; - bool mask_v[] = {true, true, true, true, true, false}; - cudf::test::fixed_width_column_wrapper mask_w(mask, mask + 6, mask_v); + std::array mask{true, false, true, false, true, false}; + std::array mask_v{true, true, true, true, true, false}; + cudf::test::fixed_width_column_wrapper mask_w(mask.begin(), mask.end(), mask_v.begin()); auto results = cudf::copy_if_else(strings1, strings2, mask_w); @@ -538,8 +526,8 @@ TEST_F(StringsCopyIfElseTest, CopyIfElseColumnScalar) std::vector h_strings2{"zz", "", "yyy", "w", "ééé", "ooo"}; cudf::test::strings_column_wrapper strings2(h_strings2.begin(), h_strings2.end(), valids); - bool mask[] = {false, true, true, true, false, true}; - cudf::test::fixed_width_column_wrapper mask_w(mask, mask + 6); + std::array mask{false, true, true, true, false, true}; + cudf::test::fixed_width_column_wrapper mask_w(mask.begin(), mask.end()); auto results = cudf::copy_if_else(strings2, strings1, mask_w); @@ -565,9 +553,8 @@ TEST_F(StringsCopyIfElseTest, CopyIfElseScalarScalar) std::vector h_string2{"aaa"}; cudf::string_scalar string2{h_string2[0], false}; - constexpr cudf::size_type mask_size = 6; - bool mask[] = {true, false, true, false, true, false}; - cudf::test::fixed_width_column_wrapper mask_w(mask, mask + mask_size); + std::array mask{true, false, true, false, true, false}; + cudf::test::fixed_width_column_wrapper mask_w(mask.begin(), mask.end()); auto results = cudf::copy_if_else(string1, string2, mask_w); @@ -652,9 +639,9 @@ TEST_F(DictionaryCopyIfElseTest, ColumnColumn) cudf::test::dictionary_column_wrapper input2( h_strings2.begin(), h_strings2.end(), valids); - bool mask[] = {true, true, false, true, false, true}; - bool mask_v[] = {true, true, true, true, true, false}; - cudf::test::fixed_width_column_wrapper mask_w(mask, mask + 6, mask_v); + std::array mask{true, true, false, true, false, true}; + std::array mask_v{true, true, true, true, true, false}; + cudf::test::fixed_width_column_wrapper mask_w(mask.begin(), mask.end(), mask_v.begin()); auto results = cudf::copy_if_else(input1, input2, mask_w); auto decoded = cudf::dictionary::decode(cudf::dictionary_column_view(results->view())); @@ -679,8 +666,8 @@ TEST_F(DictionaryCopyIfElseTest, ColumnScalar) cudf::test::dictionary_column_wrapper input2( h_strings.begin(), h_strings.end(), valids); - bool mask[] = {false, true, true, true, false, true}; - cudf::test::fixed_width_column_wrapper mask_w(mask, 
mask + 6); + std::array mask{false, true, true, true, false, true}; + cudf::test::fixed_width_column_wrapper mask_w(mask.begin(), mask.end()); auto results = cudf::copy_if_else(input2, input1, mask_w); auto decoded = cudf::dictionary::decode(cudf::dictionary_column_view(results->view())); diff --git a/cpp/tests/copying/gather_list_tests.cpp b/cpp/tests/copying/gather_list_tests.cpp index 247090aac90..93f71345c5c 100644 --- a/cpp/tests/copying/gather_list_tests.cpp +++ b/cpp/tests/copying/gather_list_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,8 +17,6 @@ #include #include #include -#include -#include #include #include diff --git a/cpp/tests/copying/gather_str_tests.cpp b/cpp/tests/copying/gather_str_tests.cpp index 28098878086..795e3f30aa1 100644 --- a/cpp/tests/copying/gather_str_tests.cpp +++ b/cpp/tests/copying/gather_str_tests.cpp @@ -16,7 +16,6 @@ #include #include #include -#include #include #include diff --git a/cpp/tests/copying/gather_struct_tests.cpp b/cpp/tests/copying/gather_struct_tests.cpp index 1598ab2646a..b2c0f7acc3a 100644 --- a/cpp/tests/copying/gather_struct_tests.cpp +++ b/cpp/tests/copying/gather_struct_tests.cpp @@ -17,20 +17,15 @@ #include #include #include -#include #include #include #include #include #include -#include -#include -#include #include #include #include -#include #include diff --git a/cpp/tests/copying/gather_tests.cpp b/cpp/tests/copying/gather_tests.cpp index 07ce672b14d..908dcd67673 100644 --- a/cpp/tests/copying/gather_tests.cpp +++ b/cpp/tests/copying/gather_tests.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include #include diff --git a/cpp/tests/copying/get_value_tests.cpp b/cpp/tests/copying/get_value_tests.cpp index 90ff97e7355..b2d64dac7c8 100644 --- a/cpp/tests/copying/get_value_tests.cpp +++ b/cpp/tests/copying/get_value_tests.cpp @@ -16,10 +16,8 @@ #include #include -#include #include #include -#include #include #include diff --git a/cpp/tests/copying/purge_nonempty_nulls_tests.cpp b/cpp/tests/copying/purge_nonempty_nulls_tests.cpp index 4f28ff12941..1f76efdc4c3 100644 --- a/cpp/tests/copying/purge_nonempty_nulls_tests.cpp +++ b/cpp/tests/copying/purge_nonempty_nulls_tests.cpp @@ -16,13 +16,10 @@ #include #include #include -#include #include #include #include -#include -#include #include #include #include diff --git a/cpp/tests/copying/reverse_tests.cpp b/cpp/tests/copying/reverse_tests.cpp index e4b2d319ddf..46516436901 100644 --- a/cpp/tests/copying/reverse_tests.cpp +++ b/cpp/tests/copying/reverse_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,17 +17,13 @@ #include #include #include -#include #include #include #include -#include -#include #include #include -#include #include #include #include diff --git a/cpp/tests/copying/sample_tests.cpp b/cpp/tests/copying/sample_tests.cpp index 2f76e3f1fcd..8be5d8c1fbb 100644 --- a/cpp/tests/copying/sample_tests.cpp +++ b/cpp/tests/copying/sample_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,12 +15,9 @@ */ #include -#include #include -#include #include -#include #include #include #include diff --git a/cpp/tests/copying/scatter_list_scalar_tests.cpp b/cpp/tests/copying/scatter_list_scalar_tests.cpp index 42d2e004d6b..23faa6e5b86 100644 --- a/cpp/tests/copying/scatter_list_scalar_tests.cpp +++ b/cpp/tests/copying/scatter_list_scalar_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,7 +21,6 @@ #include #include -#include using mask_vector = std::vector; using size_column = cudf::test::fixed_width_column_wrapper; diff --git a/cpp/tests/copying/scatter_list_tests.cpp b/cpp/tests/copying/scatter_list_tests.cpp index a82860a3eec..1f87fcfcc99 100644 --- a/cpp/tests/copying/scatter_list_tests.cpp +++ b/cpp/tests/copying/scatter_list_tests.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include diff --git a/cpp/tests/copying/scatter_struct_scalar_tests.cpp b/cpp/tests/copying/scatter_struct_scalar_tests.cpp index 78572b0bb37..1d1da8a1b1e 100644 --- a/cpp/tests/copying/scatter_struct_scalar_tests.cpp +++ b/cpp/tests/copying/scatter_struct_scalar_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,7 +19,6 @@ #include #include -#include #include #include #include diff --git a/cpp/tests/copying/scatter_struct_tests.cpp b/cpp/tests/copying/scatter_struct_tests.cpp index c92244d047b..7d88e9af85f 100644 --- a/cpp/tests/copying/scatter_struct_tests.cpp +++ b/cpp/tests/copying/scatter_struct_tests.cpp @@ -21,7 +21,6 @@ #include #include -#include #include using namespace cudf::test::iterators; diff --git a/cpp/tests/copying/scatter_tests.cpp b/cpp/tests/copying/scatter_tests.cpp index 41a753cd0ac..74c04446bdd 100644 --- a/cpp/tests/copying/scatter_tests.cpp +++ b/cpp/tests/copying/scatter_tests.cpp @@ -15,7 +15,6 @@ */ #include -#include #include #include #include @@ -23,7 +22,6 @@ #include #include #include -#include #include diff --git a/cpp/tests/copying/segmented_gather_list_tests.cpp b/cpp/tests/copying/segmented_gather_list_tests.cpp index 8881fb344a2..a133ae43872 100644 --- a/cpp/tests/copying/segmented_gather_list_tests.cpp +++ b/cpp/tests/copying/segmented_gather_list_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -22,7 +22,6 @@ #include #include #include -#include #include #include diff --git a/cpp/tests/copying/shift_tests.cpp b/cpp/tests/copying/shift_tests.cpp index ff6808d9a79..72a8e7357bc 100644 --- a/cpp/tests/copying/shift_tests.cpp +++ b/cpp/tests/copying/shift_tests.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include @@ -30,7 +29,6 @@ #include #include -#include using TestTypes = cudf::test::Types; diff --git a/cpp/tests/copying/slice_tests.cpp b/cpp/tests/copying/slice_tests.cpp index bebd3d25610..3868a147fa8 100644 --- a/cpp/tests/copying/slice_tests.cpp +++ b/cpp/tests/copying/slice_tests.cpp @@ -22,13 +22,10 @@ #include #include -#include #include #include -#include -#include -#include +#include #include #include #include @@ -370,11 +367,12 @@ TEST_F(SliceStringTableTest, StringWithNulls) auto valids = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2 == 0; }); - std::vector strings[2] = { - {"", "this", "is", "a", "column", "of", "strings", "with", "in", "valid"}, - {"", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"}}; - cudf::test::strings_column_wrapper sw[2] = {{strings[0].begin(), strings[0].end(), valids}, - {strings[1].begin(), strings[1].end(), valids}}; + std::vector> strings{ + {{"", "this", "is", "a", "column", "of", "strings", "with", "in", "valid"}, + {"", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"}}}; + std::array sw{ + {{strings[0].begin(), strings[0].end(), valids}, + {strings[1].begin(), strings[1].end(), valids}}}; std::vector> scols; scols.push_back(sw[0].release()); diff --git a/cpp/tests/copying/slice_tests.cuh b/cpp/tests/copying/slice_tests.cuh index a180740f143..1e037294527 100644 --- a/cpp/tests/copying/slice_tests.cuh +++ b/cpp/tests/copying/slice_tests.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -148,7 +148,7 @@ std::vector create_expected_tables(cudf::size_type num_cols, } } - result.push_back(cudf::table(std::move(cols))); + result.emplace_back(std::move(cols)); } return result; @@ -163,13 +163,12 @@ inline std::vector create_expected_string_co for (unsigned long index = 0; index < indices.size(); index += 2) { if (not nullable) { - result.push_back(cudf::test::strings_column_wrapper(strings.begin() + indices[index], - strings.begin() + indices[index + 1])); + result.emplace_back(strings.begin() + indices[index], strings.begin() + indices[index + 1]); } else { auto valids = cudf::detail::make_counting_transform_iterator( indices[index], [](auto i) { return i % 2 == 0; }); - result.push_back(cudf::test::strings_column_wrapper( - strings.begin() + indices[index], strings.begin() + indices[index + 1], valids)); + result.emplace_back( + strings.begin() + indices[index], strings.begin() + indices[index + 1], valids); } } @@ -184,16 +183,16 @@ inline std::vector create_expected_string_co std::vector result = {}; for (unsigned long index = 0; index < indices.size(); index += 2) { - result.push_back(cudf::test::strings_column_wrapper(strings.begin() + indices[index], - strings.begin() + indices[index + 1], - validity.begin() + indices[index])); + result.emplace_back(strings.begin() + indices[index], + strings.begin() + indices[index + 1], + validity.begin() + indices[index]); } return result; } inline std::vector create_expected_string_tables( - std::vector const strings[2], + std::vector> const strings, std::vector const& indices, bool nullable) { @@ -216,7 +215,7 @@ inline std::vector create_expected_string_tables( } } - result.push_back(cudf::table(std::move(cols))); + result.emplace_back(std::move(cols)); } return result; diff --git a/cpp/tests/copying/split_tests.cpp b/cpp/tests/copying/split_tests.cpp index ee3e7da5e0f..b56b0f2d3f8 100644 --- a/cpp/tests/copying/split_tests.cpp +++ b/cpp/tests/copying/split_tests.cpp @@ -35,6 +35,7 @@ #include #include +#include #include #include #include @@ -135,7 +136,7 @@ std::vector create_expected_tables_for_splits( } std::vector create_expected_string_tables_for_splits( - std::vector const strings[2], + std::vector> const strings, std::vector const& splits, bool nullable) { @@ -144,8 +145,8 @@ std::vector create_expected_string_tables_for_splits( } std::vector create_expected_string_tables_for_splits( - std::vector const strings[2], - std::vector const validity[2], + std::vector> const strings, + std::vector> const validity, std::vector const& splits) { std::vector indices = splits_to_indices(splits, strings[0].size()); @@ -627,11 +628,12 @@ void split_string_with_invalids(SplitFunc Split, auto valids = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2 == 0; }); - std::vector strings[2] = { - {"", "this", "is", "a", "column", "of", "strings", "with", "in", "valid"}, - {"", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"}}; - cudf::test::strings_column_wrapper sw[2] = {{strings[0].begin(), strings[0].end(), valids}, - {strings[1].begin(), strings[1].end(), valids}}; + std::vector> strings{ + {{"", "this", "is", "a", "column", "of", "strings", "with", "in", "valid"}, + {"", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"}}}; + std::array sw{ + {{strings[0].begin(), strings[0].end(), valids}, + {strings[1].begin(), strings[1].end(), valids}}}; std::vector> scols; scols.push_back(sw[0].release()); @@ -658,11 +660,12 @@ void 
split_empty_output_strings_column_value(SplitFunc Split, auto valids = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2 == 0; }); - std::vector strings[2] = { - {"", "this", "is", "a", "column", "of", "strings", "with", "in", "valid"}, - {"", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"}}; - cudf::test::strings_column_wrapper sw[2] = {{strings[0].begin(), strings[0].end(), valids}, - {strings[1].begin(), strings[1].end(), valids}}; + std::vector> strings{ + {{"", "this", "is", "a", "column", "of", "strings", "with", "in", "valid"}, + {"", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"}}}; + std::array sw{ + {{strings[0].begin(), strings[0].end(), valids}, + {strings[1].begin(), strings[1].end(), valids}}}; std::vector> scols; scols.push_back(sw[0].release()); @@ -684,9 +687,9 @@ void split_null_input_strings_column_value(SplitFunc Split, CompareFunc Compare) auto valids = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2 == 0; }); - std::vector strings[2] = { - {"", "this", "is", "a", "column", "of", "strings", "with", "in", "valid"}, - {"", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"}}; + std::vector> strings{ + {{"", "this", "is", "a", "column", "of", "strings", "with", "in", "valid"}, + {"", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"}}}; std::vector splits{2, 5, 9}; @@ -699,16 +702,17 @@ void split_null_input_strings_column_value(SplitFunc Split, CompareFunc Compare) EXPECT_NO_THROW(Split(empty_table, splits)); } - cudf::test::strings_column_wrapper sw[2] = {{strings[0].begin(), strings[0].end(), no_valids}, - {strings[1].begin(), strings[1].end(), valids}}; + std::array sw{ + {{strings[0].begin(), strings[0].end(), no_valids}, + {strings[1].begin(), strings[1].end(), valids}}}; std::vector> scols; scols.push_back(sw[0].release()); scols.push_back(sw[1].release()); cudf::table src_table(std::move(scols)); auto result = Split(src_table, splits); - std::vector validity_masks[2] = {std::vector(strings[0].size()), - std::vector(strings[0].size())}; + std::vector> validity_masks{std::vector(strings[0].size()), + std::vector(strings[0].size())}; std::generate( validity_masks[1].begin(), validity_masks[1].end(), [i = 0]() mutable { return i++ % 2 == 0; }); @@ -1913,9 +1917,9 @@ TEST_F(ContiguousSplitTableCornerCases, MixedColumnTypes) cudf::size_type start = 0; auto valids = cudf::detail::make_counting_transform_iterator(start, [](auto i) { return true; }); - std::vector strings[2] = { - {"", "this", "is", "a", "column", "of", "strings", "with", "in", "valid"}, - {"", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"}}; + std::vector> strings{ + {{"", "this", "is", "a", "column", "of", "strings", "with", "in", "valid"}, + {"", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"}}}; std::vector> cols; @@ -2377,7 +2381,7 @@ TEST_F(ContiguousSplitTableCornerCases, OutBufferToSmall) { // internally, contiguous split chunks GPU work in 1MB contiguous copies // so the output buffer must be 1MB or larger. 
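For context, cudf::chunked_pack (exercised by the EXPECT_THROW just below) streams a packed table through a caller-provided bounce buffer that must be at least 1MB. A minimal usage sketch, assuming cudf's public contiguous_split API; the copy-out step is left as a comment:

#include <cudf/contiguous_split.hpp>
#include <cudf/table/table_view.hpp>
#include <cudf/utilities/default_stream.hpp>
#include <cudf/utilities/span.hpp>
#include <rmm/device_buffer.hpp>
#include <cstdint>

void pack_in_chunks(cudf::table_view const& input)
{
  std::size_t const bounce_size = 1024 * 1024;  // 1MB minimum; smaller sizes throw, as tested below
  rmm::device_buffer bounce(bounce_size, cudf::get_default_stream());
  auto packer = cudf::chunked_pack::create(input, bounce_size);
  while (packer->has_next()) {
    auto const bytes_written = packer->next(cudf::device_span<std::uint8_t>(
      static_cast<std::uint8_t*>(bounce.data()), bounce_size));
    // consume `bytes_written` bytes from `bounce` (e.g., copy them to the destination sink)
  }
}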
- EXPECT_THROW(cudf::chunked_pack::create({}, 1 * 1024), cudf::logic_error); + EXPECT_THROW(auto _ = cudf::chunked_pack::create({}, 1 * 1024), cudf::logic_error); } TEST_F(ContiguousSplitTableCornerCases, ChunkSpanTooSmall) diff --git a/cpp/tests/copying/utility_tests.cpp b/cpp/tests/copying/utility_tests.cpp index 0905f9babdc..90457f8d74c 100644 --- a/cpp/tests/copying/utility_tests.cpp +++ b/cpp/tests/copying/utility_tests.cpp @@ -23,7 +23,6 @@ #include #include -#include #include #include diff --git a/cpp/tests/datetime/datetime_ops_test.cpp b/cpp/tests/datetime/datetime_ops_test.cpp index 13577c4d0ea..1d1deb42a51 100644 --- a/cpp/tests/datetime/datetime_ops_test.cpp +++ b/cpp/tests/datetime/datetime_ops_test.cpp @@ -23,14 +23,11 @@ #include #include -#include #include #include #include #include -#include - #define XXX false // stub for null values constexpr cudf::test::debug_output_level verbosity{cudf::test::debug_output_level::ALL_ERRORS}; @@ -55,16 +52,26 @@ TYPED_TEST(NonTimestampTest, TestThrowsOnNonTimestamp) cudf::data_type dtype{cudf::type_to_id()}; cudf::column col{dtype, 0, rmm::device_buffer{}, rmm::device_buffer{}, 0}; - EXPECT_THROW(extract_year(col), cudf::logic_error); - EXPECT_THROW(extract_month(col), cudf::logic_error); - EXPECT_THROW(extract_day(col), cudf::logic_error); - EXPECT_THROW(extract_weekday(col), cudf::logic_error); - EXPECT_THROW(extract_hour(col), cudf::logic_error); - EXPECT_THROW(extract_minute(col), cudf::logic_error); - EXPECT_THROW(extract_second(col), cudf::logic_error); - EXPECT_THROW(extract_millisecond_fraction(col), cudf::logic_error); - EXPECT_THROW(extract_microsecond_fraction(col), cudf::logic_error); - EXPECT_THROW(extract_nanosecond_fraction(col), cudf::logic_error); + EXPECT_THROW(extract_datetime_component(col, cudf::datetime::datetime_component::YEAR), + cudf::logic_error); + EXPECT_THROW(extract_datetime_component(col, cudf::datetime::datetime_component::MONTH), + cudf::logic_error); + EXPECT_THROW(extract_datetime_component(col, cudf::datetime::datetime_component::DAY), + cudf::logic_error); + EXPECT_THROW(extract_datetime_component(col, cudf::datetime::datetime_component::WEEKDAY), + cudf::logic_error); + EXPECT_THROW(extract_datetime_component(col, cudf::datetime::datetime_component::HOUR), + cudf::logic_error); + EXPECT_THROW(extract_datetime_component(col, cudf::datetime::datetime_component::MINUTE), + cudf::logic_error); + EXPECT_THROW(extract_datetime_component(col, cudf::datetime::datetime_component::SECOND), + cudf::logic_error); + EXPECT_THROW(extract_datetime_component(col, cudf::datetime::datetime_component::MILLISECOND), + cudf::logic_error); + EXPECT_THROW(extract_datetime_component(col, cudf::datetime::datetime_component::MICROSECOND), + cudf::logic_error); + EXPECT_THROW(extract_datetime_component(col, cudf::datetime::datetime_component::NANOSECOND), + cudf::logic_error); EXPECT_THROW(last_day_of_month(col), cudf::logic_error); EXPECT_THROW(day_of_year(col), cudf::logic_error); EXPECT_THROW(add_calendrical_months(col, *cudf::make_empty_column(cudf::type_id::INT16)), @@ -107,95 +114,135 @@ TEST_F(BasicDatetimeOpsTest, TestExtractingDatetimeComponents) 987234623 // 1970-01-01 00:00:00.987234623 GMT }; - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_year(timestamps_D), - fixed_width_column_wrapper{1965, 2018, 2023}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_year(timestamps_s), - fixed_width_column_wrapper{1965, 2018, 2023}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_year(timestamps_ms), - fixed_width_column_wrapper{1965, 2018, 
2023}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_year(timestamps_ns), - fixed_width_column_wrapper{1969, 1970, 1970}); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_month(timestamps_D), - fixed_width_column_wrapper{10, 7, 1}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_month(timestamps_s), - fixed_width_column_wrapper{10, 7, 1}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_month(timestamps_ms), - fixed_width_column_wrapper{10, 7, 1}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_month(timestamps_ns), - fixed_width_column_wrapper{12, 1, 1}); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_day(timestamps_D), - fixed_width_column_wrapper{26, 4, 25}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_day(timestamps_s), - fixed_width_column_wrapper{26, 4, 25}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_day(timestamps_ms), - fixed_width_column_wrapper{26, 4, 25}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_day(timestamps_ns), - fixed_width_column_wrapper{31, 1, 1}); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_weekday(timestamps_D), - fixed_width_column_wrapper{2, 3, 3}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_weekday(timestamps_s), - fixed_width_column_wrapper{2, 3, 3}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_weekday(timestamps_ms), - fixed_width_column_wrapper{2, 3, 3}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_weekday(timestamps_ms), - fixed_width_column_wrapper{2, 3, 3}); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_hour(timestamps_D), - fixed_width_column_wrapper{0, 0, 0}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_hour(timestamps_s), - fixed_width_column_wrapper{14, 12, 7}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_hour(timestamps_ms), - fixed_width_column_wrapper{14, 12, 7}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_hour(timestamps_ns), - fixed_width_column_wrapper{23, 0, 0}); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_minute(timestamps_D), - fixed_width_column_wrapper{0, 0, 0}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_minute(timestamps_s), - fixed_width_column_wrapper{1, 0, 32}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_minute(timestamps_ms), - fixed_width_column_wrapper{1, 0, 32}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_minute(timestamps_ns), - fixed_width_column_wrapper{59, 0, 0}); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_second(timestamps_D), - fixed_width_column_wrapper{0, 0, 0}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_second(timestamps_s), - fixed_width_column_wrapper{12, 0, 12}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_second(timestamps_ms), - fixed_width_column_wrapper{12, 0, 12}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_minute(timestamps_ns), - fixed_width_column_wrapper{59, 0, 0}); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_millisecond_fraction(timestamps_D), - fixed_width_column_wrapper{0, 0, 0}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_millisecond_fraction(timestamps_s), - fixed_width_column_wrapper{0, 0, 0}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_millisecond_fraction(timestamps_ms), - fixed_width_column_wrapper{762, 0, 929}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_millisecond_fraction(timestamps_ns), - fixed_width_column_wrapper{976, 23, 987}); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_microsecond_fraction(timestamps_D), - fixed_width_column_wrapper{0, 0, 0}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_microsecond_fraction(timestamps_s), - fixed_width_column_wrapper{0, 0, 0}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_microsecond_fraction(timestamps_ms), - fixed_width_column_wrapper{0, 0, 0}); - 
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_microsecond_fraction(timestamps_ns), - fixed_width_column_wrapper{675, 432, 234}); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_nanosecond_fraction(timestamps_D), - fixed_width_column_wrapper{0, 0, 0}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_nanosecond_fraction(timestamps_s), - fixed_width_column_wrapper{0, 0, 0}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_nanosecond_fraction(timestamps_ms), - fixed_width_column_wrapper{0, 0, 0}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_nanosecond_fraction(timestamps_ns), - fixed_width_column_wrapper{766, 424, 623}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_D, cudf::datetime::datetime_component::YEAR), + fixed_width_column_wrapper{1965, 2018, 2023}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_s, cudf::datetime::datetime_component::YEAR), + fixed_width_column_wrapper{1965, 2018, 2023}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ms, cudf::datetime::datetime_component::YEAR), + fixed_width_column_wrapper{1965, 2018, 2023}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ns, cudf::datetime::datetime_component::YEAR), + fixed_width_column_wrapper{1969, 1970, 1970}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_D, cudf::datetime::datetime_component::MONTH), + fixed_width_column_wrapper{10, 7, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_s, cudf::datetime::datetime_component::MONTH), + fixed_width_column_wrapper{10, 7, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ms, cudf::datetime::datetime_component::MONTH), + fixed_width_column_wrapper{10, 7, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ns, cudf::datetime::datetime_component::MONTH), + fixed_width_column_wrapper{12, 1, 1}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_D, cudf::datetime::datetime_component::DAY), + fixed_width_column_wrapper{26, 4, 25}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_s, cudf::datetime::datetime_component::DAY), + fixed_width_column_wrapper{26, 4, 25}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ms, cudf::datetime::datetime_component::DAY), + fixed_width_column_wrapper{26, 4, 25}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ns, cudf::datetime::datetime_component::DAY), + fixed_width_column_wrapper{31, 1, 1}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_D, cudf::datetime::datetime_component::WEEKDAY), + fixed_width_column_wrapper{2, 3, 3}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_s, cudf::datetime::datetime_component::WEEKDAY), + fixed_width_column_wrapper{2, 3, 3}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ms, cudf::datetime::datetime_component::WEEKDAY), + fixed_width_column_wrapper{2, 3, 3}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ms, cudf::datetime::datetime_component::WEEKDAY), + fixed_width_column_wrapper{2, 3, 3}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_D, cudf::datetime::datetime_component::HOUR), + fixed_width_column_wrapper{0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_s, cudf::datetime::datetime_component::HOUR), + 
fixed_width_column_wrapper{14, 12, 7}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ms, cudf::datetime::datetime_component::HOUR), + fixed_width_column_wrapper{14, 12, 7}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ns, cudf::datetime::datetime_component::HOUR), + fixed_width_column_wrapper{23, 0, 0}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_D, cudf::datetime::datetime_component::MINUTE), + fixed_width_column_wrapper{0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_s, cudf::datetime::datetime_component::MINUTE), + fixed_width_column_wrapper{1, 0, 32}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ms, cudf::datetime::datetime_component::MINUTE), + fixed_width_column_wrapper{1, 0, 32}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ns, cudf::datetime::datetime_component::MINUTE), + fixed_width_column_wrapper{59, 0, 0}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_D, cudf::datetime::datetime_component::SECOND), + fixed_width_column_wrapper{0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_s, cudf::datetime::datetime_component::SECOND), + fixed_width_column_wrapper{12, 0, 12}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ms, cudf::datetime::datetime_component::SECOND), + fixed_width_column_wrapper{12, 0, 12}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ns, cudf::datetime::datetime_component::SECOND), + fixed_width_column_wrapper{59, 0, 0}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_D, cudf::datetime::datetime_component::MILLISECOND), + fixed_width_column_wrapper{0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_s, cudf::datetime::datetime_component::MILLISECOND), + fixed_width_column_wrapper{0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ms, cudf::datetime::datetime_component::MILLISECOND), + fixed_width_column_wrapper{762, 0, 929}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ns, cudf::datetime::datetime_component::MILLISECOND), + fixed_width_column_wrapper{976, 23, 987}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_D, cudf::datetime::datetime_component::MICROSECOND), + fixed_width_column_wrapper{0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_s, cudf::datetime::datetime_component::MICROSECOND), + fixed_width_column_wrapper{0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ms, cudf::datetime::datetime_component::MICROSECOND), + fixed_width_column_wrapper{0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ns, cudf::datetime::datetime_component::MICROSECOND), + fixed_width_column_wrapper{675, 432, 234}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_D, cudf::datetime::datetime_component::NANOSECOND), + fixed_width_column_wrapper{0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_s, cudf::datetime::datetime_component::NANOSECOND), + fixed_width_column_wrapper{0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ms, cudf::datetime::datetime_component::NANOSECOND), + fixed_width_column_wrapper{0, 0, 
0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ns, cudf::datetime::datetime_component::NANOSECOND), + fixed_width_column_wrapper{766, 424, 623}); } template @@ -219,16 +266,29 @@ TYPED_TEST(TypedDatetimeOpsTest, TestEmptyColumns) cudf::column int16s{int16s_dtype, 0, rmm::device_buffer{}, rmm::device_buffer{}, 0}; cudf::column timestamps{timestamps_dtype, 0, rmm::device_buffer{}, rmm::device_buffer{}, 0}; - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_year(timestamps), int16s); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_month(timestamps), int16s); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_day(timestamps), int16s); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_weekday(timestamps), int16s); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_hour(timestamps), int16s); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_minute(timestamps), int16s); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_second(timestamps), int16s); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_millisecond_fraction(timestamps), int16s); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_microsecond_fraction(timestamps), int16s); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_nanosecond_fraction(timestamps), int16s); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps, cudf::datetime::datetime_component::YEAR), int16s); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps, cudf::datetime::datetime_component::MONTH), int16s); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps, cudf::datetime::datetime_component::DAY), int16s); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps, cudf::datetime::datetime_component::WEEKDAY), int16s); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps, cudf::datetime::datetime_component::HOUR), int16s); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps, cudf::datetime::datetime_component::MINUTE), int16s); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps, cudf::datetime::datetime_component::SECOND), int16s); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps, cudf::datetime::datetime_component::MILLISECOND), + int16s); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps, cudf::datetime::datetime_component::MICROSECOND), + int16s); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps, cudf::datetime::datetime_component::NANOSECOND), + int16s); } TYPED_TEST(TypedDatetimeOpsTest, TestExtractingGeneratedDatetimeComponents) @@ -258,13 +318,27 @@ TYPED_TEST(TypedDatetimeOpsTest, TestExtractingGeneratedDatetimeComponents) expected_seconds = fixed_width_column_wrapper{0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; } - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_year(timestamps), expected_years); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_month(timestamps), expected_months); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_day(timestamps), expected_days); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_weekday(timestamps), expected_weekdays); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_hour(timestamps), expected_hours); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_minute(timestamps), expected_minutes); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_second(timestamps), expected_seconds); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps, cudf::datetime::datetime_component::YEAR), + expected_years); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps, 
cudf::datetime::datetime_component::MONTH), + expected_months); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps, cudf::datetime::datetime_component::DAY), + expected_days); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps, cudf::datetime::datetime_component::WEEKDAY), + expected_weekdays); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps, cudf::datetime::datetime_component::HOUR), + expected_hours); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps, cudf::datetime::datetime_component::MINUTE), + expected_minutes); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps, cudf::datetime::datetime_component::SECOND), + expected_seconds); } TYPED_TEST(TypedDatetimeOpsTest, TestExtractingGeneratedNullableDatetimeComponents) @@ -314,13 +388,27 @@ TYPED_TEST(TypedDatetimeOpsTest, TestExtractingGeneratedNullableDatetimeComponen {true, false, true, false, true, false, true, false, true, false}}; } - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_year(timestamps), expected_years); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_month(timestamps), expected_months); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_day(timestamps), expected_days); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_weekday(timestamps), expected_weekdays); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_hour(timestamps), expected_hours); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_minute(timestamps), expected_minutes); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_second(timestamps), expected_seconds); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps, cudf::datetime::datetime_component::YEAR), + expected_years); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps, cudf::datetime::datetime_component::MONTH), + expected_months); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps, cudf::datetime::datetime_component::DAY), + expected_days); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps, cudf::datetime::datetime_component::WEEKDAY), + expected_weekdays); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps, cudf::datetime::datetime_component::HOUR), + expected_hours); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps, cudf::datetime::datetime_component::MINUTE), + expected_minutes); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps, cudf::datetime::datetime_component::SECOND), + expected_seconds); } TEST_F(BasicDatetimeOpsTest, TestLastDayOfMonthWithSeconds) diff --git a/cpp/tests/dictionary/add_keys_test.cpp b/cpp/tests/dictionary/add_keys_test.cpp index 46bf5468922..ebc8c11e86c 100644 --- a/cpp/tests/dictionary/add_keys_test.cpp +++ b/cpp/tests/dictionary/add_keys_test.cpp @@ -24,8 +24,6 @@ #include #include -#include - struct DictionaryAddKeysTest : public cudf::test::BaseFixture {}; TEST_F(DictionaryAddKeysTest, StringsColumn) diff --git a/cpp/tests/dictionary/encode_test.cpp b/cpp/tests/dictionary/encode_test.cpp index 5db0e9fa1e4..dfa3ede5d46 100644 --- a/cpp/tests/dictionary/encode_test.cpp +++ b/cpp/tests/dictionary/encode_test.cpp @@ -21,8 +21,6 @@ #include #include -#include - struct DictionaryEncodeTest : public cudf::test::BaseFixture {}; TEST_F(DictionaryEncodeTest, EncodeStringColumn) diff --git a/cpp/tests/dictionary/fill_test.cpp b/cpp/tests/dictionary/fill_test.cpp index 18696b66e48..bc7d19201aa 100644 --- a/cpp/tests/dictionary/fill_test.cpp +++ 
b/cpp/tests/dictionary/fill_test.cpp @@ -18,13 +18,10 @@ #include #include -#include #include #include #include -#include - struct DictionaryFillTest : public cudf::test::BaseFixture {}; TEST_F(DictionaryFillTest, StringsColumn) diff --git a/cpp/tests/dictionary/search_test.cpp b/cpp/tests/dictionary/search_test.cpp index 25501b4fde7..2774173b80a 100644 --- a/cpp/tests/dictionary/search_test.cpp +++ b/cpp/tests/dictionary/search_test.cpp @@ -15,7 +15,6 @@ */ #include -#include #include #include diff --git a/cpp/tests/dictionary/slice_test.cpp b/cpp/tests/dictionary/slice_test.cpp index d80f8dee079..8c15d6dbecd 100644 --- a/cpp/tests/dictionary/slice_test.cpp +++ b/cpp/tests/dictionary/slice_test.cpp @@ -19,7 +19,6 @@ #include #include -#include #include #include #include diff --git a/cpp/tests/filling/fill_tests.cpp b/cpp/tests/filling/fill_tests.cpp index 26badefe698..a5e2db6a005 100644 --- a/cpp/tests/filling/fill_tests.cpp +++ b/cpp/tests/filling/fill_tests.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include diff --git a/cpp/tests/filling/repeat_tests.cpp b/cpp/tests/filling/repeat_tests.cpp index 6326765c68b..c856984a4a3 100644 --- a/cpp/tests/filling/repeat_tests.cpp +++ b/cpp/tests/filling/repeat_tests.cpp @@ -17,14 +17,11 @@ #include #include #include -#include #include #include #include #include -#include -#include #include #include @@ -33,7 +30,6 @@ #include #include -#include constexpr cudf::test::debug_output_level verbosity{cudf::test::debug_output_level::ALL_ERRORS}; diff --git a/cpp/tests/filling/sequence_tests.cpp b/cpp/tests/filling/sequence_tests.cpp index 5651a26f192..53782c90c26 100644 --- a/cpp/tests/filling/sequence_tests.cpp +++ b/cpp/tests/filling/sequence_tests.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include @@ -41,8 +40,7 @@ TYPED_TEST(SequenceTypedTestFixture, Incrementing) cudf::size_type num_els = 10; - T expected[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; - cudf::test::fixed_width_column_wrapper expected_w(expected, expected + num_els); + cudf::test::fixed_width_column_wrapper expected_w({0, 1, 2, 3, 4, 5, 6, 7, 8, 9}); auto result = cudf::sequence(num_els, init, step); @@ -58,8 +56,8 @@ TYPED_TEST(SequenceTypedTestFixture, Decrementing) cudf::size_type num_els = 10; - T expected[] = {0, -5, -10, -15, -20, -25, -30, -35, -40, -45}; - cudf::test::fixed_width_column_wrapper expected_w(expected, expected + num_els); + cudf::test::fixed_width_column_wrapper expected_w( + {0, -5, -10, -15, -20, -25, -30, -35, -40, -45}); auto result = cudf::sequence(num_els, init, step); @@ -75,8 +73,7 @@ TYPED_TEST(SequenceTypedTestFixture, EmptyOutput) cudf::size_type num_els = 0; - T expected[] = {}; - cudf::test::fixed_width_column_wrapper expected_w(expected, expected + num_els); + cudf::test::fixed_width_column_wrapper expected_w({}); auto result = cudf::sequence(num_els, init, step); @@ -121,8 +118,7 @@ TYPED_TEST(SequenceTypedTestFixture, DefaultStep) cudf::size_type num_els = 10; - T expected[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; - cudf::test::fixed_width_column_wrapper expected_w(expected, expected + num_els); + cudf::test::fixed_width_column_wrapper expected_w({0, 1, 2, 3, 4, 5, 6, 7, 8, 9}); auto result = cudf::sequence(num_els, init); diff --git a/cpp/tests/fixed_point/fixed_point_tests.cpp b/cpp/tests/fixed_point/fixed_point_tests.cpp index a222289216d..b96c6909e55 100644 --- a/cpp/tests/fixed_point/fixed_point_tests.cpp +++ b/cpp/tests/fixed_point/fixed_point_tests.cpp @@ -18,17 +18,14 @@ #include #include #include 
-#include #include #include -#include #include #include #include #include -#include #include using namespace numeric; diff --git a/cpp/tests/groupby/collect_list_tests.cpp b/cpp/tests/groupby/collect_list_tests.cpp index 749f4013013..ba456084a7c 100644 --- a/cpp/tests/groupby/collect_list_tests.cpp +++ b/cpp/tests/groupby/collect_list_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,8 +20,6 @@ #include #include -#include - template struct groupby_collect_list_test : public cudf::test::BaseFixture {}; @@ -127,8 +125,9 @@ TYPED_TEST(groupby_collect_list_test, CollectListsWithNullExclusion) using LCW = cudf::test::lists_column_wrapper; cudf::test::fixed_width_column_wrapper keys{1, 1, 2, 2, 3, 3, 4, 4}; - bool const validity_mask[] = {true, false, false, true, true, true, false, false}; - LCW values{{{1, 2}, {3, 4}, {5, 6, 7}, LCW{}, {9, 10}, {11}, {20, 30, 40}, LCW{}}, validity_mask}; + std::array const validity_mask{true, false, false, true, true, true, false, false}; + LCW values{{{1, 2}, {3, 4}, {5, 6, 7}, LCW{}, {9, 10}, {11}, {20, 30, 40}, LCW{}}, + validity_mask.data()}; cudf::test::fixed_width_column_wrapper expect_keys{1, 2, 3, 4}; diff --git a/cpp/tests/groupby/collect_set_tests.cpp b/cpp/tests/groupby/collect_set_tests.cpp index 61d2838590b..dfd7eb82c4a 100644 --- a/cpp/tests/groupby/collect_set_tests.cpp +++ b/cpp/tests/groupby/collect_set_tests.cpp @@ -19,7 +19,6 @@ #include #include -#include #include #include #include diff --git a/cpp/tests/groupby/correlation_tests.cpp b/cpp/tests/groupby/correlation_tests.cpp index 26f714632dd..f8cc813e877 100644 --- a/cpp/tests/groupby/correlation_tests.cpp +++ b/cpp/tests/groupby/correlation_tests.cpp @@ -25,7 +25,6 @@ #include #include -#include using namespace cudf::test::iterators; diff --git a/cpp/tests/groupby/covariance_tests.cpp b/cpp/tests/groupby/covariance_tests.cpp index e3eb2da201f..81378bb91e8 100644 --- a/cpp/tests/groupby/covariance_tests.cpp +++ b/cpp/tests/groupby/covariance_tests.cpp @@ -23,10 +23,8 @@ #include #include -#include #include -#include using namespace cudf::test::iterators; diff --git a/cpp/tests/groupby/groupby_test_util.cpp b/cpp/tests/groupby/groupby_test_util.cpp index 5d99d15ae77..df0375d6a09 100644 --- a/cpp/tests/groupby/groupby_test_util.cpp +++ b/cpp/tests/groupby/groupby_test_util.cpp @@ -17,8 +17,8 @@ #include "groupby_test_util.hpp" #include -#include #include +#include #include #include @@ -27,9 +27,6 @@ #include #include #include -#include - -#include void test_single_agg(cudf::column_view const& keys, cudf::column_view const& values, diff --git a/cpp/tests/groupby/groupby_test_util.hpp b/cpp/tests/groupby/groupby_test_util.hpp index 755b0c20f17..9d2e613be3e 100644 --- a/cpp/tests/groupby/groupby_test_util.hpp +++ b/cpp/tests/groupby/groupby_test_util.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -16,11 +16,8 @@ #pragma once -#include #include -#include #include -#include enum class force_use_sort_impl : bool { NO, YES }; diff --git a/cpp/tests/groupby/histogram_tests.cpp b/cpp/tests/groupby/histogram_tests.cpp index 2d447025919..783cfb17e49 100644 --- a/cpp/tests/groupby/histogram_tests.cpp +++ b/cpp/tests/groupby/histogram_tests.cpp @@ -20,7 +20,6 @@ #include #include -#include #include #include #include diff --git a/cpp/tests/groupby/max_scan_tests.cpp b/cpp/tests/groupby/max_scan_tests.cpp index d86de798844..6195e0179ec 100644 --- a/cpp/tests/groupby/max_scan_tests.cpp +++ b/cpp/tests/groupby/max_scan_tests.cpp @@ -22,7 +22,6 @@ #include #include -#include using namespace cudf::test::iterators; diff --git a/cpp/tests/groupby/mean_tests.cpp b/cpp/tests/groupby/mean_tests.cpp index 0cb5ee30a8b..e9c72293649 100644 --- a/cpp/tests/groupby/mean_tests.cpp +++ b/cpp/tests/groupby/mean_tests.cpp @@ -49,7 +49,7 @@ TYPED_TEST(groupby_mean_test, basic) { using V = TypeParam; using R = cudf::detail::target_type_t; - using RT = typename std::conditional(), int, double>::type; + using RT = std::conditional_t(), int, double>; cudf::test::fixed_width_column_wrapper keys{1, 2, 3, 1, 2, 2, 1, 3, 3, 2}; cudf::test::fixed_width_column_wrapper vals{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; @@ -114,7 +114,7 @@ TYPED_TEST(groupby_mean_test, null_keys_and_values) { using V = TypeParam; using R = cudf::detail::target_type_t; - using RT = typename std::conditional(), int, double>::type; + using RT = std::conditional_t(), int, double>; cudf::test::fixed_width_column_wrapper keys( {1, 2, 3, 1, 2, 2, 1, 3, 3, 2, 4}, diff --git a/cpp/tests/groupby/merge_lists_tests.cpp b/cpp/tests/groupby/merge_lists_tests.cpp index 279d71560b4..4481e2dc022 100644 --- a/cpp/tests/groupby/merge_lists_tests.cpp +++ b/cpp/tests/groupby/merge_lists_tests.cpp @@ -21,7 +21,6 @@ #include #include -#include #include #include diff --git a/cpp/tests/groupby/merge_sets_tests.cpp b/cpp/tests/groupby/merge_sets_tests.cpp index 9736bb84dd6..1bfba265478 100644 --- a/cpp/tests/groupby/merge_sets_tests.cpp +++ b/cpp/tests/groupby/merge_sets_tests.cpp @@ -21,7 +21,6 @@ #include #include -#include #include #include #include diff --git a/cpp/tests/groupby/rank_scan_tests.cpp b/cpp/tests/groupby/rank_scan_tests.cpp index 7f31bc9089f..f2a50248b4a 100644 --- a/cpp/tests/groupby/rank_scan_tests.cpp +++ b/cpp/tests/groupby/rank_scan_tests.cpp @@ -22,8 +22,6 @@ #include #include -#include - using namespace cudf::test::iterators; template diff --git a/cpp/tests/groupby/shift_tests.cpp b/cpp/tests/groupby/shift_tests.cpp index 14c9ceb4508..49f9d7cb10a 100644 --- a/cpp/tests/groupby/shift_tests.cpp +++ b/cpp/tests/groupby/shift_tests.cpp @@ -21,7 +21,6 @@ #include #include -#include #include template diff --git a/cpp/tests/groupby/tdigest_tests.cu b/cpp/tests/groupby/tdigest_tests.cu index baa59026b07..4ae5d06b214 100644 --- a/cpp/tests/groupby/tdigest_tests.cu +++ b/cpp/tests/groupby/tdigest_tests.cu @@ -469,16 +469,16 @@ TEST_F(TDigestMergeTest, EmptyGroups) cudf::test::fixed_width_column_wrapper keys{0, 0, 0, 0, 0, 0, 0}; int const delta = 1000; - auto a = cudf::tdigest::detail::make_empty_tdigest_column( - cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + auto a = cudf::tdigest::detail::make_empty_tdigests_column( + 1, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); auto b = cudf::type_dispatcher( static_cast(values_b).type(), tdigest_gen_grouped{}, keys, values_b, delta); - auto c = 
cudf::tdigest::detail::make_empty_tdigest_column( - cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + auto c = cudf::tdigest::detail::make_empty_tdigests_column( + 1, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); auto d = cudf::type_dispatcher( static_cast(values_d).type(), tdigest_gen_grouped{}, keys, values_d, delta); - auto e = cudf::tdigest::detail::make_empty_tdigest_column( - cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + auto e = cudf::tdigest::detail::make_empty_tdigests_column( + 1, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); std::vector cols; cols.push_back(*a); @@ -507,3 +507,126 @@ TEST_F(TDigestMergeTest, EmptyGroups) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected, *result.second[0].results[0]); } + +std::unique_ptr do_agg( + cudf::column_view key, + cudf::column_view val, + std::function()> make_agg) +{ + std::vector keys; + keys.push_back(key); + cudf::table_view const key_table(keys); + + cudf::groupby::groupby gb(key_table); + std::vector requests; + cudf::groupby::aggregation_request req; + req.values = val; + req.aggregations.push_back(make_agg()); + requests.push_back(std::move(req)); + + auto result = gb.aggregate(std::move(requests)); + + std::vector> result_columns; + for (auto&& c : result.first->release()) { + result_columns.push_back(std::move(c)); + } + + EXPECT_EQ(result.second.size(), 1); + EXPECT_EQ(result.second[0].results.size(), 1); + result_columns.push_back(std::move(result.second[0].results[0])); + + return std::make_unique(std::move(result_columns)); +} + +TEST_F(TDigestMergeTest, AllValuesAreNull) +{ + // The input must be sorted by the key. + // See `aggregate_result_functor::operator()` for details. + auto const keys = cudf::test::fixed_width_column_wrapper{{0, 0, 1, 1, 2}}; + auto const keys_view = cudf::column_view(keys); + auto val_elems = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i; }); + auto val_valids = cudf::detail::make_counting_transform_iterator(0, [](auto i) { + // All values are null + return false; + }); + auto const vals = cudf::test::fixed_width_column_wrapper{ + val_elems, val_elems + keys_view.size(), val_valids}; + + auto const delta = 1000; + + // Compute tdigest. The result should have 3 empty clusters, one per group. + auto const compute_result = do_agg(keys_view, cudf::column_view(vals), [&delta]() { + return cudf::make_tdigest_aggregation(delta); + }); + + auto const expected_computed_keys = cudf::test::fixed_width_column_wrapper{{0, 1, 2}}; + cudf::column_view const expected_computed_keys_view{expected_computed_keys}; + auto const expected_computed_vals = + cudf::tdigest::detail::make_empty_tdigests_column(expected_computed_keys_view.size(), + cudf::get_default_stream(), + rmm::mr::get_current_device_resource()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_computed_keys_view, compute_result->get_column(0).view()); + // The computed values are nullable even though the input values are not. + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_computed_vals->view(), + compute_result->get_column(1).view()); + + // Merge tdigest. The result should have 3 empty clusters, one per group. 
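Condensed, the compute-then-merge flow these new tests drive through the do_agg helper above looks like the following sketch (keys_view, vals_view, and the delta of 1000 are illustrative placeholders, not part of this diff):

int const delta = 1000;
// Step 1: reduce raw values to one tdigest per group.
auto computed = do_agg(keys_view, vals_view, [delta] {
  return cudf::make_tdigest_aggregation<cudf::groupby_aggregation>(delta);
});
// Step 2: re-reduce the tdigests themselves; empty clusters must survive this step too.
auto merged = do_agg(computed->get_column(0).view(), computed->get_column(1).view(), [delta] {
  return cudf::make_merge_tdigest_aggregation<cudf::groupby_aggregation>(delta);
});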
+ auto const merge_result = + do_agg(compute_result->get_column(0).view(), compute_result->get_column(1).view(), [&delta]() { + return cudf::make_merge_tdigest_aggregation(delta); + }); + + auto const expected_merged_keys = cudf::test::fixed_width_column_wrapper{{0, 1, 2}}; + cudf::column_view const expected_merged_keys_view{expected_merged_keys}; + auto const expected_merged_vals = + cudf::tdigest::detail::make_empty_tdigests_column(expected_merged_keys_view.size(), + cudf::get_default_stream(), + rmm::mr::get_current_device_resource()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_merged_keys_view, merge_result->get_column(0).view()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_merged_vals->view(), merge_result->get_column(1).view()); +} + +TEST_F(TDigestMergeTest, AllValuesInOneGroupIsNull) +{ + cudf::test::fixed_width_column_wrapper keys{0, 1, 2, 2, 3}; + cudf::test::fixed_width_column_wrapper vals{{10.0, 20.0, {}, {}, 30.0}, + {true, true, false, false, true}}; + + auto const delta = 1000; + + // Compute tdigest. The result should have 4 tdigests, with one empty cluster for the all-null group. + auto const compute_result = do_agg(cudf::column_view(keys), cudf::column_view(vals), [&delta]() { + return cudf::make_tdigest_aggregation(delta); + }); + + auto const expected_keys = cudf::test::fixed_width_column_wrapper{{0, 1, 2, 3}}; + + cudf::test::fixed_width_column_wrapper expected_means{10, 20, 30}; + cudf::test::fixed_width_column_wrapper expected_weights{1, 1, 1}; + cudf::test::fixed_width_column_wrapper expected_offsets{0, 1, 2, 2, 3}; + cudf::test::fixed_width_column_wrapper expected_mins{10.0, 20.0, 0.0, 30.0}; + cudf::test::fixed_width_column_wrapper expected_maxes{10.0, 20.0, 0.0, 30.0}; + auto const expected_values = + cudf::tdigest::detail::make_tdigest_column(4, + std::make_unique(expected_means), + std::make_unique(expected_weights), + std::make_unique(expected_offsets), + std::make_unique(expected_mins), + std::make_unique(expected_maxes), + cudf::get_default_stream(), + rmm::mr::get_current_device_resource()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(cudf::column_view{expected_keys}, + compute_result->get_column(0).view()); + // The computed values are nullable even though the input values are not. + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_values->view(), compute_result->get_column(1).view()); + + // Merge tdigest. The merged result should match the computed tdigests, with the empty cluster preserved. + auto const merge_result = + do_agg(compute_result->get_column(0).view(), compute_result->get_column(1).view(), [&delta]() { + return cudf::make_merge_tdigest_aggregation(delta); + }); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(cudf::column_view{expected_keys}, + merge_result->get_column(0).view()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_values->view(), merge_result->get_column(1).view()); +} diff --git a/cpp/tests/hashing/md5_test.cpp b/cpp/tests/hashing/md5_test.cpp index 69e518cbf8d..b54adb52496 100644 --- a/cpp/tests/hashing/md5_test.cpp +++ b/cpp/tests/hashing/md5_test.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include diff --git a/cpp/tests/hashing/murmurhash3_x64_128_test.cpp b/cpp/tests/hashing/murmurhash3_x64_128_test.cpp index 4fb8f78b558..0e68050f935 100644 --- a/cpp/tests/hashing/murmurhash3_x64_128_test.cpp +++ b/cpp/tests/hashing/murmurhash3_x64_128_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License.
@@ -22,8 +22,6 @@ #include -constexpr cudf::test::debug_output_level verbosity{cudf::test::debug_output_level::ALL_ERRORS}; - using NumericTypesNoBools = cudf::test::Concat; diff --git a/cpp/tests/hashing/murmurhash3_x86_32_test.cpp b/cpp/tests/hashing/murmurhash3_x86_32_test.cpp index c1a6e6ff6e1..b4622f5eb81 100644 --- a/cpp/tests/hashing/murmurhash3_x86_32_test.cpp +++ b/cpp/tests/hashing/murmurhash3_x86_32_test.cpp @@ -17,11 +17,9 @@ #include #include #include -#include #include #include -#include #include constexpr cudf::test::debug_output_level verbosity{cudf::test::debug_output_level::ALL_ERRORS}; diff --git a/cpp/tests/hashing/sha1_test.cpp b/cpp/tests/hashing/sha1_test.cpp index e28e71442a6..3aa0bda6ae8 100644 --- a/cpp/tests/hashing/sha1_test.cpp +++ b/cpp/tests/hashing/sha1_test.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include @@ -137,7 +136,7 @@ TEST_F(SHA1HashTest, ListsUnsupported) auto const input = cudf::table_view({strings_list_col}); - EXPECT_THROW(cudf::hashing::sha1(input), cudf::logic_error); + EXPECT_THROW(cudf::hashing::sha1(input), cudf::data_type_error); } TEST_F(SHA1HashTest, StructsUnsupported) @@ -146,7 +145,7 @@ TEST_F(SHA1HashTest, StructsUnsupported) auto struct_col = cudf::test::structs_column_wrapper{{child_col}}; auto const input = cudf::table_view({struct_col}); - EXPECT_THROW(cudf::hashing::sha1(input), cudf::logic_error); + EXPECT_THROW(cudf::hashing::sha1(input), cudf::data_type_error); } template diff --git a/cpp/tests/hashing/sha224_test.cpp b/cpp/tests/hashing/sha224_test.cpp index 61b584f94df..3f6aeb9d5e6 100644 --- a/cpp/tests/hashing/sha224_test.cpp +++ b/cpp/tests/hashing/sha224_test.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include @@ -137,7 +136,7 @@ TEST_F(SHA224HashTest, ListsUnsupported) auto const input = cudf::table_view({strings_list_col}); - EXPECT_THROW(cudf::hashing::sha224(input), cudf::logic_error); + EXPECT_THROW(cudf::hashing::sha224(input), cudf::data_type_error); } TEST_F(SHA224HashTest, StructsUnsupported) @@ -146,7 +145,7 @@ TEST_F(SHA224HashTest, StructsUnsupported) auto struct_col = cudf::test::structs_column_wrapper{{child_col}}; auto const input = cudf::table_view({struct_col}); - EXPECT_THROW(cudf::hashing::sha224(input), cudf::logic_error); + EXPECT_THROW(cudf::hashing::sha224(input), cudf::data_type_error); } template diff --git a/cpp/tests/hashing/sha256_test.cpp b/cpp/tests/hashing/sha256_test.cpp index cc95c7a2f0f..9519e96fbae 100644 --- a/cpp/tests/hashing/sha256_test.cpp +++ b/cpp/tests/hashing/sha256_test.cpp @@ -17,14 +17,11 @@ #include #include #include -#include #include #include #include -constexpr cudf::test::debug_output_level verbosity{cudf::test::debug_output_level::ALL_ERRORS}; - class SHA256HashTest : public cudf::test::BaseFixture {}; TEST_F(SHA256HashTest, EmptyTable) @@ -138,7 +135,7 @@ TEST_F(SHA256HashTest, ListsUnsupported) auto const input = cudf::table_view({strings_list_col}); - EXPECT_THROW(cudf::hashing::sha256(input), cudf::logic_error); + EXPECT_THROW(cudf::hashing::sha256(input), cudf::data_type_error); } TEST_F(SHA256HashTest, StructsUnsupported) @@ -147,7 +144,7 @@ TEST_F(SHA256HashTest, StructsUnsupported) auto struct_col = cudf::test::structs_column_wrapper{{child_col}}; auto const input = cudf::table_view({struct_col}); - EXPECT_THROW(cudf::hashing::sha256(input), cudf::logic_error); + EXPECT_THROW(cudf::hashing::sha256(input), cudf::data_type_error); } template diff --git a/cpp/tests/hashing/sha384_test.cpp 
b/cpp/tests/hashing/sha384_test.cpp index 4c79934f98d..9de566b9d9b 100644 --- a/cpp/tests/hashing/sha384_test.cpp +++ b/cpp/tests/hashing/sha384_test.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include @@ -155,7 +154,7 @@ TEST_F(SHA384HashTest, ListsUnsupported) auto const input = cudf::table_view({strings_list_col}); - EXPECT_THROW(cudf::hashing::sha384(input), cudf::logic_error); + EXPECT_THROW(cudf::hashing::sha384(input), cudf::data_type_error); } TEST_F(SHA384HashTest, StructsUnsupported) @@ -164,7 +163,7 @@ TEST_F(SHA384HashTest, StructsUnsupported) auto struct_col = cudf::test::structs_column_wrapper{{child_col}}; auto const input = cudf::table_view({struct_col}); - EXPECT_THROW(cudf::hashing::sha384(input), cudf::logic_error); + EXPECT_THROW(cudf::hashing::sha384(input), cudf::data_type_error); } template diff --git a/cpp/tests/hashing/sha512_test.cpp b/cpp/tests/hashing/sha512_test.cpp index 0eb1c60b8fc..95e5245f38e 100644 --- a/cpp/tests/hashing/sha512_test.cpp +++ b/cpp/tests/hashing/sha512_test.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include @@ -155,7 +154,7 @@ TEST_F(SHA512HashTest, ListsUnsupported) auto const input = cudf::table_view({strings_list_col}); - EXPECT_THROW(cudf::hashing::sha512(input), cudf::logic_error); + EXPECT_THROW(cudf::hashing::sha512(input), cudf::data_type_error); } TEST_F(SHA512HashTest, StructsUnsupported) @@ -164,7 +163,7 @@ TEST_F(SHA512HashTest, StructsUnsupported) auto struct_col = cudf::test::structs_column_wrapper{{child_col}}; auto const input = cudf::table_view({struct_col}); - EXPECT_THROW(cudf::hashing::sha512(input), cudf::logic_error); + EXPECT_THROW(cudf::hashing::sha512(input), cudf::data_type_error); } template diff --git a/cpp/tests/hashing/xxhash_64_test.cpp b/cpp/tests/hashing/xxhash_64_test.cpp index ab4ed829681..d8694a72d94 100644 --- a/cpp/tests/hashing/xxhash_64_test.cpp +++ b/cpp/tests/hashing/xxhash_64_test.cpp @@ -17,11 +17,8 @@ #include #include #include -#include #include -#include -#include #include using NumericTypesNoBools = diff --git a/cpp/tests/interop/dlpack_test.cpp b/cpp/tests/interop/dlpack_test.cpp index 330f07ac8e2..ef4b9dd9b8a 100644 --- a/cpp/tests/interop/dlpack_test.cpp +++ b/cpp/tests/interop/dlpack_test.cpp @@ -225,8 +225,8 @@ TEST_F(DLPackUntypedTests, UnsupportedBroadcast1DTensorFromDlpack) constexpr int ndim = 1; // Broadcasted (stride-0) 1D tensor auto const data = cudf::test::make_type_param_vector({1}); - int64_t shape[ndim] = {5}; - int64_t strides[ndim] = {0}; + int64_t shape[ndim] = {5}; // NOLINT + int64_t strides[ndim] = {0}; // NOLINT DLManagedTensor tensor{}; tensor.dl_tensor.device.device_type = kDLCPU; @@ -248,8 +248,8 @@ TEST_F(DLPackUntypedTests, UnsupportedStrided1DTensorFromDlpack) constexpr int ndim = 1; // Strided 1D tensor auto const data = cudf::test::make_type_param_vector({1, 2, 3, 4}); - int64_t shape[ndim] = {2}; - int64_t strides[ndim] = {2}; + int64_t shape[ndim] = {2}; // NOLINT + int64_t strides[ndim] = {2}; // NOLINT DLManagedTensor tensor{}; tensor.dl_tensor.device.device_type = kDLCPU; @@ -271,7 +271,7 @@ TEST_F(DLPackUntypedTests, UnsupportedImplicitRowMajor2DTensorFromDlpack) constexpr int ndim = 2; // Row major 2D tensor auto const data = cudf::test::make_type_param_vector({1, 2, 3, 4}); - int64_t shape[ndim] = {2, 2}; + int64_t shape[ndim] = {2, 2}; // NOLINT DLManagedTensor tensor{}; tensor.dl_tensor.device.device_type = kDLCPU; @@ -293,8 +293,8 @@ TEST_F(DLPackUntypedTests, 
UnsupportedExplicitRowMajor2DTensorFromDlpack) constexpr int ndim = 2; // Row major 2D tensor with explicit strides auto const data = cudf::test::make_type_param_vector({1, 2, 3, 4}); - int64_t shape[ndim] = {2, 2}; - int64_t strides[ndim] = {2, 1}; + int64_t shape[ndim] = {2, 2}; // NOLINT + int64_t strides[ndim] = {2, 1}; // NOLINT DLManagedTensor tensor{}; tensor.dl_tensor.device.device_type = kDLCPU; @@ -316,8 +316,8 @@ TEST_F(DLPackUntypedTests, UnsupportedStridedColMajor2DTensorFromDlpack) constexpr int ndim = 2; // Column major, but strided in fastest dimension auto const data = cudf::test::make_type_param_vector({1, 2, 3, 4, 5, 6, 7, 8}); - int64_t shape[ndim] = {2, 2}; - int64_t strides[ndim] = {2, 4}; + int64_t shape[ndim] = {2, 2}; // NOLINT + int64_t strides[ndim] = {2, 4}; // NOLINT DLManagedTensor tensor{}; tensor.dl_tensor.device.device_type = kDLCPU; @@ -465,8 +465,8 @@ TYPED_TEST(DLPackNumericTests, FromDlpackCpu) using T = TypeParam; auto const data = cudf::test::make_type_param_vector({0, 1, 2, 3, 4, 0, 5, 6, 7, 8, 0}); uint64_t const offset{sizeof(T)}; - int64_t shape[2] = {4, 2}; - int64_t strides[2] = {1, 5}; + int64_t shape[2] = {4, 2}; // NOLINT + int64_t strides[2] = {1, 5}; // NOLINT DLManagedTensor tensor{}; tensor.dl_tensor.device.device_type = kDLCPU; diff --git a/cpp/tests/interop/from_arrow_device_test.cpp b/cpp/tests/interop/from_arrow_device_test.cpp index a4dc7531765..1ddc33e749a 100644 --- a/cpp/tests/interop/from_arrow_device_test.cpp +++ b/cpp/tests/interop/from_arrow_device_test.cpp @@ -17,17 +17,13 @@ #include "nanoarrow_utils.hpp" #include -#include #include #include -#include #include #include #include #include -#include -#include #include #include #include @@ -270,9 +266,9 @@ TEST_F(FromArrowDeviceTest, StructColumn) auto int_col2 = cudf::test::fixed_width_column_wrapper{{12, 24, 47}, {1, 0, 1}}.release(); auto bool_col = cudf::test::fixed_width_column_wrapper{{true, true, false}}.release(); - auto list_col = - cudf::test::lists_column_wrapper({{{1, 2}, {3, 4}, {5}}, {{{6}}}, {{7}, {8, 9}}}) - .release(); + auto list_col = cudf::test::lists_column_wrapper( + {{{1, 2}, {3, 4}, {5}}, {{{6}}}, {{7}, {8, 9}}}) // NOLINT + .release(); vector_of_columns cols2; cols2.push_back(std::move(str_col2)); cols2.push_back(std::move(int_col2)); @@ -414,9 +410,9 @@ TEST_F(FromArrowDeviceTest, DictionaryIndicesType) { std::vector> columns; auto col = cudf::test::fixed_width_column_wrapper({1, 2, 5, 2, 7}, {1, 0, 1, 1, 1}); - columns.emplace_back(std::move(cudf::dictionary::encode(col))); - columns.emplace_back(std::move(cudf::dictionary::encode(col))); - columns.emplace_back(std::move(cudf::dictionary::encode(col))); + columns.emplace_back(cudf::dictionary::encode(col)); + columns.emplace_back(cudf::dictionary::encode(col)); + columns.emplace_back(cudf::dictionary::encode(col)); cudf::table expected_table(std::move(columns)); cudf::table_view expected_table_view = expected_table.view(); diff --git a/cpp/tests/interop/from_arrow_host_test.cpp b/cpp/tests/interop/from_arrow_host_test.cpp index cbfa4911c3c..d93ef28aab8 100644 --- a/cpp/tests/interop/from_arrow_host_test.cpp +++ b/cpp/tests/interop/from_arrow_host_test.cpp @@ -20,7 +20,6 @@ #include #include #include -#include #include #include @@ -28,7 +27,6 @@ #include #include #include -#include #include #include #include @@ -309,9 +307,9 @@ TEST_F(FromArrowHostDeviceTest, StructColumn) auto int_col2 = cudf::test::fixed_width_column_wrapper{{12, 24, 47}, {1, 0, 1}}.release(); auto bool_col = 
cudf::test::fixed_width_column_wrapper{{true, true, false}}.release(); - auto list_col = - cudf::test::lists_column_wrapper({{{1, 2}, {3, 4}, {5}}, {{{6}}}, {{7}, {8, 9}}}) - .release(); + auto list_col = cudf::test::lists_column_wrapper( + {{{1, 2}, {3, 4}, {5}}, {{{6}}}, {{7}, {8, 9}}}) // NOLINT + .release(); vector_of_columns cols2; cols2.push_back(std::move(str_col2)); cols2.push_back(std::move(int_col2)); diff --git a/cpp/tests/interop/from_arrow_stream_test.cpp b/cpp/tests/interop/from_arrow_stream_test.cpp index 80a2e4b2ffd..3916025bf22 100644 --- a/cpp/tests/interop/from_arrow_stream_test.cpp +++ b/cpp/tests/interop/from_arrow_stream_test.cpp @@ -17,27 +17,14 @@ #include "nanoarrow_utils.hpp" #include -#include -#include #include -#include -#include -#include -#include #include -#include -#include -#include -#include #include #include #include -#include #include -#include - struct VectorOfArrays { std::vector arrays; nanoarrow::UniqueSchema schema; diff --git a/cpp/tests/interop/from_arrow_test.cpp b/cpp/tests/interop/from_arrow_test.cpp index 81c406c0faf..18efae75cb1 100644 --- a/cpp/tests/interop/from_arrow_test.cpp +++ b/cpp/tests/interop/from_arrow_test.cpp @@ -25,9 +25,7 @@ #include #include #include -#include #include -#include #include #include #include @@ -37,8 +35,6 @@ #include #include -#include -#include std::unique_ptr get_cudf_table() { @@ -52,7 +48,7 @@ std::unique_ptr get_cudf_table() .release()); auto col4 = cudf::test::fixed_width_column_wrapper({1, 2, 5, 2, 7}, {true, false, true, true, true}); - columns.emplace_back(std::move(cudf::dictionary::encode(col4))); + columns.emplace_back(cudf::dictionary::encode(col4)); columns.emplace_back(cudf::test::fixed_width_column_wrapper( {true, false, true, false, true}, {true, false, true, true, false}) .release()); @@ -339,9 +335,9 @@ TEST_F(FromArrowTest, DictionaryIndicesType) std::vector> columns; auto col = cudf::test::fixed_width_column_wrapper({1, 2, 5, 2, 7}, {true, false, true, true, true}); - columns.emplace_back(std::move(cudf::dictionary::encode(col))); - columns.emplace_back(std::move(cudf::dictionary::encode(col))); - columns.emplace_back(std::move(cudf::dictionary::encode(col))); + columns.emplace_back(cudf::dictionary::encode(col)); + columns.emplace_back(cudf::dictionary::encode(col)); + columns.emplace_back(cudf::dictionary::encode(col)); cudf::table expected_table(std::move(columns)); diff --git a/cpp/tests/interop/nanoarrow_utils.hpp b/cpp/tests/interop/nanoarrow_utils.hpp index a961f73d955..8be7e087b6d 100644 --- a/cpp/tests/interop/nanoarrow_utils.hpp +++ b/cpp/tests/interop/nanoarrow_utils.hpp @@ -256,7 +256,8 @@ std::enable_if_t, nanoarrow::UniqueArray> get_nanoarrow_ ArrowBitmap out; ArrowBitmapInit(&out); NANOARROW_THROW_NOT_OK(ArrowBitmapResize(&out, b.size(), 1)); - std::memset(out.buffer.data, 0, out.buffer.size_bytes); + // TODO: Investigate clang-tidy issue further after nanoarrow is made compliant + std::memset(out.buffer.data, 0, out.buffer.size_bytes); // NOLINT for (size_t i = 0; i < b.size(); ++i) { ArrowBitSetTo(out.buffer.data, i, static_cast(b[i])); diff --git a/cpp/tests/interop/to_arrow_device_test.cpp b/cpp/tests/interop/to_arrow_device_test.cpp index 51216a8512c..29aa928c277 100644 --- a/cpp/tests/interop/to_arrow_device_test.cpp +++ b/cpp/tests/interop/to_arrow_device_test.cpp @@ -17,21 +17,15 @@ #include "nanoarrow_utils.hpp" #include -#include #include -#include -#include #include #include -#include -#include #include #include #include #include #include -#include #include 
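// Aside: the emplace_back hunks above drop std::move because encode()
// returns a temporary (a prvalue std::unique_ptr), which is already an
// rvalue; wrapping it in std::move is a no-op that clang-tidy flags as a
// redundant move. A minimal standalone sketch (names are hypothetical):
#include <memory>
#include <utility>
#include <vector>

std::unique_ptr<int> make_value() { return std::make_unique<int>(42); }

void fill(std::vector<std::unique_ptr<int>>& out)
{
  out.emplace_back(make_value());      // temporary already binds as an rvalue
  auto named = make_value();
  out.emplace_back(std::move(named));  // std::move is only needed for named lvalues
}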
#include @@ -55,7 +49,7 @@ get_nanoarrow_cudf_table(cudf::size_type length) auto col4 = cudf::test::fixed_width_column_wrapper( test_data.int64_data.begin(), test_data.int64_data.end(), test_data.validity.begin()); auto dict_col = cudf::dictionary::encode(col4); - columns.emplace_back(std::move(cudf::dictionary::encode(col4))); + columns.emplace_back(cudf::dictionary::encode(col4)); columns.emplace_back(cudf::test::fixed_width_column_wrapper(test_data.bool_data.begin(), test_data.bool_data.end(), test_data.bool_validity.begin()) @@ -82,8 +76,8 @@ get_nanoarrow_cudf_table(cudf::size_type length) test_data.string_data.begin(), test_data.string_data.end(), test_data.validity.begin()) .release(); vector_of_columns cols; - cols.push_back(move(int_column)); - cols.push_back(move(str_column)); + cols.push_back(std::move(int_column)); + cols.push_back(std::move(str_column)); auto [null_mask, null_count] = cudf::bools_to_mask(cudf::test::fixed_width_column_wrapper( test_data.bool_data_validity.begin(), test_data.bool_data_validity.end())); columns.emplace_back( @@ -575,9 +569,9 @@ TEST_F(ToArrowDeviceTest, StructColumn) auto int_col2 = cudf::test::fixed_width_column_wrapper{{12, 24, 47}, {1, 0, 1}}.release(); auto bool_col = cudf::test::fixed_width_column_wrapper{{true, true, false}}.release(); - auto list_col = - cudf::test::lists_column_wrapper({{{1, 2}, {3, 4}, {5}}, {{{6}}}, {{7}, {8, 9}}}) - .release(); + auto list_col = cudf::test::lists_column_wrapper( + {{{1, 2}, {3, 4}, {5}}, {{{6}}}, {{7}, {8, 9}}}) // NOLINT + .release(); vector_of_columns cols2; cols2.push_back(std::move(str_col2)); cols2.push_back(std::move(int_col2)); diff --git a/cpp/tests/interop/to_arrow_host_test.cpp b/cpp/tests/interop/to_arrow_host_test.cpp index fc0ed6c9352..fa3aa82fee2 100644 --- a/cpp/tests/interop/to_arrow_host_test.cpp +++ b/cpp/tests/interop/to_arrow_host_test.cpp @@ -17,20 +17,14 @@ #include "nanoarrow_utils.hpp" #include -#include #include -#include -#include #include #include #include #include -#include #include #include -#include -#include #include #include #include @@ -436,9 +430,9 @@ TEST_F(ToArrowHostDeviceTest, StructColumn) auto int_col2 = cudf::test::fixed_width_column_wrapper{{12, 24, 47}, {1, 0, 1}}.release(); auto bool_col = cudf::test::fixed_width_column_wrapper{{true, true, false}}.release(); - auto list_col = - cudf::test::lists_column_wrapper({{{1, 2}, {3, 4}, {5}}, {{{6}}}, {{7}, {8, 9}}}) - .release(); + auto list_col = cudf::test::lists_column_wrapper( + {{{1, 2}, {3, 4}, {5}}, {{{6}}}, {{7}, {8, 9}}}) // NOLINT + .release(); vector_of_columns cols2; cols2.push_back(std::move(str_col2)); cols2.push_back(std::move(int_col2)); diff --git a/cpp/tests/interop/to_arrow_test.cpp b/cpp/tests/interop/to_arrow_test.cpp index 90ae12cdd90..86295d8efb1 100644 --- a/cpp/tests/interop/to_arrow_test.cpp +++ b/cpp/tests/interop/to_arrow_test.cpp @@ -19,14 +19,12 @@ #include #include #include -#include #include #include #include #include #include -#include #include #include #include @@ -90,7 +88,7 @@ std::pair, std::shared_ptr> get_table auto col4 = cudf::test::fixed_width_column_wrapper( int64_data.begin(), int64_data.end(), validity.begin()); auto dict_col = cudf::dictionary::encode(col4); - columns.emplace_back(std::move(cudf::dictionary::encode(col4))); + columns.emplace_back(cudf::dictionary::encode(col4)); columns.emplace_back(cudf::test::fixed_width_column_wrapper( bool_data.begin(), bool_data.end(), bool_validity.begin()) .release()); @@ -112,8 +110,8 @@ std::pair, std::shared_ptr> get_table 
cudf::test::strings_column_wrapper(string_data.begin(), string_data.end(), validity.begin()) .release(); vector_of_columns cols; - cols.push_back(move(int_column)); - cols.push_back(move(str_column)); + cols.push_back(std::move(int_column)); + cols.push_back(std::move(str_column)); auto [null_mask, null_count] = cudf::bools_to_mask(cudf::test::fixed_width_column_wrapper( bool_data_validity.begin(), bool_data_validity.end())); columns.emplace_back( @@ -294,9 +292,9 @@ TEST_F(ToArrowTest, StructColumn) auto int_col2 = cudf::test::fixed_width_column_wrapper{{12, 24, 47}, {1, 0, 1}}.release(); auto bool_col = cudf::test::fixed_width_column_wrapper{{true, true, false}}.release(); - auto list_col = - cudf::test::lists_column_wrapper({{{1, 2}, {3, 4}, {5}}, {{{6}}}, {{7}, {8, 9}}}) - .release(); + auto list_col = cudf::test::lists_column_wrapper( + {{{1, 2}, {3, 4}, {5}}, {{{6}}}, {{7}, {8, 9}}}) // NOLINT + .release(); vector_of_columns cols2; cols2.push_back(std::move(str_col2)); cols2.push_back(std::move(int_col2)); @@ -438,7 +436,7 @@ TEST_F(ToArrowTest, FixedPoint64TableLarge) auto const schema = std::make_shared(schema_vector); auto const expected_arrow_table = arrow::Table::Make(schema, {arr}); - std::vector const metadata = {{"a"}}; + std::vector const metadata = {{"a"}}; // NOLINT ASSERT_TRUE(is_equal(input, metadata, expected_arrow_table)); } } diff --git a/cpp/tests/io/comp/decomp_test.cpp b/cpp/tests/io/comp/decomp_test.cpp index 840cf263ed9..54262dc3b44 100644 --- a/cpp/tests/io/comp/decomp_test.cpp +++ b/cpp/tests/io/comp/decomp_test.cpp @@ -39,19 +39,19 @@ using cudf::device_span; */ template struct DecompressTest : public cudf::test::BaseFixture { - std::vector vector_from_string(char const* str) const + [[nodiscard]] std::vector vector_from_string(std::string const str) const { - return std::vector(reinterpret_cast(str), - reinterpret_cast(str) + strlen(str)); + return {reinterpret_cast(str.c_str()), + reinterpret_cast(str.c_str()) + strlen(str.c_str())}; } - void Decompress(std::vector* decompressed, + void Decompress(std::vector& decompressed, uint8_t const* compressed, size_t compressed_size) { auto stream = cudf::get_default_stream(); rmm::device_buffer src{compressed, compressed_size, stream}; - rmm::device_uvector dst{decompressed->size(), stream}; + rmm::device_uvector dst{decompressed.size(), stream}; cudf::detail::hostdevice_vector> inf_in(1, stream); inf_in[0] = {static_cast(src.data()), src.size()}; @@ -67,7 +67,7 @@ struct DecompressTest : public cudf::test::BaseFixture { static_cast(this)->dispatch(inf_in, inf_out, inf_stat); CUDF_CUDA_TRY(cudaMemcpyAsync( - decompressed->data(), dst.data(), dst.size(), cudaMemcpyDefault, stream.value())); + decompressed.data(), dst.data(), dst.size(), cudaMemcpyDefault, stream.value())); inf_stat.device_to_host_sync(stream); ASSERT_EQ(inf_stat[0].status, cudf::io::compression_status::SUCCESS); } @@ -125,49 +125,57 @@ struct NvcompConfigTest : public cudf::test::BaseFixture {}; TEST_F(GzipDecompressTest, HelloWorld) { - constexpr char uncompressed[] = "hello world"; + std::string const uncompressed{"hello world"}; + // NOLINTBEGIN constexpr uint8_t compressed[] = { 0x1f, 0x8b, 0x8, 0x0, 0x9, 0x63, 0x99, 0x5c, 0x2, 0xff, 0xcb, 0x48, 0xcd, 0xc9, 0xc9, 0x57, 0x28, 0xcf, 0x2f, 0xca, 0x49, 0x1, 0x0, 0x85, 0x11, 0x4a, 0xd, 0xb, 0x0, 0x0, 0x0}; + // NOLINTEND std::vector input = vector_from_string(uncompressed); std::vector output(input.size()); - Decompress(&output, compressed, sizeof(compressed)); + Decompress(output, compressed, 
sizeof(compressed)); EXPECT_EQ(output, input); } TEST_F(SnappyDecompressTest, HelloWorld) { - constexpr char uncompressed[] = "hello world"; + std::string const uncompressed{"hello world"}; + // NOLINTBEGIN constexpr uint8_t compressed[] = { 0xb, 0x28, 0x68, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x77, 0x6f, 0x72, 0x6c, 0x64}; + // NOLINTEND std::vector input = vector_from_string(uncompressed); std::vector output(input.size()); - Decompress(&output, compressed, sizeof(compressed)); + Decompress(output, compressed, sizeof(compressed)); EXPECT_EQ(output, input); } TEST_F(SnappyDecompressTest, ShortLiteralAfterLongCopyAtStartup) { - constexpr char uncompressed[] = "Aaaaaaaaaaaah!"; + std::string const uncompressed{"Aaaaaaaaaaaah!"}; + // NOLINTBEGIN constexpr uint8_t compressed[] = {14, 0x0, 'A', 0x0, 'a', (10 - 4) * 4 + 1, 1, 0x4, 'h', '!'}; + // NOLINTEND std::vector input = vector_from_string(uncompressed); std::vector output(input.size()); - Decompress(&output, compressed, sizeof(compressed)); + Decompress(output, compressed, sizeof(compressed)); EXPECT_EQ(output, input); } TEST_F(BrotliDecompressTest, HelloWorld) { - constexpr char uncompressed[] = "hello world"; + std::string const uncompressed{"hello world"}; + // NOLINTBEGIN constexpr uint8_t compressed[] = { 0xb, 0x5, 0x80, 0x68, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x77, 0x6f, 0x72, 0x6c, 0x64, 0x3}; + // NOLINTEND std::vector input = vector_from_string(uncompressed); std::vector output(input.size()); - Decompress(&output, compressed, sizeof(compressed)); + Decompress(output, compressed, sizeof(compressed)); EXPECT_EQ(output, input); } diff --git a/cpp/tests/io/csv_test.cpp b/cpp/tests/io/csv_test.cpp index dc14824d834..cc1e367d114 100644 --- a/cpp/tests/io/csv_test.cpp +++ b/cpp/tests/io/csv_test.cpp @@ -17,14 +17,12 @@ #include #include #include -#include #include #include #include #include #include -#include #include #include #include @@ -32,18 +30,12 @@ #include #include #include -#include -#include #include -#include #include -#include - #include #include -#include #include #include #include @@ -63,9 +55,9 @@ auto dtype() template using column_wrapper = - typename std::conditional, - cudf::test::strings_column_wrapper, - cudf::test::fixed_width_column_wrapper>::type; + std::conditional_t, + cudf::test::strings_column_wrapper, + cudf::test::fixed_width_column_wrapper>; using column = cudf::column; using table = cudf::table; using table_view = cudf::table_view; @@ -954,7 +946,7 @@ TEST_F(CsvReaderTest, Strings) ASSERT_EQ(type_id::STRING, view.column(1).type().id()); expect_column_data_equal( - std::vector{"abc def ghi", "\"jkl mno pqr\"", "stu \"\"vwx\"\" yz"}, + std::vector{"abc def ghi", "\"jkl mno pqr\"", R"(stu ""vwx"" yz)"}, view.column(1)); } @@ -1014,7 +1006,7 @@ TEST_F(CsvReaderTest, StringsQuotesIgnored) ASSERT_EQ(type_id::STRING, view.column(1).type().id()); expect_column_data_equal( - std::vector{"\"abcdef ghi\"", "\"jkl \"\"mno\"\" pqr\"", "stu \"vwx\" yz"}, + std::vector{"\"abcdef ghi\"", R"("jkl ""mno"" pqr")", "stu \"vwx\" yz"}, view.column(1)); } @@ -1830,7 +1822,7 @@ TEST_F(CsvReaderTest, StringsWithWriter) auto int_column = column_wrapper{10, 20, 30}; auto string_column = - column_wrapper{"abc def ghi", "\"jkl mno pqr\"", "stu \"\"vwx\"\" yz"}; + column_wrapper{"abc def ghi", "\"jkl mno pqr\"", R"(stu ""vwx"" yz)"}; cudf::table_view input_table(std::vector{int_column, string_column}); // TODO add quoting style flag? 
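// Aside: the csv_test hunks above prefer raw string literals for quote-heavy
// fixtures; both spellings denote exactly the same bytes, which this minimal,
// self-contained sketch checks:
#include <cassert>
#include <string>

int main()
{
  std::string const escaped = "stu \"\"vwx\"\" yz";  // escaped form
  std::string const raw     = R"(stu ""vwx"" yz)";   // raw literal, no backslash noise
  assert(escaped == raw);
  return 0;
}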
@@ -2516,4 +2508,39 @@ TEST_F(CsvReaderTest, UTF8BOM) CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result_view, expected); } +void expect_buffers_equal(cudf::io::datasource::buffer* lhs, cudf::io::datasource::buffer* rhs) +{ + ASSERT_EQ(lhs->size(), rhs->size()); + EXPECT_EQ(0, std::memcmp(lhs->data(), rhs->data(), lhs->size())); +} + +TEST_F(CsvReaderTest, OutOfMapBoundsReads) +{ + // write a lot of data into a file + auto filepath = temp_env->get_temp_dir() + "OutOfMapBoundsReads.csv"; + auto const num_rows = 1 << 20; + auto const row = std::string{"0,1,2,3,4,5,6,7,8,9\n"}; + auto const file_size = num_rows * row.size(); + { + std::ofstream outfile(filepath, std::ofstream::out); + for (size_t i = 0; i < num_rows; ++i) { + outfile << row; + } + } + + // Only memory map the middle of the file + auto source = cudf::io::datasource::create(filepath, file_size / 2, file_size / 4); + auto full_source = cudf::io::datasource::create(filepath); + auto const all_data = source->host_read(0, file_size); + auto ref_data = full_source->host_read(0, file_size); + expect_buffers_equal(ref_data.get(), all_data.get()); + + auto const start_data = source->host_read(file_size / 2, file_size / 2); + expect_buffers_equal(full_source->host_read(file_size / 2, file_size / 2).get(), + start_data.get()); + + auto const end_data = source->host_read(0, file_size / 2 + 512); + expect_buffers_equal(full_source->host_read(0, file_size / 2 + 512).get(), end_data.get()); +} + CUDF_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/io/file_io_test.cpp b/cpp/tests/io/file_io_test.cpp index 3c41f21b0a4..1b85541687a 100644 --- a/cpp/tests/io/file_io_test.cpp +++ b/cpp/tests/io/file_io_test.cpp @@ -15,13 +15,10 @@ */ #include -#include #include #include -#include - // Base test fixture for tests struct CuFileIOTest : public cudf::test::BaseFixture {}; diff --git a/cpp/tests/io/json/json_chunked_reader.cu b/cpp/tests/io/json/json_chunked_reader.cu index c9ee6542a4d..9c8208b8300 100644 --- a/cpp/tests/io/json/json_chunked_reader.cu +++ b/cpp/tests/io/json/json_chunked_reader.cu @@ -14,6 +14,7 @@ * limitations under the License. 
*/ +#include "io/comp/comp.hpp" #include "json_utils.cuh" #include @@ -31,36 +32,66 @@ /** * @brief Base test fixture for JSON reader tests */ -struct JsonReaderTest : public cudf::test::BaseFixture {}; +struct JsonReaderTest : public cudf::test::BaseFixture, + public testing::WithParamInterface {}; + +// Parametrize qualifying JSON tests for multiple compression types +INSTANTIATE_TEST_SUITE_P(JsonReaderTest, + JsonReaderTest, + ::testing::Values(cudf::io::compression_type::GZIP, + cudf::io::compression_type::SNAPPY, + cudf::io::compression_type::NONE)); cudf::test::TempDirTestEnvironment* const temp_env = static_cast( ::testing::AddGlobalTestEnvironment(new cudf::test::TempDirTestEnvironment)); -TEST_F(JsonReaderTest, ByteRange_SingleSource) +TEST_P(JsonReaderTest, ByteRange_SingleSource) { + cudf::io::compression_type const comptype = GetParam(); + std::string const json_string = R"( { "a": { "y" : 6}, "b" : [1, 2, 3], "c": 11 } { "a": { "y" : 6}, "b" : [4, 5 ], "c": 12 } { "a": { "y" : 6}, "b" : [6 ], "c": 13 } { "a": { "y" : 6}, "b" : [7 ], "c": 14 })"; + std::vector cdata; + if (comptype != cudf::io::compression_type::NONE) { + cdata = cudf::io::detail::compress( + comptype, + cudf::host_span(reinterpret_cast(json_string.data()), + json_string.size()), + cudf::get_default_stream()); + } else + cdata = std::vector( + reinterpret_cast(json_string.data()), + reinterpret_cast(json_string.data()) + json_string.size()); + // Initialize parsing options (reading json lines) cudf::io::json_reader_options json_lines_options = cudf::io::json_reader_options::builder( cudf::io::source_info{json_string.c_str(), json_string.size()}) .compression(cudf::io::compression_type::NONE) .lines(true); + cudf::io::json_reader_options cjson_lines_options = + cudf::io::json_reader_options::builder( + cudf::io::source_info{cudf::host_span(cdata.data(), cdata.size())}) + .compression(comptype) + .lines(true); // Read full test data via existing, nested JSON lines reader - cudf::io::table_with_metadata current_reader_table = cudf::io::read_json(json_lines_options); + cudf::io::table_with_metadata current_reader_table = cudf::io::read_json(cjson_lines_options); - auto datasources = cudf::io::datasource::create(json_lines_options.get_source().host_buffers()); + auto datasources = cudf::io::datasource::create(json_lines_options.get_source().host_buffers()); + auto cdatasources = cudf::io::datasource::create(cjson_lines_options.get_source().host_buffers()); // Test for different chunk sizes for (auto chunk_size : {7, 10, 15, 20, 40, 50, 100, 200, 500}) { auto const tables = split_byte_range_reading(datasources, + cdatasources, json_lines_options, + cjson_lines_options, chunk_size, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); @@ -77,37 +108,54 @@ TEST_F(JsonReaderTest, ByteRange_SingleSource) } } -TEST_F(JsonReaderTest, ReadCompleteFiles) +TEST_P(JsonReaderTest, ReadCompleteFiles) { + cudf::io::compression_type const comptype = GetParam(); + std::string const json_string = R"( { "a": { "y" : 6}, "b" : [1, 2, 3], "c": 11 } { "a": { "y" : 6}, "b" : [4, 5 ], "c": 12 } { "a": { "y" : 6}, "b" : [6 ], "c": 13 } { "a": { "y" : 6}, "b" : [7 ], "c": 14 })"; - auto filename = temp_env->get_temp_dir() + "ParseInRangeIntegers.json"; + + std::vector cdata; + if (comptype != cudf::io::compression_type::NONE) { + cdata = cudf::io::detail::compress( + comptype, + cudf::host_span(reinterpret_cast(json_string.data()), + json_string.size()), + cudf::get_default_stream()); + } else + cdata = std::vector( + 
reinterpret_cast(json_string.data()), + reinterpret_cast(json_string.data()) + json_string.size()); + + auto cfilename = temp_env->get_temp_dir() + "cParseInRangeIntegers.json"; { - std::ofstream outfile(filename, std::ofstream::out); - outfile << json_string; + std::ofstream outfile(cfilename, std::ofstream::out); + std::copy(cdata.begin(), cdata.end(), std::ostreambuf_iterator(outfile)); } constexpr int num_sources = 5; - std::vector filepaths(num_sources, filename); + std::vector cfilepaths(num_sources, cfilename); - cudf::io::json_reader_options in_options = - cudf::io::json_reader_options::builder(cudf::io::source_info{filepaths}) + cudf::io::json_reader_options cin_options = + cudf::io::json_reader_options::builder(cudf::io::source_info{cfilepaths}) .lines(true) + .compression(comptype) .recovery_mode(cudf::io::json_recovery_mode_t::FAIL); - cudf::io::table_with_metadata result = cudf::io::read_json(in_options); + cudf::io::table_with_metadata result = cudf::io::read_json(cin_options); std::vector part_tables; - for (auto filepath : filepaths) { - cudf::io::json_reader_options part_in_options = - cudf::io::json_reader_options::builder(cudf::io::source_info{filepath}) + for (auto cfilepath : cfilepaths) { + cudf::io::json_reader_options part_cin_options = + cudf::io::json_reader_options::builder(cudf::io::source_info{cfilepath}) .lines(true) + .compression(comptype) .recovery_mode(cudf::io::json_recovery_mode_t::FAIL); - part_tables.push_back(cudf::io::read_json(part_in_options)); + part_tables.push_back(cudf::io::read_json(part_cin_options)); } auto part_table_views = std::vector(part_tables.size()); @@ -120,42 +168,69 @@ TEST_F(JsonReaderTest, ReadCompleteFiles) CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result.tbl->view(), expected_result->view()); } -TEST_F(JsonReaderTest, ByteRange_MultiSource) +TEST_P(JsonReaderTest, ByteRange_MultiSource) { + cudf::io::compression_type const comptype = GetParam(); + std::string const json_string = R"( { "a": { "y" : 6}, "b" : [1, 2, 3], "c": 11 } { "a": { "y" : 6}, "b" : [4, 5 ], "c": 12 } { "a": { "y" : 6}, "b" : [6 ], "c": 13 } { "a": { "y" : 6}, "b" : [7 ], "c": 14 })"; - auto filename = temp_env->get_temp_dir() + "ParseInRangeIntegers.json"; + + std::vector cdata; + if (comptype != cudf::io::compression_type::NONE) { + cdata = cudf::io::detail::compress( + comptype, + cudf::host_span(reinterpret_cast(json_string.data()), + json_string.size()), + cudf::get_default_stream()); + } else + cdata = std::vector( + reinterpret_cast(json_string.data()), + reinterpret_cast(json_string.data()) + json_string.size()); + + auto cfilename = temp_env->get_temp_dir() + "cParseInRangeIntegers.json"; { - std::ofstream outfile(filename, std::ofstream::out); - outfile << json_string; + std::ofstream outfile(cfilename, std::ofstream::out); + std::copy(cdata.begin(), cdata.end(), std::ostreambuf_iterator(outfile)); } constexpr int num_sources = 5; - std::vector filepaths(num_sources, filename); + std::vector cfilepaths(num_sources, cfilename); + std::vector> hostbufs( + num_sources, + cudf::host_span(reinterpret_cast(json_string.data()), + json_string.size())); // Initialize parsing options (reading json lines) cudf::io::json_reader_options json_lines_options = - cudf::io::json_reader_options::builder(cudf::io::source_info{filepaths}) - .lines(true) + cudf::io::json_reader_options::builder( + cudf::io::source_info{ + cudf::host_span>(hostbufs.data(), hostbufs.size())}) .compression(cudf::io::compression_type::NONE) + .lines(true); + cudf::io::json_reader_options 
cjson_lines_options = + cudf::io::json_reader_options::builder(cudf::io::source_info{cfilepaths}) + .lines(true) + .compression(comptype) .recovery_mode(cudf::io::json_recovery_mode_t::FAIL); // Read full test data via existing, nested JSON lines reader - cudf::io::table_with_metadata current_reader_table = cudf::io::read_json(json_lines_options); + cudf::io::table_with_metadata current_reader_table = cudf::io::read_json(cjson_lines_options); - auto file_paths = json_lines_options.get_source().filepaths(); - std::vector> datasources; - for (auto& fp : file_paths) { - datasources.emplace_back(cudf::io::datasource::create(fp)); + std::vector> cdatasources; + for (auto& fp : cfilepaths) { + cdatasources.emplace_back(cudf::io::datasource::create(fp)); } + auto datasources = cudf::io::datasource::create(json_lines_options.get_source().host_buffers()); // Test for different chunk sizes for (auto chunk_size : {7, 10, 15, 20, 40, 50, 100, 200, 500, 1000, 2000}) { auto const tables = split_byte_range_reading(datasources, + cdatasources, json_lines_options, + cjson_lines_options, chunk_size, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); diff --git a/cpp/tests/io/json/json_quote_normalization_test.cpp b/cpp/tests/io/json/json_quote_normalization_test.cpp index d23acf3ae00..0fbd7da7f4d 100644 --- a/cpp/tests/io/json/json_quote_normalization_test.cpp +++ b/cpp/tests/io/json/json_quote_normalization_test.cpp @@ -20,7 +20,6 @@ #include #include -#include #include #include #include @@ -29,14 +28,15 @@ #include #include -#include #include // Base test fixture for tests struct JsonNormalizationTest : public cudf::test::BaseFixture {}; -void run_test(std::string const& host_input, std::string const& expected_host_output) +void run_test(std::string const& host_input, + std::string const& expected_host_output, + char delimiter = '\n') { // RMM memory resource std::shared_ptr rsc = @@ -48,7 +48,7 @@ void run_test(std::string const& host_input, std::string const& expected_host_ou // Preprocessing FST cudf::io::datasource::owning_buffer device_data(std::move(device_input)); - cudf::io::json::detail::normalize_single_quotes(device_data, stream_view, rsc.get()); + cudf::io::json::detail::normalize_single_quotes(device_data, delimiter, stream_view, rsc.get()); std::string preprocessed_host_output(device_data.size(), 0); CUDF_CUDA_TRY(cudaMemcpyAsync(preprocessed_host_output.data(), @@ -174,6 +174,13 @@ TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid_WrongBraces run_test(input, output); } +TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_NonNewlineDelimiter) +{ + std::string input{"{\"a\": \"1\n2\"}z{\'a\': 12}"}; + std::string output{"{\"a\": \"1\n2\"}z{\"a\": 12}"}; + run_test(input, output, 'z'); +} + TEST_F(JsonNormalizationTest, ReadJsonOption) { // RMM memory resource @@ -181,22 +188,24 @@ TEST_F(JsonNormalizationTest, ReadJsonOption) std::make_shared(); // Test input - std::string const host_input = R"({"A":'TEST"'})"; + std::string const host_input = R"({"a": "1\n2"}h{'a': 12})"; cudf::io::json_reader_options input_options = cudf::io::json_reader_options::builder( cudf::io::source_info{host_input.data(), host_input.size()}) .lines(true) + .delimiter('h') .normalize_single_quotes(true); cudf::io::table_with_metadata processed_table = cudf::io::read_json(input_options, cudf::test::get_default_stream(), rsc.get()); // Expected table - std::string const expected_input = R"({"A":"TEST\""})"; + std::string const expected_input = R"({"a": "1\n2"}h{"a": 12})"; 
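// Aside on the delimiter plumbing in this hunk: with lines(true) the reader
// treats '\n' as the record separator by default; delimiter('h') makes 'h'
// terminate each record instead, so a quoted "1\n2" value survives intact.
// The same option must be set on both the input under test and the
// expected-result read, as the surrounding hunk does. A hedged usage sketch
// ('|' is an arbitrary illustrative choice):
//
//   auto opts = cudf::io::json_reader_options::builder(src)
//                 .lines(true)
//                 .delimiter('|')  // '|' now ends each JSON-lines record
//                 .build();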
cudf::io::json_reader_options expected_input_options = cudf::io::json_reader_options::builder( cudf::io::source_info{expected_input.data(), expected_input.size()}) - .lines(true); + .lines(true) + .delimiter('h'); cudf::io::table_with_metadata expected_table = cudf::io::read_json(expected_input_options, cudf::test::get_default_stream(), rsc.get()); diff --git a/cpp/tests/io/json/json_test.cpp b/cpp/tests/io/json/json_test.cpp index a094ac7d772..3c8db99c3c7 100644 --- a/cpp/tests/io/json/json_test.cpp +++ b/cpp/tests/io/json/json_test.cpp @@ -14,6 +14,9 @@ * limitations under the License. */ +#include "io/comp/comp.hpp" +#include "io/comp/io_uncomp.hpp" + #include #include #include @@ -39,8 +42,6 @@ #include -#include - #include #include #include @@ -68,9 +69,9 @@ auto dtype() template using column_wrapper = - typename std::conditional, - cudf::test::strings_column_wrapper, - cudf::test::fixed_width_column_wrapper>::type; + std::conditional_t, + cudf::test::strings_column_wrapper, + cudf::test::fixed_width_column_wrapper>; cudf::test::TempDirTestEnvironment* const temp_env = static_cast( @@ -241,7 +242,7 @@ struct JsonValidFixedPointReaderTest : public JsonFixedPointReaderTest(), scale}}) + .dtypes(std::vector{data_type{type_to_id(), scale}}) .lines(true); auto const result = cudf::io::read_json(in_opts); @@ -326,7 +327,7 @@ TEST_P(JsonReaderParamTest, FloatingPoint) cudf::io::json_reader_options in_options = cudf::io::json_reader_options::builder(cudf::io::source_info{filepath}) - .dtypes({dtype()}) + .dtypes(std::vector{dtype()}) .lines(true); cudf::io::table_with_metadata result = cudf::io::read_json(in_options); @@ -350,7 +351,8 @@ TEST_P(JsonReaderParamTest, JsonLinesStrings) cudf::io::json_reader_options in_options = cudf::io::json_reader_options::builder(cudf::io::source_info{data.data(), data.size()}) - .dtypes({{"2", dtype()}, {"0", dtype()}, {"1", dtype()}}) + .dtypes(std::map{ + {"2", dtype()}, {"0", dtype()}, {"1", dtype()}}) .lines(true); cudf::io::table_with_metadata result = cudf::io::read_json(in_options); @@ -468,7 +470,7 @@ TEST_P(JsonReaderParamTest, Booleans) cudf::io::json_reader_options in_options = cudf::io::json_reader_options::builder(cudf::io::source_info{filepath}) - .dtypes({dtype()}) + .dtypes(std::vector{dtype()}) .lines(true); cudf::io::table_with_metadata result = cudf::io::read_json(in_options); @@ -510,7 +512,7 @@ TEST_P(JsonReaderParamTest, Dates) cudf::io::json_reader_options in_options = cudf::io::json_reader_options::builder(cudf::io::source_info{filepath}) - .dtypes({data_type{type_id::TIMESTAMP_MILLISECONDS}}) + .dtypes(std::vector{data_type{type_id::TIMESTAMP_MILLISECONDS}}) .lines(true) .dayfirst(true); cudf::io::table_with_metadata result = cudf::io::read_json(in_options); @@ -566,7 +568,7 @@ TEST_P(JsonReaderParamTest, Durations) cudf::io::json_reader_options in_options = cudf::io::json_reader_options::builder(cudf::io::source_info{filepath}) - .dtypes({data_type{type_id::DURATION_NANOSECONDS}}) + .dtypes(std::vector{data_type{type_id::DURATION_NANOSECONDS}}) .lines(true); cudf::io::table_with_metadata result = cudf::io::read_json(in_options); @@ -858,8 +860,7 @@ TEST_P(JsonReaderRecordTest, JsonLinesObjects) TEST_P(JsonReaderRecordTest, JsonLinesObjectsStrings) { - auto const test_opt = GetParam(); - auto test_json_objects = [test_opt](std::string const& data) { + auto test_json_objects = [](std::string const& data) { cudf::io::json_reader_options in_options = cudf::io::json_reader_options::builder(cudf::io::source_info{data.data(), data.size()}) 
.lines(true); @@ -1025,7 +1026,7 @@ TEST_P(JsonReaderParamTest, InvalidFloatingPoint) cudf::io::json_reader_options in_options = cudf::io::json_reader_options::builder(cudf::io::source_info{filepath}) - .dtypes({dtype()}) + .dtypes(std::vector{dtype()}) .lines(true); cudf::io::table_with_metadata result = cudf::io::read_json(in_options); @@ -1464,7 +1465,7 @@ TEST_F(JsonReaderTest, ErrorStrings) cudf::io::json_reader_options const in_opts = cudf::io::json_reader_options::builder(cudf::io::source_info{buffer.c_str(), buffer.size()}) - .dtypes({data_type{cudf::type_id::STRING}}) + .dtypes(std::vector{data_type{cudf::type_id::STRING}}) .lines(true); auto const result = cudf::io::read_json(in_opts); @@ -1852,7 +1853,7 @@ TYPED_TEST(JsonFixedPointReaderTest, EmptyValues) cudf::io::json_reader_options const in_opts = cudf::io::json_reader_options::builder(cudf::io::source_info{buffer.c_str(), buffer.size()}) - .dtypes({data_type{type_to_id(), 0}}) + .dtypes(std::vector{data_type{type_to_id(), 0}}) .lines(true); auto const result = cudf::io::read_json(in_opts); @@ -2830,7 +2831,7 @@ TEST_F(JsonReaderTest, JSONMixedTypeChildren) EXPECT_EQ(result.metadata.schema_info[0].name, "Root"); ASSERT_EQ(result.metadata.schema_info[0].children.size(), 1); EXPECT_EQ(result.metadata.schema_info[0].children[0].name, "Key"); - ASSERT_EQ(result.metadata.schema_info[0].children[0].children.size(), 2); + ASSERT_EQ(result.metadata.schema_info[0].children[0].children.size(), 1); EXPECT_EQ(result.metadata.schema_info[0].children[0].children[0].name, "offsets"); // types EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::STRUCT); @@ -2868,7 +2869,7 @@ TEST_F(JsonReaderTest, JSONMixedTypeChildren) EXPECT_EQ(result.metadata.schema_info[0].name, "Root"); ASSERT_EQ(result.metadata.schema_info[0].children.size(), 1); EXPECT_EQ(result.metadata.schema_info[0].children[0].name, "Key"); - ASSERT_EQ(result.metadata.schema_info[0].children[0].children.size(), 2); + ASSERT_EQ(result.metadata.schema_info[0].children[0].children.size(), 1); EXPECT_EQ(result.metadata.schema_info[0].children[0].children[0].name, "offsets"); // types EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::STRUCT); @@ -2887,9 +2888,9 @@ TEST_F(JsonReaderTest, MixedTypesWithSchema) std::map data_types; std::map child_types; child_types.insert( - std::pair{"element", cudf::io::schema_element{cudf::data_type{cudf::type_id::STRING, 0}, {}}}); - data_types.insert(std::pair{ - "data", cudf::io::schema_element{cudf::data_type{cudf::type_id::LIST, 0}, child_types}}); + std::pair{"element", cudf::io::schema_element{cudf::data_type{cudf::type_id::STRING}, {}}}); + data_types.insert( + std::pair{"data", cudf::io::schema_element{cudf::data_type{cudf::type_id::LIST}, child_types}}); cudf::io::json_reader_options in_options = cudf::io::json_reader_options::builder(cudf::io::source_info{data.data(), data.size()}) @@ -2976,4 +2977,337 @@ TEST_F(JsonReaderTest, JsonDtypeSchema) cudf::test::debug_output_level::ALL_ERRORS); } +TEST_F(JsonReaderTest, LastRecordInvalid) +{ + std::string data = R"({"key": "1"} + {"key": "})"; + std::map schema{{"key", {dtype()}}}; + auto opts = + cudf::io::json_reader_options::builder(cudf::io::source_info{data.data(), data.size()}) + .dtypes(schema) + .lines(true) + .recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL) + .build(); + auto const result = cudf::io::read_json(opts); + + EXPECT_EQ(result.metadata.schema_info[0].name, "key"); + cudf::test::strings_column_wrapper expected{{"1", ""}, 
cudf::test::iterators::nulls_at({1})}; + CUDF_TEST_EXPECT_TABLES_EQUAL(result.tbl->view(), cudf::table_view{{expected}}); +} + +// Test case for dtype pruning with column order +TEST_F(JsonReaderTest, JsonNestedDtypeFilterWithOrder) +{ + std::string json_stringl = R"( + {"a": 1, "b": {"0": "abc", "1": [-1.]}, "c": true} + {"a": 1, "b": {"0": "abc" }, "c": false} + {"a": 1, "b": {}} + {"a": 1, "c": null} + )"; + std::string json_string = R"([ + {"a": 1, "b": {"0": "abc", "1": [-1.]}, "c": true}, + {"a": 1, "b": {"0": "abc" }, "c": false}, + {"a": 1, "b": {}}, + {"a": 1, "c": null} + ])"; + for (auto& [json_string, lines] : {std::pair{json_stringl, true}, {json_string, false}}) { + cudf::io::json_reader_options in_options = + cudf::io::json_reader_options::builder( + cudf::io::source_info{json_string.data(), json_string.size()}) + .prune_columns(true) + .lines(lines); + + // include all columns + //// schema with partial ordering + { + cudf::io::schema_element dtype_schema{ + data_type{cudf::type_id::STRUCT}, + { + {"b", + {data_type{cudf::type_id::STRUCT}, + {{"0", {data_type{cudf::type_id::STRING}}}, + {"1", {data_type{cudf::type_id::LIST}, {{"element", {dtype()}}}}}}, + {{"0", "1"}}}}, + {"a", {dtype()}}, + {"c", {dtype()}}, + }, + {{"b", "a", "c"}}}; + in_options.set_dtypes(dtype_schema); + cudf::io::table_with_metadata result = cudf::io::read_json(in_options); + // Make sure we have columns "a", "b" and "c" + ASSERT_EQ(result.tbl->num_columns(), 3); + ASSERT_EQ(result.metadata.schema_info.size(), 3); + EXPECT_EQ(result.metadata.schema_info[0].name, "b"); + EXPECT_EQ(result.metadata.schema_info[1].name, "a"); + EXPECT_EQ(result.metadata.schema_info[2].name, "c"); + // "b" children checks + ASSERT_EQ(result.metadata.schema_info[0].children.size(), 2); + EXPECT_EQ(result.metadata.schema_info[0].children[0].name, "0"); + EXPECT_EQ(result.metadata.schema_info[0].children[1].name, "1"); + ASSERT_EQ(result.metadata.schema_info[0].children[1].children.size(), 2); + EXPECT_EQ(result.metadata.schema_info[0].children[1].children[0].name, "offsets"); + EXPECT_EQ(result.metadata.schema_info[0].children[1].children[1].name, "element"); + // types + EXPECT_EQ(result.tbl->get_column(1).type().id(), cudf::type_id::INT32); + EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::STRUCT); + EXPECT_EQ(result.tbl->get_column(2).type().id(), cudf::type_id::BOOL8); + EXPECT_EQ(result.tbl->get_column(0).child(0).type().id(), cudf::type_id::STRING); + EXPECT_EQ(result.tbl->get_column(0).child(1).type().id(), cudf::type_id::LIST); + EXPECT_EQ(result.tbl->get_column(0).child(1).child(0).type().id(), cudf::type_id::INT32); + EXPECT_EQ(result.tbl->get_column(0).child(1).child(1).type().id(), cudf::type_id::FLOAT32); + } + //// schema with pruned columns and different order. + { + cudf::io::schema_element dtype_schema{data_type{cudf::type_id::STRUCT}, + { + {"c", {dtype()}}, + {"b", + { + data_type{cudf::type_id::STRUCT}, + }}, + {"a", {dtype()}}, + }, + {{"c", "b", "a"}}}; + in_options.set_dtypes(dtype_schema); + cudf::io::table_with_metadata result = cudf::io::read_json(in_options); + // "c", "b" and "a" order + ASSERT_EQ(result.tbl->num_columns(), 3); + ASSERT_EQ(result.metadata.schema_info.size(), 3); + EXPECT_EQ(result.metadata.schema_info[0].name, "c"); + EXPECT_EQ(result.metadata.schema_info[1].name, "b"); + EXPECT_EQ(result.metadata.schema_info[2].name, "a"); + // pruned + EXPECT_EQ(result.metadata.schema_info[1].children.size(), 0); + } + //// schema with pruned columns and different sub-order. 
+ { + cudf::io::schema_element dtype_schema{ + data_type{cudf::type_id::STRUCT}, + { + {"c", {dtype()}}, + {"b", + {data_type{cudf::type_id::STRUCT}, + // {}, + {{"0", {data_type{cudf::type_id::STRING}}}, + {"1", {data_type{cudf::type_id::LIST}, {{"element", {dtype()}}}}}}, + {{"1", "0"}}}}, + {"a", {dtype()}}, + }}; + in_options.set_dtypes(dtype_schema); + cudf::io::table_with_metadata result = cudf::io::read_json(in_options); + // Order of occurance in json + ASSERT_EQ(result.tbl->num_columns(), 3); + ASSERT_EQ(result.metadata.schema_info.size(), 3); + EXPECT_EQ(result.metadata.schema_info[0].name, "a"); + EXPECT_EQ(result.metadata.schema_info[1].name, "b"); + EXPECT_EQ(result.metadata.schema_info[2].name, "c"); + // Sub-order of "b" + EXPECT_EQ(result.metadata.schema_info[1].children.size(), 2); + EXPECT_EQ(result.metadata.schema_info[1].children[0].name, "1"); + EXPECT_EQ(result.metadata.schema_info[1].children[1].name, "0"); + } + //// schema with 1 dtype, but 2 column order + { + cudf::io::schema_element dtype_schema{data_type{cudf::type_id::STRUCT}, + { + {"a", {dtype()}}, + }, + {{"a", "b"}}}; + EXPECT_THROW(in_options.set_dtypes(dtype_schema), std::invalid_argument); + // Input schema column order size mismatch with input schema child types + } + //// repetition, Error + { + cudf::io::schema_element dtype_schema{data_type{cudf::type_id::STRUCT}, + { + {"a", {dtype()}}, + }, + {{"a", "a"}}}; + EXPECT_THROW(in_options.set_dtypes(dtype_schema), std::invalid_argument); + // Input schema column order size mismatch with input schema child types + } + //// different column name in order, Error + { + cudf::io::schema_element dtype_schema{data_type{cudf::type_id::STRUCT}, + { + {"a", {dtype()}}, + }, + {{"b"}}}; + EXPECT_THROW(in_options.set_dtypes(dtype_schema), std::invalid_argument); + // Column name not found in input schema map, but present in column order and + // prune_columns is enabled + } + // include only one column (nested) + { + cudf::io::schema_element dtype_schema{ + data_type{cudf::type_id::STRUCT}, + { + {"b", + {data_type{cudf::type_id::STRUCT}, + {{"1", {data_type{cudf::type_id::LIST}, {{"element", {dtype()}}}}}}, + {{"1"}}}}, + }}; + in_options.set_dtypes(dtype_schema); + cudf::io::table_with_metadata result = cudf::io::read_json(in_options); + // Make sure we have column "b":"1":[float] + ASSERT_EQ(result.tbl->num_columns(), 1); + ASSERT_EQ(result.metadata.schema_info.size(), 1); + EXPECT_EQ(result.metadata.schema_info[0].name, "b"); + ASSERT_EQ(result.metadata.schema_info[0].children.size(), 1); + EXPECT_EQ(result.metadata.schema_info[0].children[0].name, "1"); + ASSERT_EQ(result.metadata.schema_info[0].children[0].children.size(), 2); + EXPECT_EQ(result.metadata.schema_info[0].children[0].children[0].name, "offsets"); + EXPECT_EQ(result.metadata.schema_info[0].children[0].children[1].name, "element"); + EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::STRUCT); + EXPECT_EQ(result.tbl->get_column(0).child(0).type().id(), cudf::type_id::LIST); + EXPECT_EQ(result.tbl->get_column(0).child(0).child(0).type().id(), cudf::type_id::INT32); + EXPECT_EQ(result.tbl->get_column(0).child(0).child(1).type().id(), cudf::type_id::FLOAT32); + } + // multiple - all present + { + cudf::io::schema_element dtype_schema{data_type{cudf::type_id::STRUCT}, + { + {"a", {dtype()}}, + {"c", {dtype()}}, + }, + {{"a", "c"}}}; + in_options.set_dtypes(dtype_schema); + cudf::io::table_with_metadata result = cudf::io::read_json(in_options); + // Make sure we have columns "a", and "c" + 
ASSERT_EQ(result.tbl->num_columns(), 2); + ASSERT_EQ(result.metadata.schema_info.size(), 2); + EXPECT_EQ(result.metadata.schema_info[0].name, "a"); + EXPECT_EQ(result.metadata.schema_info[1].name, "c"); + } + // multiple - not all present + { + cudf::io::schema_element dtype_schema{data_type{cudf::type_id::STRUCT}, + { + {"a", {dtype()}}, + {"d", {dtype()}}, + }, + {{"a", "d"}}}; + in_options.set_dtypes(dtype_schema); + cudf::io::table_with_metadata result = cudf::io::read_json(in_options); + // Make sure we have column "a" + ASSERT_EQ(result.tbl->num_columns(), 2); + ASSERT_EQ(result.metadata.schema_info.size(), 2); + EXPECT_EQ(result.metadata.schema_info[0].name, "a"); + EXPECT_EQ(result.metadata.schema_info[1].name, "d"); + auto all_null_bools = + cudf::test::fixed_width_column_wrapper{{true, true, true, true}, {0, 0, 0, 0}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(1), all_null_bools); + } + // test struct, list of string, list of struct. + // multiple - not all present nested + { + cudf::io::schema_element dtype_schema{ + data_type{cudf::type_id::STRUCT}, + { + {"b", + {data_type{cudf::type_id::STRUCT}, + { + {"2", {data_type{cudf::type_id::STRING}}}, + }, + {{"2"}}}}, + {"d", {data_type{cudf::type_id::LIST}, {{"element", {dtype()}}}}}, + {"e", + {data_type{cudf::type_id::LIST}, + {{"element", + { + data_type{cudf::type_id::STRUCT}, + { + {"3", {data_type{cudf::type_id::STRING}}}, + }, //{{"3"}} missing column_order, but output should not have it. + }}}}}, + }, + {{"b", "d", "e"}}}; + in_options.set_dtypes(dtype_schema); + cudf::io::table_with_metadata result = cudf::io::read_json(in_options); + // Make sure we have columns "b" (empty struct) and "c" + ASSERT_EQ(result.tbl->num_columns(), 3); + ASSERT_EQ(result.metadata.schema_info.size(), 3); + EXPECT_EQ(result.metadata.schema_info[0].name, "b"); + ASSERT_EQ(result.metadata.schema_info[0].children.size(), 1); + ASSERT_EQ(result.metadata.schema_info[0].children[0].name, "2"); + EXPECT_EQ(result.metadata.schema_info[1].name, "d"); + auto all_null_strings = cudf::test::strings_column_wrapper{{"", "", "", ""}, {0, 0, 0, 0}}; + EXPECT_EQ(result.tbl->get_column(0).num_children(), 1); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0).child(0), all_null_strings); + auto const all_null_list = cudf::test::lists_column_wrapper{ + {{0, 0}, {1, 1}, {2, 2}, {3, 3}}, cudf::test::iterators::all_nulls()}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(1), all_null_list); + EXPECT_EQ(result.metadata.schema_info[2].name, "e"); + ASSERT_EQ(result.metadata.schema_info[2].children.size(), 2); + ASSERT_EQ(result.metadata.schema_info[2].children[1].children.size(), 0); + // ASSERT_EQ(result.metadata.schema_info[2].children[1].children[0].name, "3"); + auto empty_string_col = cudf::test::strings_column_wrapper{}; + cudf::test::structs_column_wrapper expected_structs{{}, cudf::test::iterators::all_nulls()}; + // make all null column of list of struct of string + auto wrapped = make_lists_column( + 4, + cudf::test::fixed_width_column_wrapper{0, 0, 0, 0, 0}.release(), + expected_structs.release(), + 4, + cudf::create_null_mask(4, cudf::mask_state::ALL_NULL)); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(2), *wrapped); + } + } +} + +struct JsonCompressedIOTest : public cudf::test::BaseFixture, + public testing::WithParamInterface {}; + +// Parametrize qualifying JSON tests for multiple compression types +INSTANTIATE_TEST_SUITE_P(JsonCompressedIOTest, + JsonCompressedIOTest, + 
::testing::Values(cudf::io::compression_type::GZIP, + cudf::io::compression_type::SNAPPY, + cudf::io::compression_type::NONE)); + +TEST_P(JsonCompressedIOTest, BasicJsonLines) +{ + cudf::io::compression_type const comptype = GetParam(); + std::string data = to_records_orient( + {{{"0", "1"}, {"1", "1.1"}}, {{"0", "2"}, {"1", "2.2"}}, {{"0", "3"}, {"1", "3.3"}}}, "\n"); + + std::vector cdata; + if (comptype != cudf::io::compression_type::NONE) { + cdata = cudf::io::detail::compress( + comptype, + cudf::host_span(reinterpret_cast(data.data()), data.size()), + cudf::get_default_stream()); + auto decomp_out_buffer = cudf::io::detail::decompress( + comptype, cudf::host_span(cdata.data(), cdata.size())); + std::string const expected = R"({"0":1, "1":1.1} +{"0":2, "1":2.2} +{"0":3, "1":3.3})"; + EXPECT_EQ( + expected, + std::string(reinterpret_cast(decomp_out_buffer.data()), decomp_out_buffer.size())); + } else + cdata = std::vector(reinterpret_cast(data.data()), + reinterpret_cast(data.data()) + data.size()); + + cudf::io::json_reader_options in_options = + cudf::io::json_reader_options::builder( + cudf::io::source_info{cudf::host_span(cdata.data(), cdata.size())}) + .dtypes(std::vector{dtype(), dtype()}) + .compression(comptype) + .lines(true); + cudf::io::table_with_metadata result = cudf::io::read_json(in_options); + + EXPECT_EQ(result.tbl->num_columns(), 2); + EXPECT_EQ(result.tbl->num_rows(), 3); + + EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::INT32); + EXPECT_EQ(result.tbl->get_column(1).type().id(), cudf::type_id::FLOAT64); + + EXPECT_EQ(result.metadata.schema_info[0].name, "0"); + EXPECT_EQ(result.metadata.schema_info[1].name, "1"); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0), int_wrapper{{1, 2, 3}}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(1), float64_wrapper{{1.1, 2.2, 3.3}}); +} + CUDF_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/io/json/json_tree.cpp b/cpp/tests/io/json/json_tree.cpp index 15682c6ae6b..887d4fa783f 100644 --- a/cpp/tests/io/json/json_tree.cpp +++ b/cpp/tests/io/json/json_tree.cpp @@ -15,12 +15,8 @@ */ #include "io/json/nested_json.hpp" -#include "io/utilities/hostdevice_vector.hpp" #include -#include -#include -#include #include #include @@ -29,9 +25,9 @@ #include #include -#include #include +#include #include #include #include diff --git a/cpp/tests/io/json/json_utils.cuh b/cpp/tests/io/json/json_utils.cuh index 9383797d91b..629a89fa777 100644 --- a/cpp/tests/io/json/json_utils.cuh +++ b/cpp/tests/io/json/json_utils.cuh @@ -32,7 +32,9 @@ template std::vector split_byte_range_reading( cudf::host_span> sources, + cudf::host_span> csources, cudf::io::json_reader_options const& reader_opts, + cudf::io::json_reader_options const& creader_opts, IndexType chunk_size, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) @@ -49,9 +51,9 @@ std::vector split_byte_range_reading( rmm::device_uvector buffer(total_source_size, stream); auto readbufspan = cudf::io::json::detail::ingest_raw_input(buffer, sources, - reader_opts.get_compression(), reader_opts.get_byte_range_offset(), reader_opts.get_byte_range_size(), + reader_opts.get_delimiter(), stream); // Note: we cannot reuse cudf::io::json::detail::find_first_delimiter since the // return type of that function is size_type. 
However, when the chunk_size is @@ -94,10 +96,11 @@ std::vector split_byte_range_reading( record_ranges.emplace_back(prev, total_source_size); std::vector tables; + auto creader_opts_chunk = creader_opts; for (auto const& [chunk_start, chunk_end] : record_ranges) { - reader_opts_chunk.set_byte_range_offset(chunk_start); - reader_opts_chunk.set_byte_range_size(chunk_end - chunk_start); - tables.push_back(cudf::io::json::detail::read_json(sources, reader_opts_chunk, stream, mr)); + creader_opts_chunk.set_byte_range_offset(chunk_start); + creader_opts_chunk.set_byte_range_size(chunk_end - chunk_start); + tables.push_back(cudf::io::json::detail::read_json(csources, creader_opts_chunk, stream, mr)); } // assume all records have same number of columns, and inferred same type. (or schema is passed) // TODO a step before to merge all columns, types and infer final schema. diff --git a/cpp/tests/io/json/json_writer.cpp b/cpp/tests/io/json/json_writer.cpp index 2c4e29a01b9..b96fc6425e4 100644 --- a/cpp/tests/io/json/json_writer.cpp +++ b/cpp/tests/io/json/json_writer.cpp @@ -14,10 +14,14 @@ * limitations under the License. */ +#include "io/comp/io_uncomp.hpp" + #include #include +#include #include #include +#include #include #include @@ -31,7 +35,36 @@ struct JsonWriterTest : public cudf::test::BaseFixture {}; -TEST_F(JsonWriterTest, EmptyInput) +/** + * @brief Test fixture for parametrized JSON reader tests + */ +struct JsonCompressedWriterTest : public cudf::test::BaseFixture, + public testing::WithParamInterface {}; + +// Parametrize qualifying JSON tests for multiple compression types +INSTANTIATE_TEST_SUITE_P(JsonCompressedWriterTest, + JsonCompressedWriterTest, + ::testing::Values(cudf::io::compression_type::GZIP, + cudf::io::compression_type::SNAPPY, + cudf::io::compression_type::NONE)); + +void run_test(cudf::io::json_writer_options const& wopts, std::string const& expected) +{ + auto outbuf = wopts.get_sink().buffers().front(); + auto comptype = wopts.get_compression(); + cudf::io::write_json(wopts, cudf::test::get_default_stream()); + if (comptype != cudf::io::compression_type::NONE) { + auto decomp_out_buffer = cudf::io::detail::decompress( + comptype, + cudf::host_span(reinterpret_cast(outbuf->data()), outbuf->size())); + EXPECT_EQ(expected, + std::string_view(reinterpret_cast(decomp_out_buffer.data()), + decomp_out_buffer.size())); + } else + EXPECT_EQ(expected, std::string_view(outbuf->data(), outbuf->size())); +} + +TEST_P(JsonCompressedWriterTest, EmptyInput) { cudf::test::strings_column_wrapper col1; cudf::test::strings_column_wrapper col2; @@ -49,25 +82,50 @@ TEST_F(JsonWriterTest, EmptyInput) .lines(false) .na_rep("null") .build(); - - // Empty columns in table - cudf::io::write_json(out_options, cudf::test::get_default_stream()); - std::string const expected = R"([])"; - EXPECT_EQ(expected, std::string(out_buffer.data(), out_buffer.size())); + run_test(out_options, "[]"); // Empty columns in table - JSON Lines out_buffer.clear(); out_options.enable_lines(true); - cudf::io::write_json(out_options, cudf::test::get_default_stream()); - std::string const expected_lines = "\n"; - EXPECT_EQ(expected_lines, std::string(out_buffer.data(), out_buffer.size())); + run_test(out_options, "\n"); // Empty table - JSON Lines cudf::table_view tbl_view2{}; out_options.set_table(tbl_view2); out_buffer.clear(); - cudf::io::write_json(out_options, cudf::test::get_default_stream()); - EXPECT_EQ(expected_lines, std::string(out_buffer.data(), out_buffer.size())); + run_test(out_options, "\n"); +} + 
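// Aside: the writer tests below follow the same value-parameterized pattern
// the JSON reader tests adopt above: one fixture instantiated once per
// compression_type. A standalone GoogleTest sketch of the mechanism
// (Codec and CodecTest are hypothetical stand-ins):
#include <gtest/gtest.h>

enum class Codec { NONE, GZIP, SNAPPY };

struct CodecTest : public ::testing::TestWithParam<Codec> {};

// Each TEST_P body runs once per value listed here.
INSTANTIATE_TEST_SUITE_P(Codecs,
                         CodecTest,
                         ::testing::Values(Codec::NONE, Codec::GZIP, Codec::SNAPPY));

TEST_P(CodecTest, RoundTrip)
{
  Codec const c = GetParam();
  EXPECT_TRUE(c == Codec::NONE || c == Codec::GZIP || c == Codec::SNAPPY);
}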
+TEST_P(JsonCompressedWriterTest, EmptyLeaf) +{ + cudf::test::strings_column_wrapper col1{""}; + cudf::test::fixed_width_column_wrapper offsets{0, 0}; + auto col2 = make_lists_column(1, + offsets.release(), + cudf::test::strings_column_wrapper{}.release(), + 0, + rmm::device_buffer{}, + cudf::test::get_default_stream()); + auto col3 = cudf::test::lists_column_wrapper::make_one_empty_row_column(); + cudf::table_view tbl_view{{col1, *col2, col3}}; + cudf::io::table_metadata mt{{{"col1"}, {"col2"}, {"col3"}}}; + + std::vector out_buffer; + auto destination = cudf::io::sink_info(&out_buffer); + auto out_options = cudf::io::json_writer_options_builder(destination, tbl_view) + .include_nulls(true) + .metadata(mt) + .lines(false) + .na_rep("null") + .build(); + run_test(out_options, R"([{"col1":"","col2":[],"col3":[]}])"); + + // Empty columns in table - JSON Lines + out_buffer.clear(); + out_options.enable_lines(true); + std::string const expected_lines = R"({"col1":"","col2":[],"col3":[]})" + "\n"; + run_test(out_options, expected_lines); } TEST_F(JsonWriterTest, ErrorCases) @@ -104,33 +162,34 @@ TEST_F(JsonWriterTest, ErrorCases) cudf::logic_error); } -TEST_F(JsonWriterTest, PlainTable) +TEST_P(JsonCompressedWriterTest, PlainTable) { + cudf::io::compression_type const comptype = GetParam(); cudf::test::strings_column_wrapper col1{"a", "b", "c"}; cudf::test::strings_column_wrapper col2{"d", "e", "f"}; - cudf::test::fixed_width_column_wrapper col3{1, 2, 3}; - cudf::test::fixed_width_column_wrapper col4{1.5, 2.5, 3.5}; - cudf::test::fixed_width_column_wrapper col5{{1, 2, 3}, + cudf::test::fixed_width_column_wrapper col3{1, 2, 3}; + cudf::test::fixed_width_column_wrapper col4{1.5, 2.5, 3.5}; + cudf::test::fixed_width_column_wrapper col5{{1, 2, 3}, cudf::test::iterators::nulls_at({0, 2})}; cudf::table_view tbl_view{{col1, col2, col3, col4, col5}}; - cudf::io::table_metadata mt{{{"col1"}, {"col2"}, {"int"}, {"float"}, {"int16"}}}; + cudf::io::table_metadata mt{{{"col1"}, {"col2"}, {"col3"}, {"col4"}, {"col5"}}}; std::vector out_buffer; - auto destination = cudf::io::sink_info(&out_buffer); - auto options_builder = cudf::io::json_writer_options_builder(destination, tbl_view) - .include_nulls(true) - .metadata(mt) - .lines(false) - .na_rep("null"); - - cudf::io::write_json(options_builder.build(), cudf::test::get_default_stream()); + auto destination = cudf::io::sink_info(&out_buffer); + auto out_options = cudf::io::json_writer_options_builder(destination, tbl_view) + .include_nulls(true) + .metadata(mt) + .lines(false) + .compression(comptype) + .na_rep("null") + .build(); std::string const expected = - R"([{"col1":"a","col2":"d","int":1,"float":1.5,"int16":null},{"col1":"b","col2":"e","int":2,"float":2.5,"int16":2},{"col1":"c","col2":"f","int":3,"float":3.5,"int16":null}])"; - EXPECT_EQ(expected, std::string(out_buffer.data(), out_buffer.size())); + R"([{"col1":"a","col2":"d","col3":1,"col4":1.5,"col5":null},{"col1":"b","col2":"e","col3":2,"col4":2.5,"col5":2},{"col1":"c","col2":"f","col3":3,"col4":3.5,"col5":null}])"; + run_test(out_options, expected); } -TEST_F(JsonWriterTest, SimpleNested) +TEST_P(JsonCompressedWriterTest, SimpleNested) { std::string const data = R"( {"a": 1, "b": 2, "c": {"d": 3 }, "f": 5.5, "g": [1]} @@ -146,23 +205,23 @@ TEST_F(JsonWriterTest, SimpleNested) cudf::io::table_metadata mt{result.metadata}; std::vector out_buffer; - auto destination = cudf::io::sink_info(&out_buffer); - auto options_builder = cudf::io::json_writer_options_builder(destination, tbl_view) - 
.include_nulls(false) - .metadata(mt) - .lines(true) - .na_rep("null"); - - cudf::io::write_json(options_builder.build(), cudf::test::get_default_stream()); + auto destination = cudf::io::sink_info(&out_buffer); + auto out_options = cudf::io::json_writer_options_builder(destination, tbl_view) + .include_nulls(false) + .metadata(mt) + .lines(true) + .na_rep("null") + .build(); + std::string const expected = R"({"a":1,"b":2,"c":{"d":3},"f":5.5,"g":[1]} {"a":6,"b":7,"c":{"d":8},"f":10.5} {"a":1,"b":2,"c":{"e":4},"f":5.5,"g":[2,null]} {"a":6,"b":7,"c":{"e":9},"f":10.5,"g":[3,4,5]} )"; - EXPECT_EQ(expected, std::string(out_buffer.data(), out_buffer.size())); + run_test(out_options, expected); } -TEST_F(JsonWriterTest, MixedNested) +TEST_P(JsonCompressedWriterTest, MixedNested) { std::string const data = R"( {"a": 1, "b": 2, "c": {"d": [3] }, "f": 5.5, "g": [ {"h": 1}]} @@ -178,20 +237,20 @@ TEST_F(JsonWriterTest, MixedNested) cudf::io::table_metadata mt{result.metadata}; std::vector out_buffer; - auto destination = cudf::io::sink_info(&out_buffer); - auto options_builder = cudf::io::json_writer_options_builder(destination, tbl_view) - .include_nulls(false) - .metadata(mt) - .lines(false) - .na_rep("null"); - - cudf::io::write_json(options_builder.build(), cudf::test::get_default_stream()); + auto destination = cudf::io::sink_info(&out_buffer); + auto out_options = cudf::io::json_writer_options_builder(destination, tbl_view) + .include_nulls(false) + .metadata(mt) + .lines(false) + .na_rep("null") + .build(); + std::string const expected = R"([{"a":1,"b":2,"c":{"d":[3]},"f":5.5,"g":[{"h":1}]},)" R"({"a":6,"b":7,"c":{"d":[8]},"f":10.5},)" R"({"a":1,"b":2,"c":{"e":4},"f":5.5,"g":[{"h":2},null]},)" R"({"a":6,"b":7,"c":{"e":9},"f":10.5,"g":[{"h":3},{"h":4},{"h":5}]}])"; - EXPECT_EQ(expected, std::string(out_buffer.data(), out_buffer.size())); + run_test(out_options, expected); } TEST_F(JsonWriterTest, WriteReadNested) @@ -338,7 +397,7 @@ TEST_F(JsonWriterTest, WriteReadNested) } } -TEST_F(JsonWriterTest, SpecialChars) +TEST_P(JsonCompressedWriterTest, SpecialChars) { cudf::test::fixed_width_column_wrapper a{1, 6, 1, 6}; cudf::test::strings_column_wrapper b{"abcd", "b\b\f\n\r\t", "\"c\"", "/\\"}; @@ -354,17 +413,15 @@ TEST_F(JsonWriterTest, SpecialChars) .na_rep("null") .build(); - cudf::io::write_json(out_options, cudf::test::get_default_stream()); std::string const expected = R"({"\"a\"":1,"'b'":"abcd"} {"\"a\"":6,"'b'":"b\b\f\n\r\t"} {"\"a\"":1,"'b'":"\"c\""} {"\"a\"":6,"'b'":"\/\\"} )"; - auto const output_string = std::string(out_buffer.data(), out_buffer.size()); - EXPECT_EQ(expected, output_string); + run_test(out_options, expected); } -TEST_F(JsonWriterTest, NullList) +TEST_P(JsonCompressedWriterTest, NullList) { std::string const data = R"( {"a": [null], "b": [[1, 2, 3], [null], [null, null, null], [4, null, 5]]} @@ -380,23 +437,23 @@ TEST_F(JsonWriterTest, NullList) cudf::io::table_metadata mt{result.metadata}; std::vector out_buffer; - auto destination = cudf::io::sink_info(&out_buffer); - auto options_builder = cudf::io::json_writer_options_builder(destination, tbl_view) - .include_nulls(true) - .metadata(mt) - .lines(true) - .na_rep("null"); - - cudf::io::write_json(options_builder.build(), cudf::test::get_default_stream()); + auto destination = cudf::io::sink_info(&out_buffer); + auto out_options = cudf::io::json_writer_options_builder(destination, tbl_view) + .include_nulls(true) + .metadata(mt) + .lines(true) + .na_rep("null") + .build(); + std::string const expected = 
R"({"a":[null],"b":[[1,2,3],[null],[null,null,null],[4,null,5]]} {"a":[2,null,null,3],"b":null} {"a":[null,null,4],"b":[[2,null],null]} {"a":[5,null,null],"b":[null,[3,4,5]]} )"; - EXPECT_EQ(expected, std::string(out_buffer.data(), out_buffer.size())); + run_test(out_options, expected); } -TEST_F(JsonWriterTest, ChunkedNested) +TEST_P(JsonCompressedWriterTest, ChunkedNested) { std::string const data = R"( {"a": 1, "b": -2, "c": { }, "e": [{"f": 1}]} @@ -418,15 +475,15 @@ TEST_F(JsonWriterTest, ChunkedNested) cudf::io::table_metadata mt{result.metadata}; std::vector out_buffer; - auto destination = cudf::io::sink_info(&out_buffer); - auto options_builder = cudf::io::json_writer_options_builder(destination, tbl_view) - .include_nulls(false) - .metadata(mt) - .lines(true) - .na_rep("null") - .rows_per_chunk(8); - - cudf::io::write_json(options_builder.build(), cudf::test::get_default_stream()); + auto destination = cudf::io::sink_info(&out_buffer); + auto out_options = cudf::io::json_writer_options_builder(destination, tbl_view) + .include_nulls(false) + .metadata(mt) + .lines(true) + .na_rep("null") + .rows_per_chunk(8) + .build(); + std::string const expected = R"({"a":1,"b":-2,"c":{},"e":[{"f":1}]} {"a":2,"b":-2,"c":{}} @@ -438,10 +495,10 @@ TEST_F(JsonWriterTest, ChunkedNested) {"a":8,"b":-2,"c":{"d":64},"e":[{"f":8}]} {"a":9,"b":-2,"c":{"d":81},"e":[{"f":9}]} )"; - EXPECT_EQ(expected, std::string(out_buffer.data(), out_buffer.size())); + run_test(out_options, expected); } -TEST_F(JsonWriterTest, StructAllNullCombinations) +TEST_P(JsonCompressedWriterTest, StructAllNullCombinations) { auto const_1_iter = thrust::make_constant_iterator(1); @@ -475,14 +532,14 @@ TEST_F(JsonWriterTest, StructAllNullCombinations) cudf::io::table_metadata mt{{{"a"}, {"b"}, {"c"}, {"d"}, {"e"}}}; std::vector out_buffer; - auto destination = cudf::io::sink_info(&out_buffer); - auto options_builder = cudf::io::json_writer_options_builder(destination, tbl_view) - .include_nulls(false) - .metadata(mt) - .lines(true) - .na_rep("null"); - - cudf::io::write_json(options_builder.build(), cudf::test::get_default_stream()); + auto destination = cudf::io::sink_info(&out_buffer); + auto out_options = cudf::io::json_writer_options_builder(destination, tbl_view) + .include_nulls(false) + .metadata(mt) + .lines(true) + .na_rep("null") + .build(); + std::string const expected = R"({} {"e":1} {"d":1} @@ -516,10 +573,10 @@ TEST_F(JsonWriterTest, StructAllNullCombinations) {"a":1,"b":1,"c":1,"d":1} {"a":1,"b":1,"c":1,"d":1,"e":1} )"; - EXPECT_EQ(expected, std::string(out_buffer.data(), out_buffer.size())); + run_test(out_options, expected); } -TEST_F(JsonWriterTest, Unicode) +TEST_P(JsonCompressedWriterTest, Unicode) { // UTF-8, UTF-16 cudf::test::strings_column_wrapper col1{"\"\\/\b\f\n\r\t", "ராபிட்ஸ்", "$€𐐷𤭢", "C𝞵𝓓𝒻"}; @@ -537,14 +594,13 @@ TEST_F(JsonWriterTest, Unicode) cudf::io::table_metadata mt{{{"col1"}, {"col2"}, {"int16"}}}; std::vector out_buffer; - auto destination = cudf::io::sink_info(&out_buffer); - auto options_builder = cudf::io::json_writer_options_builder(destination, tbl_view) - .include_nulls(true) - .metadata(mt) - .lines(true) - .na_rep("null"); - - cudf::io::write_json(options_builder.build(), cudf::test::get_default_stream()); + auto destination = cudf::io::sink_info(&out_buffer); + auto out_options = cudf::io::json_writer_options_builder(destination, tbl_view) + .include_nulls(true) + .metadata(mt) + .lines(true) + .na_rep("null") + .build(); std::string const expected = 
R"({"col1":"\"\\\/\b\f\n\r\t","col2":"C\u10ae\u226a\u31f3\u434f\u51f9\u6ca6\u738b\u8fbf\u9fb8\ua057\ubbdc\uc2a4\ud3f6\ue4fe\ufd20","int16":null} @@ -552,7 +608,7 @@ TEST_F(JsonWriterTest, Unicode) {"col1":"$\u20ac\ud801\udc37\ud852\udf62","col2":"\ud841\ude28\ud846\udd4c\ud849\uddc9\ud84c\uddca\ud850\udea9\ud854\udd7d\ud858\ude71\ud85f\udd31\ud860\udc72\ud864\udc79\ud869\udc22\ud86c\udded\ud872\udf2d\ud877\udeb7\ud878\udea6\u5c6e","int16":null} {"col1":"C\ud835\udfb5\ud835\udcd3\ud835\udcbb","col2":"\ud883\udf91\ud885\udd08\ud888\udf49","int16":4} )"; - EXPECT_EQ(expected, std::string(out_buffer.data(), out_buffer.size())); + run_test(out_options, expected); } CUDF_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/io/json/nested_json_test.cpp b/cpp/tests/io/json/nested_json_test.cpp index f32aba0e632..e0e955c4f48 100644 --- a/cpp/tests/io/json/nested_json_test.cpp +++ b/cpp/tests/io/json/nested_json_test.cpp @@ -21,24 +21,16 @@ #include #include #include -#include #include -#include #include -#include #include -#include #include -#include #include #include #include #include -#include - -#include #include #include diff --git a/cpp/tests/io/metadata_utilities.cpp b/cpp/tests/io/metadata_utilities.cpp index 84f04f67038..380d66c53f9 100644 --- a/cpp/tests/io/metadata_utilities.cpp +++ b/cpp/tests/io/metadata_utilities.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,10 +14,9 @@ * limitations under the License. */ +#include #include -#include - namespace cudf::test { void expect_metadata_equal(cudf::io::table_input_metadata in_meta, diff --git a/cpp/tests/io/orc_chunked_reader_test.cu b/cpp/tests/io/orc_chunked_reader_test.cu index 8ad1fea649d..5f1aea71f73 100644 --- a/cpp/tests/io/orc_chunked_reader_test.cu +++ b/cpp/tests/io/orc_chunked_reader_test.cu @@ -1358,10 +1358,11 @@ TEST_F(OrcChunkedReaderInputLimitTest, SizeTypeRowsOverflow) int64_t constexpr total_rows = num_rows * num_reps; static_assert(total_rows > std::numeric_limits::max()); - auto const it = cudf::detail::make_counting_transform_iterator(0l, [num_rows](int64_t i) { - return (i % num_rows) % static_cast(std::numeric_limits::max() / 2); - }); - auto const col = data_col(it, it + num_rows); + auto const it = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), [num_rows](int64_t i) { + return (i % num_rows) % static_cast(std::numeric_limits::max() / 2); + }); + auto const col = data_col(it, it + num_rows); auto const chunk_table = cudf::table_view{{col}}; std::vector data_buffer; diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp index 39ba62952b4..fce99187516 100644 --- a/cpp/tests/io/orc_test.cpp +++ b/cpp/tests/io/orc_test.cpp @@ -31,20 +31,20 @@ #include #include #include -#include #include #include #include #include +#include #include template using column_wrapper = - typename std::conditional, - cudf::test::strings_column_wrapper, - cudf::test::fixed_width_column_wrapper>::type; + std::conditional_t, + cudf::test::strings_column_wrapper, + cudf::test::fixed_width_column_wrapper>; using str_col = column_wrapper; using bool_col = column_wrapper; @@ -767,14 +767,14 @@ TEST_F(OrcChunkedWriterTest, Metadata) TEST_F(OrcChunkedWriterTest, Strings) { - bool mask1[] = {true, true, false, true, true, true, true}; + std::array mask1{true, true, false, true, true, true, true}; std::vector 
h_strings1{"four", "score", "and", "seven", "years", "ago", "abcdefgh"}; - str_col strings1(h_strings1.begin(), h_strings1.end(), mask1); + str_col strings1(h_strings1.begin(), h_strings1.end(), mask1.data()); table_view tbl1({strings1}); - bool mask2[] = {false, true, true, true, true, true, true}; + std::array mask2{false, true, true, true, true, true, true}; std::vector h_strings2{"ooooo", "ppppppp", "fff", "j", "cccc", "bbb", "zzzzzzzzzzz"}; - str_col strings2(h_strings2.begin(), h_strings2.end(), mask2); + str_col strings2(h_strings2.begin(), h_strings2.end(), mask2.data()); table_view tbl2({strings2}); auto expected = cudf::concatenate(std::vector({tbl1, tbl2})); @@ -877,26 +877,26 @@ TYPED_TEST(OrcChunkedWriterNumericTypeTest, UnalignedSize) using T = TypeParam; - int num_els = 31; + constexpr int num_els{31}; - bool mask[] = {false, true, true, true, true, true, true, true, true, true, true, - true, true, true, true, true, true, true, true, true, true, true, - true, true, true, true, true, true, true, true, true}; + std::array mask{false, true, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true}; - T c1a[num_els]; - std::fill(c1a, c1a + num_els, static_cast(5)); - T c1b[num_els]; - std::fill(c1b, c1b + num_els, static_cast(6)); - column_wrapper c1a_w(c1a, c1a + num_els, mask); - column_wrapper c1b_w(c1b, c1b + num_els, mask); + std::array c1a; + std::fill(c1a.begin(), c1a.end(), static_cast(5)); + std::array c1b; + std::fill(c1b.begin(), c1b.end(), static_cast(5)); + column_wrapper c1a_w(c1a.begin(), c1a.end(), mask.begin()); + column_wrapper c1b_w(c1b.begin(), c1b.end(), mask.begin()); table_view tbl1({c1a_w, c1b_w}); - T c2a[num_els]; - std::fill(c2a, c2a + num_els, static_cast(8)); - T c2b[num_els]; - std::fill(c2b, c2b + num_els, static_cast(9)); - column_wrapper c2a_w(c2a, c2a + num_els, mask); - column_wrapper c2b_w(c2b, c2b + num_els, mask); + std::array c2a; + std::fill(c2a.begin(), c2a.end(), static_cast(8)); + std::array c2b; + std::fill(c2b.begin(), c2b.end(), static_cast(9)); + column_wrapper c2a_w(c2a.begin(), c2a.end(), mask.begin()); + column_wrapper c2b_w(c2b.begin(), c2b.end(), mask.begin()); table_view tbl2({c2a_w, c2b_w}); auto expected = cudf::concatenate(std::vector({tbl1, tbl2})); @@ -920,26 +920,26 @@ TYPED_TEST(OrcChunkedWriterNumericTypeTest, UnalignedSize2) using T = TypeParam; - int num_els = 33; + constexpr int num_els = 33; - bool mask[] = {false, true, true, true, true, true, true, true, true, true, true, - true, true, true, true, true, true, true, true, true, true, true, - true, true, true, true, true, true, true, true, true, true, true}; + std::array mask{false, true, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true}; - T c1a[num_els]; - std::fill(c1a, c1a + num_els, static_cast(5)); - T c1b[num_els]; - std::fill(c1b, c1b + num_els, static_cast(6)); - column_wrapper c1a_w(c1a, c1a + num_els, mask); - column_wrapper c1b_w(c1b, c1b + num_els, mask); + std::array c1a; + std::fill(c1a.begin(), c1a.end(), static_cast(5)); + std::array c1b; + std::fill(c1b.begin(), c1b.end(), static_cast(5)); + column_wrapper c1a_w(c1a.begin(), c1a.end(), mask.begin()); + column_wrapper c1b_w(c1b.begin(), c1b.end(), mask.begin()); table_view tbl1({c1a_w, c1b_w}); - T c2a[num_els]; - std::fill(c2a, c2a + num_els, 
static_cast(8)); - T c2b[num_els]; - std::fill(c2b, c2b + num_els, static_cast(9)); - column_wrapper c2a_w(c2a, c2a + num_els, mask); - column_wrapper c2b_w(c2b, c2b + num_els, mask); + std::array c2a; + std::fill(c2a.begin(), c2a.end(), static_cast(8)); + std::array c2b; + std::fill(c2b.begin(), c2b.end(), static_cast(9)); + column_wrapper c2a_w(c2a.begin(), c2a.end(), mask.begin()); + column_wrapper c2b_w(c2b.begin(), c2b.end(), mask.begin()); table_view tbl2({c2a_w, c2b_w}); auto expected = cudf::concatenate(std::vector({tbl1, tbl2})); @@ -1140,7 +1140,7 @@ TEST_F(OrcReaderTest, zstdCompressionRegression) } // Test with zstd compressed orc file with high compression ratio. - constexpr uint8_t input_buffer[] = { + constexpr std::array input_buffer{ 0x4f, 0x52, 0x43, 0x5a, 0x00, 0x00, 0x28, 0xb5, 0x2f, 0xfd, 0xa4, 0x34, 0xc7, 0x03, 0x00, 0x74, 0x00, 0x00, 0x18, 0x41, 0xff, 0xaa, 0x02, 0x00, 0xbb, 0xff, 0x45, 0xc8, 0x01, 0x25, 0x30, 0x04, 0x65, 0x00, 0x00, 0x10, 0xaa, 0x1f, 0x02, 0x00, 0x01, 0x29, 0x0b, 0xc7, 0x39, 0xb8, 0x02, 0xcb, @@ -1154,7 +1154,7 @@ TEST_F(OrcReaderTest, zstdCompressionRegression) 0x30, 0x09, 0x82, 0xf4, 0x03, 0x03, 0x4f, 0x52, 0x43, 0x17}; auto source = - cudf::io::source_info(reinterpret_cast(input_buffer), sizeof(input_buffer)); + cudf::io::source_info(reinterpret_cast(input_buffer.data()), input_buffer.size()); cudf::io::orc_reader_options in_opts = cudf::io::orc_reader_options::builder(source).use_index(false); @@ -1357,21 +1357,22 @@ TEST_P(OrcWriterTestStripes, StripeSize) cols.push_back(col.release()); auto const expected = std::make_unique
(std::move(cols)); - auto validate = [&](std::vector const& orc_buffer) { - auto const expected_stripe_num = - std::max(num_rows / size_rows, (num_rows * sizeof(int64_t)) / size_bytes); - auto const stats = cudf::io::read_parsed_orc_statistics( - cudf::io::source_info(orc_buffer.data(), orc_buffer.size())); - EXPECT_EQ(stats.stripes_stats.size(), expected_stripe_num); - - cudf::io::orc_reader_options in_opts = - cudf::io::orc_reader_options::builder( - cudf::io::source_info(orc_buffer.data(), orc_buffer.size())) - .use_index(false); - auto result = cudf::io::read_orc(in_opts); - - CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result.tbl->view()); - }; + auto validate = + [&, &size_bytes = size_bytes, &size_rows = size_rows](std::vector const& orc_buffer) { + auto const expected_stripe_num = + std::max(num_rows / size_rows, (num_rows * sizeof(int64_t)) / size_bytes); + auto const stats = cudf::io::read_parsed_orc_statistics( + cudf::io::source_info(orc_buffer.data(), orc_buffer.size())); + EXPECT_EQ(stats.stripes_stats.size(), expected_stripe_num); + + cudf::io::orc_reader_options in_opts = + cudf::io::orc_reader_options::builder( + cudf::io::source_info(orc_buffer.data(), orc_buffer.size())) + .use_index(false); + auto result = cudf::io::read_orc(in_opts); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result.tbl->view()); + }; { std::vector out_buffer_chunked; diff --git a/cpp/tests/io/parquet_chunked_writer_test.cpp b/cpp/tests/io/parquet_chunked_writer_test.cpp index 282c6f3adad..810fee89c48 100644 --- a/cpp/tests/io/parquet_chunked_writer_test.cpp +++ b/cpp/tests/io/parquet_chunked_writer_test.cpp @@ -124,15 +124,15 @@ TEST_F(ParquetChunkedWriterTest, Strings) { std::vector> cols; - bool mask1[] = {true, true, false, true, true, true, true}; + std::array mask1{true, true, false, true, true, true, true}; std::vector h_strings1{"four", "score", "and", "seven", "years", "ago", "abcdefgh"}; - cudf::test::strings_column_wrapper strings1(h_strings1.begin(), h_strings1.end(), mask1); + cudf::test::strings_column_wrapper strings1(h_strings1.begin(), h_strings1.end(), mask1.data()); cols.push_back(strings1.release()); cudf::table tbl1(std::move(cols)); - bool mask2[] = {false, true, true, true, true, true, true}; + std::array mask2{false, true, true, true, true, true, true}; std::vector h_strings2{"ooooo", "ppppppp", "fff", "j", "cccc", "bbb", "zzzzzzzzzzz"}; - cudf::test::strings_column_wrapper strings2(h_strings2.begin(), h_strings2.end(), mask2); + cudf::test::strings_column_wrapper strings2(h_strings2.begin(), h_strings2.end(), mask2.data()); cols.push_back(strings2.release()); cudf::table tbl2(std::move(cols)); @@ -771,29 +771,29 @@ TYPED_TEST(ParquetChunkedWriterNumericTypeTest, UnalignedSize) using T = TypeParam; - int num_els = 31; + constexpr int num_els = 31; std::vector> cols; - bool mask[] = {false, true, true, true, true, true, true, true, true, true, true, - true, true, true, true, true, true, true, true, true, true, true, + std::array mask{false, true, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true, - true, true, true, true, true, true, true, true, true}; - T c1a[num_els]; - std::fill(c1a, c1a + num_els, static_cast(5)); - T c1b[num_els]; - std::fill(c1b, c1b + num_els, static_cast(6)); - column_wrapper c1a_w(c1a, c1a + num_els, mask); - column_wrapper c1b_w(c1b, c1b + num_els, mask); + true, true, true, true, true, true, true, true, true}; + std::array c1a; + std::fill(c1a.begin(), c1a.end(), 
static_cast(5)); + std::array c1b; + std::fill(c1b.begin(), c1b.end(), static_cast(6)); + column_wrapper c1a_w(c1a.begin(), c1a.end(), mask.begin()); + column_wrapper c1b_w(c1b.begin(), c1b.end(), mask.begin()); cols.push_back(c1a_w.release()); cols.push_back(c1b_w.release()); cudf::table tbl1(std::move(cols)); - T c2a[num_els]; - std::fill(c2a, c2a + num_els, static_cast(8)); - T c2b[num_els]; - std::fill(c2b, c2b + num_els, static_cast(9)); - column_wrapper c2a_w(c2a, c2a + num_els, mask); - column_wrapper c2b_w(c2b, c2b + num_els, mask); + std::array c2a; + std::fill(c2a.begin(), c2a.end(), static_cast(8)); + std::array c2b; + std::fill(c2b.begin(), c2b.end(), static_cast(9)); + column_wrapper c2a_w(c2a.begin(), c2a.end(), mask.begin()); + column_wrapper c2b_w(c2b.begin(), c2b.end(), mask.begin()); cols.push_back(c2a_w.release()); cols.push_back(c2b_w.release()); cudf::table tbl2(std::move(cols)); @@ -819,29 +819,29 @@ TYPED_TEST(ParquetChunkedWriterNumericTypeTest, UnalignedSize2) using T = TypeParam; - int num_els = 33; + constexpr int num_els = 33; std::vector> cols; - bool mask[] = {false, true, true, true, true, true, true, true, true, true, true, - true, true, true, true, true, true, true, true, true, true, true, - true, true, true, true, true, true, true, true, true, true, true}; + std::array mask{false, true, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true}; - T c1a[num_els]; - std::fill(c1a, c1a + num_els, static_cast(5)); - T c1b[num_els]; - std::fill(c1b, c1b + num_els, static_cast(6)); - column_wrapper c1a_w(c1a, c1a + num_els, mask); - column_wrapper c1b_w(c1b, c1b + num_els, mask); + std::array c1a; + std::fill(c1a.begin(), c1a.end(), static_cast(5)); + std::array c1b; + std::fill(c1b.begin(), c1b.end(), static_cast(6)); + column_wrapper c1a_w(c1a.begin(), c1a.end(), mask.begin()); + column_wrapper c1b_w(c1b.begin(), c1b.end(), mask.begin()); cols.push_back(c1a_w.release()); cols.push_back(c1b_w.release()); cudf::table tbl1(std::move(cols)); - T c2a[num_els]; - std::fill(c2a, c2a + num_els, static_cast(8)); - T c2b[num_els]; - std::fill(c2b, c2b + num_els, static_cast(9)); - column_wrapper c2a_w(c2a, c2a + num_els, mask); - column_wrapper c2b_w(c2b, c2b + num_els, mask); + std::array c2a; + std::fill(c2a.begin(), c2a.end(), static_cast(8)); + std::array c2b; + std::fill(c2b.begin(), c2b.end(), static_cast(9)); + column_wrapper c2a_w(c2a.begin(), c2a.end(), mask.begin()); + column_wrapper c2b_w(c2b.begin(), c2b.end(), mask.begin()); cols.push_back(c2a_w.release()); cols.push_back(c2b_w.release()); cudf::table tbl2(std::move(cols)); diff --git a/cpp/tests/io/parquet_common.cpp b/cpp/tests/io/parquet_common.cpp index 3dd5ad145ea..a1b8677eac8 100644 --- a/cpp/tests/io/parquet_common.cpp +++ b/cpp/tests/io/parquet_common.cpp @@ -483,10 +483,10 @@ template std::enable_if_t, cudf::test::strings_column_wrapper> ascending() { - char buf[10]; + std::array buf; auto elements = cudf::detail::make_counting_transform_iterator(0, [&buf](auto i) { - sprintf(buf, "%09d", i); - return std::string(buf); + sprintf(buf.data(), "%09d", i); + return std::string(buf.data()); }); return cudf::test::strings_column_wrapper(elements, elements + num_ordered_rows); } @@ -495,10 +495,10 @@ template std::enable_if_t, cudf::test::strings_column_wrapper> descending() { - char buf[10]; + std::array buf; auto elements = cudf::detail::make_counting_transform_iterator(0,
[&buf](auto i) { - sprintf(buf, "%09d", num_ordered_rows - i); - return std::string(buf); + sprintf(buf.data(), "%09d", static_cast(num_ordered_rows - i)); + return std::string(buf.data()); }); return cudf::test::strings_column_wrapper(elements, elements + num_ordered_rows); } @@ -507,10 +507,10 @@ template std::enable_if_t, cudf::test::strings_column_wrapper> unordered() { - char buf[10]; + std::array buf; auto elements = cudf::detail::make_counting_transform_iterator(0, [&buf](auto i) { - sprintf(buf, "%09d", (i % 2 == 0) ? i : (num_ordered_rows - i)); - return std::string(buf); + sprintf(buf.data(), "%09d", (i % 2 == 0) ? i : (num_ordered_rows - i)); + return std::string(buf.data()); }); return cudf::test::strings_column_wrapper(elements, elements + num_ordered_rows); } @@ -744,7 +744,7 @@ int32_t compare(T& v1, T& v2) int32_t compare_binary(std::vector const& v1, std::vector const& v2, cudf::io::parquet::detail::Type ptype, - cuda::std::optional const& ctype) + std::optional const& ctype) { auto ctype_val = ctype.value_or(cudf::io::parquet::detail::UNKNOWN); switch (ptype) { diff --git a/cpp/tests/io/parquet_common.hpp b/cpp/tests/io/parquet_common.hpp index bc6145d77da..d66aa3bde9d 100644 --- a/cpp/tests/io/parquet_common.hpp +++ b/cpp/tests/io/parquet_common.hpp @@ -22,22 +22,20 @@ #include #include -#include #include #include #include #include -#include #include #include template using column_wrapper = - typename std::conditional, - cudf::test::strings_column_wrapper, - cudf::test::fixed_width_column_wrapper>::type; + std::conditional_t, + cudf::test::strings_column_wrapper, + cudf::test::fixed_width_column_wrapper>; using column = cudf::column; using table = cudf::table; using table_view = cudf::table_view; @@ -172,7 +170,7 @@ std::pair create_parquet_typed_with_stats(std::string int32_t compare_binary(std::vector const& v1, std::vector const& v2, cudf::io::parquet::detail::Type ptype, - cuda::std::optional const& ctype); + std::optional const& ctype); void expect_compression_stats_empty(std::shared_ptr stats); diff --git a/cpp/tests/io/parquet_misc_test.cpp b/cpp/tests/io/parquet_misc_test.cpp index 01027d04658..d66f685cd9c 100644 --- a/cpp/tests/io/parquet_misc_test.cpp +++ b/cpp/tests/io/parquet_misc_test.cpp @@ -20,8 +20,8 @@ #include #include -#include -#include + +#include //////////////////////////////// // delta encoding writer tests @@ -96,7 +96,7 @@ TYPED_TEST(ParquetWriterDeltaTest, SupportedDeltaListSliced) // list constexpr int vals_per_row = 4; auto c1_offset_iter = cudf::detail::make_counting_transform_iterator( - 0, [vals_per_row](cudf::size_type idx) { return idx * vals_per_row; }); + 0, [](cudf::size_type idx) { return idx * vals_per_row; }); cudf::test::fixed_width_column_wrapper c1_offsets(c1_offset_iter, c1_offset_iter + num_rows + 1); cudf::test::fixed_width_column_wrapper c1_vals( @@ -225,10 +225,9 @@ TYPED_TEST(ParquetWriterComparableTypeTest, ThreeColumnSorted) // now check that the boundary order for chunk 1 is ascending, // chunk 2 is descending, and chunk 3 is unordered - cudf::io::parquet::detail::BoundaryOrder expected_orders[] = { - cudf::io::parquet::detail::BoundaryOrder::ASCENDING, - cudf::io::parquet::detail::BoundaryOrder::DESCENDING, - cudf::io::parquet::detail::BoundaryOrder::UNORDERED}; + std::array expected_orders{cudf::io::parquet::detail::BoundaryOrder::ASCENDING, + cudf::io::parquet::detail::BoundaryOrder::DESCENDING, + cudf::io::parquet::detail::BoundaryOrder::UNORDERED}; for (std::size_t i = 0; i < columns.size(); i++) { auto const ci = 
read_column_index(source, columns[i]); diff --git a/cpp/tests/io/parquet_reader_test.cpp b/cpp/tests/io/parquet_reader_test.cpp index 6c61535359f..177e6163d4f 100644 --- a/cpp/tests/io/parquet_reader_test.cpp +++ b/cpp/tests/io/parquet_reader_test.cpp @@ -29,6 +29,10 @@ #include #include +#include + +#include + TEST_F(ParquetReaderTest, UserBounds) { // trying to read more rows than there are should result in @@ -569,7 +573,8 @@ TEST_F(ParquetReaderTest, DecimalRead) This test is a temporary test until python gains the ability to write decimal, so we're embedding a parquet file directly into the code here to prevent issues with finding the file */ - unsigned char const decimals_parquet[] = { + constexpr unsigned int decimals_parquet_len = 2366; + std::array const decimals_parquet{ 0x50, 0x41, 0x52, 0x31, 0x15, 0x00, 0x15, 0xb0, 0x03, 0x15, 0xb8, 0x03, 0x2c, 0x15, 0x6a, 0x15, 0x00, 0x15, 0x06, 0x15, 0x08, 0x1c, 0x36, 0x02, 0x28, 0x04, 0x7f, 0x96, 0x98, 0x00, 0x18, 0x04, 0x81, 0x69, 0x67, 0xff, 0x00, 0x00, 0x00, 0xd8, 0x01, 0xf0, 0xd7, 0x04, 0x00, @@ -728,10 +733,10 @@ TEST_F(ParquetReaderTest, DecimalRead) 0x30, 0x36, 0x30, 0x36, 0x39, 0x65, 0x35, 0x30, 0x63, 0x39, 0x62, 0x37, 0x39, 0x37, 0x30, 0x62, 0x65, 0x62, 0x64, 0x31, 0x29, 0x19, 0x3c, 0x1c, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0xd3, 0x02, 0x00, 0x00, 0x50, 0x41, 0x52, 0x31}; - unsigned int decimals_parquet_len = 2366; - cudf::io::parquet_reader_options read_opts = cudf::io::parquet_reader_options::builder( - cudf::io::source_info{reinterpret_cast(decimals_parquet), decimals_parquet_len}); + cudf::io::parquet_reader_options read_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{ + reinterpret_cast(decimals_parquet.data()), decimals_parquet_len}); auto result = cudf::io::read_parquet(read_opts); auto validity = @@ -739,7 +744,7 @@ TEST_F(ParquetReaderTest, DecimalRead) EXPECT_EQ(result.tbl->view().num_columns(), 3); - int32_t col0_data[] = { + std::array col0_data{ -2354584, -190275, 8393572, 6446515, -5687920, -1843550, -6897687, -6780385, 3428529, 5842056, -4312278, -4450603, -7516141, 2974667, -4288640, 1065090, -9410428, 7891355, 1076244, -1975984, 6999466, 2666959, 9262967, 7931374, -1370640, 451074, 8799111, @@ -753,29 +758,28 @@ TEST_F(ParquetReaderTest, DecimalRead) std::begin(col0_data), std::end(col0_data), validity, numeric::scale_type{-4}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->view().column(0), col0); - int64_t col1_data[] = {29274040266581, -17210335917753, -58420730139037, - 68073792696254, 2236456014294, 13704555677045, - -70797090469548, -52248605513407, -68976081919961, - -34277313883112, 97774730521689, 21184241014572, - -670882460254, -40862944054399, -24079852370612, - -88670167797498, -84007574359403, -71843004533519, - -55538016554201, 3491435293032, -29085437167297, - 36901882672273, -98622066122568, -13974902998457, - 86712597643378, -16835133643735, -94759096142232, - 30708340810940, 79086853262082, 78923696440892, - -76316597208589, 37247268714759, 80303592631774, - 57790350050889, 19387319851064, -33186875066145, - 69701203023404, -7157433049060, -7073790423437, - 92769171617714, -75127120182184, -951893180618, - 64927618310150, -53875897154023, -16168039035569, - -24273449166429, -30359781249192, 35639397345991, - 45844829680593, 71401416837149, 0, - -99999999999999, 99999999999999}; - - EXPECT_EQ(static_cast(result.tbl->view().column(1).size()), - sizeof(col1_data) / sizeof(col1_data[0])); + std::array col1_data{29274040266581, -17210335917753, -58420730139037, 
+ 68073792696254, 2236456014294, 13704555677045, + -70797090469548, -52248605513407, -68976081919961, + -34277313883112, 97774730521689, 21184241014572, + -670882460254, -40862944054399, -24079852370612, + -88670167797498, -84007574359403, -71843004533519, + -55538016554201, 3491435293032, -29085437167297, + 36901882672273, -98622066122568, -13974902998457, + 86712597643378, -16835133643735, -94759096142232, + 30708340810940, 79086853262082, 78923696440892, + -76316597208589, 37247268714759, 80303592631774, + 57790350050889, 19387319851064, -33186875066145, + 69701203023404, -7157433049060, -7073790423437, + 92769171617714, -75127120182184, -951893180618, + 64927618310150, -53875897154023, -16168039035569, + -24273449166429, -30359781249192, 35639397345991, + 45844829680593, 71401416837149, 0, + -99999999999999, 99999999999999}; + + EXPECT_EQ(static_cast(result.tbl->view().column(1).size()), col1_data.size()); cudf::test::fixed_point_column_wrapper col1( - std::begin(col1_data), std::end(col1_data), validity, numeric::scale_type{-5}); + col1_data.begin(), col1_data.end(), validity, numeric::scale_type{-5}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->view().column(1), col1); cudf::io::parquet_reader_options read_strict_opts = read_opts; @@ -786,7 +790,7 @@ TEST_F(ParquetReaderTest, DecimalRead) // dec7p3: Decimal(precision=7, scale=3) backed by FIXED_LENGTH_BYTE_ARRAY(length = 4) // dec12p11: Decimal(precision=12, scale=11) backed by FIXED_LENGTH_BYTE_ARRAY(length = 6) // dec20p1: Decimal(precision=20, scale=1) backed by FIXED_LENGTH_BYTE_ARRAY(length = 9) - unsigned char const fixed_len_bytes_decimal_parquet[] = { + std::array const fixed_len_bytes_decimal_parquet{ 0x50, 0x41, 0x52, 0x31, 0x15, 0x00, 0x15, 0xA8, 0x01, 0x15, 0xAE, 0x01, 0x2C, 0x15, 0x28, 0x15, 0x00, 0x15, 0x06, 0x15, 0x08, 0x1C, 0x36, 0x02, 0x28, 0x04, 0x00, 0x97, 0x45, 0x72, 0x18, 0x04, 0x00, 0x01, 0x81, 0x3B, 0x00, 0x00, 0x00, 0x54, 0xF0, 0x53, 0x04, 0x00, 0x00, @@ -875,75 +879,72 @@ TEST_F(ParquetReaderTest, DecimalRead) cudf::io::parquet_reader_options read_opts = cudf::io::parquet_reader_options::builder(cudf::io::source_info{ - reinterpret_cast(fixed_len_bytes_decimal_parquet), parquet_len}); + reinterpret_cast(fixed_len_bytes_decimal_parquet.data()), parquet_len}); auto result = cudf::io::read_parquet(read_opts); EXPECT_EQ(result.tbl->view().num_columns(), 3); - auto validity_c0 = cudf::test::iterators::nulls_at({19}); - int32_t col0_data[] = {6361295, 698632, 7821423, 7073444, 9631892, 3021012, 5195059, - 9913714, 901749, 7776938, 3186566, 4955569, 5131067, 98619, - 2282579, 7521455, 4430706, 1937859, 4532040, 0}; + auto validity_c0 = cudf::test::iterators::nulls_at({19}); + std::array col0_data{6361295, 698632, 7821423, 7073444, 9631892, 3021012, 5195059, + 9913714, 901749, 7776938, 3186566, 4955569, 5131067, 98619, + 2282579, 7521455, 4430706, 1937859, 4532040, 0}; - EXPECT_EQ(static_cast(result.tbl->view().column(0).size()), - sizeof(col0_data) / sizeof(col0_data[0])); + EXPECT_EQ(static_cast(result.tbl->view().column(0).size()), col0_data.size()); cudf::test::fixed_point_column_wrapper col0( - std::begin(col0_data), std::end(col0_data), validity_c0, numeric::scale_type{-3}); + col0_data.begin(), col0_data.end(), validity_c0, numeric::scale_type{-3}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->view().column(0), col0); - auto validity_c1 = cudf::test::iterators::nulls_at({18}); - int64_t col1_data[] = {361378026250, - 30646804862, - 429930238629, - 418758703536, - 895494171113, - 435283865083, - 809096053722, - 
-999999999999, - 426465099333, - 526684574144, - 826310892810, - 584686967589, - 113822282951, - 409236212092, - 420631167535, - 918438386086, - -999999999999, - 489053889147, - 0, - 363993164092}; - - EXPECT_EQ(static_cast(result.tbl->view().column(1).size()), - sizeof(col1_data) / sizeof(col1_data[0])); + auto validity_c1 = cudf::test::iterators::nulls_at({18}); + std::array col1_data{361378026250, + 30646804862, + 429930238629, + 418758703536, + 895494171113, + 435283865083, + 809096053722, + -999999999999, + 426465099333, + 526684574144, + 826310892810, + 584686967589, + 113822282951, + 409236212092, + 420631167535, + 918438386086, + -999999999999, + 489053889147, + 0, + 363993164092}; + + EXPECT_EQ(static_cast(result.tbl->view().column(1).size()), col1_data.size()); cudf::test::fixed_point_column_wrapper col1( - std::begin(col1_data), std::end(col1_data), validity_c1, numeric::scale_type{-11}); + col1_data.begin(), col1_data.end(), validity_c1, numeric::scale_type{-11}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->view().column(1), col1); - auto validity_c2 = cudf::test::iterators::nulls_at({6, 14}); - __int128_t col2_data[] = {9078697037144433659, - 9050770539577117612, - 2358363961733893636, - 1566059559232276662, - 6658306200002735268, - 4967909073046397334, - 0, - 7235588493887532473, - 5023160741463849572, - 2765173712965988273, - 3880866513515749646, - 5019704400576359500, - 5544435986818825655, - 7265381725809874549, - 0, - 1576192427381240677, - 2828305195087094598, - 260308667809395171, - 2460080200895288476, - 2718441925197820439}; - - EXPECT_EQ(static_cast(result.tbl->view().column(2).size()), - sizeof(col2_data) / sizeof(col2_data[0])); + auto validity_c2 = cudf::test::iterators::nulls_at({6, 14}); + std::array<__int128_t, 20> col2_data{9078697037144433659, + 9050770539577117612, + 2358363961733893636, + 1566059559232276662, + 6658306200002735268, + 4967909073046397334, + 0, + 7235588493887532473, + 5023160741463849572, + 2765173712965988273, + 3880866513515749646, + 5019704400576359500, + 5544435986818825655, + 7265381725809874549, + 0, + 1576192427381240677, + 2828305195087094598, + 260308667809395171, + 2460080200895288476, + 2718441925197820439}; + + EXPECT_EQ(static_cast(result.tbl->view().column(2).size()), col2_data.size()); cudf::test::fixed_point_column_wrapper<__int128_t> col2( - std::begin(col2_data), std::end(col2_data), validity_c2, numeric::scale_type{-1}); + col2_data.begin(), col2_data.end(), validity_c2, numeric::scale_type{-1}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->view().column(2), col2); } } @@ -1190,15 +1191,12 @@ TEST_F(ParquetReaderTest, NestingOptimizationTest) cudf::test::fixed_width_column_wrapper values(value_iter, value_iter + num_values, validity); // ~256k values with num_nesting_levels = 16 - int total_values_produced = num_values; - auto prev_col = values.release(); + auto prev_col = values.release(); for (int idx = 0; idx < num_nesting_levels; idx++) { - auto const depth = num_nesting_levels - idx; auto const num_rows = (1 << (num_nesting_levels - idx)); auto offsets_iter = cudf::detail::make_counting_transform_iterator( - 0, [depth, rows_per_level](cudf::size_type i) { return i * rows_per_level; }); - total_values_produced += (num_rows + 1); + 0, [](cudf::size_type i) { return i * rows_per_level; }); cudf::test::fixed_width_column_wrapper offsets(offsets_iter, offsets_iter + num_rows + 1); @@ -1221,7 +1219,7 @@ TEST_F(ParquetReaderTest, NestingOptimizationTest) TEST_F(ParquetReaderTest, SingleLevelLists) { - unsigned char list_bytes[] 
= { + std::array list_bytes{ 0x50, 0x41, 0x52, 0x31, 0x15, 0x00, 0x15, 0x28, 0x15, 0x28, 0x15, 0xa7, 0xce, 0x91, 0x8c, 0x06, 0x1c, 0x15, 0x04, 0x15, 0x00, 0x15, 0x06, 0x15, 0x06, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x03, 0x02, 0x02, 0x00, 0x00, 0x00, 0x03, 0x03, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x15, @@ -1239,7 +1237,7 @@ TEST_F(ParquetReaderTest, SingleLevelLists) // read single level list reproducing parquet file cudf::io::parquet_reader_options read_opts = cudf::io::parquet_reader_options::builder( - cudf::io::source_info{reinterpret_cast(list_bytes), sizeof(list_bytes)}); + cudf::io::source_info{reinterpret_cast(list_bytes.data()), list_bytes.size()}); auto table = cudf::io::read_parquet(read_opts); auto const c0 = table.tbl->get_column(0); @@ -1252,7 +1250,7 @@ TEST_F(ParquetReaderTest, SingleLevelLists) TEST_F(ParquetReaderTest, ChunkedSingleLevelLists) { - unsigned char list_bytes[] = { + std::array list_bytes{ 0x50, 0x41, 0x52, 0x31, 0x15, 0x00, 0x15, 0x28, 0x15, 0x28, 0x15, 0xa7, 0xce, 0x91, 0x8c, 0x06, 0x1c, 0x15, 0x04, 0x15, 0x00, 0x15, 0x06, 0x15, 0x06, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x03, 0x02, 0x02, 0x00, 0x00, 0x00, 0x03, 0x03, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x15, @@ -1271,7 +1269,7 @@ TEST_F(ParquetReaderTest, ChunkedSingleLevelLists) auto reader = cudf::io::chunked_parquet_reader( 1L << 31, cudf::io::parquet_reader_options::builder( - cudf::io::source_info{reinterpret_cast(list_bytes), sizeof(list_bytes)})); + cudf::io::source_info{reinterpret_cast(list_bytes.data()), list_bytes.size()})); int iterations = 0; while (reader.has_next() && iterations < 10) { auto chunk = reader.read_chunk(); @@ -1932,7 +1930,7 @@ TEST_F(ParquetReaderTest, FilterFloatNAN) TEST_F(ParquetReaderTest, RepeatedNoAnnotations) { - constexpr unsigned char repeated_bytes[] = { + constexpr std::array repeated_bytes{ 0x50, 0x41, 0x52, 0x31, 0x15, 0x04, 0x15, 0x30, 0x15, 0x30, 0x4c, 0x15, 0x0c, 0x15, 0x00, 0x12, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x15, 0x00, 0x15, 0x0a, 0x15, 0x0a, @@ -1976,9 +1974,9 @@ TEST_F(ParquetReaderTest, RepeatedNoAnnotations) 0x61, 0x38, 0x33, 0x39, 0x31, 0x36, 0x63, 0x36, 0x39, 0x62, 0x35, 0x65, 0x29, 0x00, 0x32, 0x01, 0x00, 0x00, 0x50, 0x41, 0x52, 0x31}; - auto read_opts = cudf::io::parquet_reader_options::builder( - cudf::io::source_info{reinterpret_cast(repeated_bytes), sizeof(repeated_bytes)}); - auto result = cudf::io::read_parquet(read_opts); + auto read_opts = cudf::io::parquet_reader_options::builder(cudf::io::source_info{ + reinterpret_cast(repeated_bytes.data()), repeated_bytes.size()}); + auto result = cudf::io::read_parquet(read_opts); EXPECT_EQ(result.tbl->view().column(0).size(), 6); EXPECT_EQ(result.tbl->view().num_columns(), 2); @@ -2728,3 +2726,40 @@ TYPED_TEST(ParquetReaderPredicatePushdownTest, FilterTyped) EXPECT_EQ(result_table.num_columns(), expected->num_columns()); CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result_table); } + +// The test below requires several minutes to complete with memcheck, thus it is disabled by +// default. 
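+// NOTE: GoogleTest skips tests carrying the DISABLED_ prefix by default; to run this
+// one on demand, pass --gtest_also_run_disabled_tests to the test binary, optionally
+// narrowed with a filter such as --gtest_filter='*ListsWideTable*'.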
+TEST_F(ParquetReaderTest, DISABLED_ListsWideTable) +{ + auto constexpr num_rows = 2; + auto constexpr num_cols = 26'755; // for slightly over 2B keys + auto constexpr seed = 0xceed; + + std::mt19937 engine{seed}; + + auto list_list = make_parquet_list_list_col(0, num_rows, 1, 1, false); + auto list_list_nulls = make_parquet_list_list_col(0, num_rows, 1, 1, true); + + // switch between nullable and non-nullable + std::vector cols(num_cols); + bool with_nulls = false; + std::generate_n(cols.begin(), num_cols, [&]() { + auto const view = with_nulls ? list_list_nulls->view() : list_list->view(); + with_nulls = not with_nulls; + return view; + }); + + cudf::table_view expected(cols); + + // Use a host buffer for faster I/O + std::vector buffer; + auto const out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{&buffer}, expected).build(); + cudf::io::write_parquet(out_opts); + + cudf::io::parquet_reader_options default_in_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info(buffer.data(), buffer.size())); + auto const [result, _] = cudf::io::read_parquet(default_in_opts); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result->view()); +} diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp index be2ecd56424..5c3c8342cd2 100644 --- a/cpp/tests/io/parquet_test.cpp +++ b/cpp/tests/io/parquet_test.cpp @@ -14,7 +14,6 @@ * limitations under the License. */ -#include #include // NOTE: this file exists to define the parquet test's `main()` function. diff --git a/cpp/tests/io/parquet_v2_test.cpp b/cpp/tests/io/parquet_v2_test.cpp index 9e66fc9409f..a0b48f54854 100644 --- a/cpp/tests/io/parquet_v2_test.cpp +++ b/cpp/tests/io/parquet_v2_test.cpp @@ -23,6 +23,8 @@ #include +#include + using cudf::test::iterators::no_nulls; // Base test fixture for V2 header tests @@ -693,9 +695,9 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndex) // fixed length strings auto str1_elements = cudf::detail::make_counting_transform_iterator(0, [](auto i) { - char buf[30]; - sprintf(buf, "%012d", i); - return std::string(buf); + std::array buf; + sprintf(buf.data(), "%012d", i); + return std::string(buf.data()); }); auto col0 = cudf::test::strings_column_wrapper(str1_elements, str1_elements + num_rows); @@ -715,9 +717,9 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndex) // mixed length strings auto str2_elements = cudf::detail::make_counting_transform_iterator(0, [](auto i) { - char buf[30]; - sprintf(buf, "%d", i); - return std::string(buf); + std::array buf; + sprintf(buf.data(), "%d", i); + return std::string(buf.data()); }); auto col7 = cudf::test::strings_column_wrapper(str2_elements, str2_elements + num_rows); @@ -787,9 +789,9 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexNulls) // fixed length strings auto str1_elements = cudf::detail::make_counting_transform_iterator(0, [](auto i) { - char buf[30]; - sprintf(buf, "%012d", i); - return std::string(buf); + std::array buf; + sprintf(buf.data(), "%012d", i); + return std::string(buf.data()); }); auto col0 = cudf::test::strings_column_wrapper(str1_elements, str1_elements + num_rows); @@ -819,9 +821,9 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexNulls) // mixed length strings auto str2_elements = cudf::detail::make_counting_transform_iterator(0, [](auto i) { - char buf[30]; - sprintf(buf, "%d", i); - return std::string(buf); + std::array buf; + sprintf(buf.data(), "%d", i); + return std::string(buf.data()); }); auto col7 = cudf::test::strings_column_wrapper(str2_elements, str2_elements + num_rows, valids); @@ -897,9 
+899,9 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexNullColumn) // fixed length strings auto str1_elements = cudf::detail::make_counting_transform_iterator(0, [](auto i) { - char buf[30]; - sprintf(buf, "%012d", i); - return std::string(buf); + std::array buf; + sprintf(buf.data(), "%012d", i); + return std::string(buf.data()); }); auto col0 = cudf::test::strings_column_wrapper(str1_elements, str1_elements + num_rows); @@ -914,9 +916,9 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexNullColumn) // mixed length strings auto str2_elements = cudf::detail::make_counting_transform_iterator(0, [](auto i) { - char buf[30]; - sprintf(buf, "%d", i); - return std::string(buf); + std::array buf; + sprintf(buf.data(), "%d", i); + return std::string(buf.data()); }); auto col3 = cudf::test::strings_column_wrapper(str2_elements, str2_elements + num_rows); @@ -1034,7 +1036,7 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexStruct) // hard coded schema indices. // TODO find a way to do this without magic - size_t const colidxs[] = {1, 3, 4, 5, 8}; + constexpr std::array colidxs{1, 3, 4, 5, 8}; for (size_t r = 0; r < fmd.row_groups.size(); r++) { auto const& rg = fmd.row_groups[r]; for (size_t c = 0; c < rg.columns.size(); c++) { @@ -1129,7 +1131,7 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexStructNulls) // col1 will have num_ordered_rows / 2 nulls total // col2 will have num_ordered_rows / 3 nulls total // col3 will have num_ordered_rows / 4 nulls total - int const null_mods[] = {0, 2, 3, 4}; + constexpr std::array null_mods{0, 2, 3, 4}; for (auto const& rg : fmd.row_groups) { for (size_t c = 0; c < rg.columns.size(); c++) { @@ -1299,25 +1301,25 @@ TEST_P(ParquetV2Test, CheckColumnIndexListWithNulls) table_view expected({col0, col1, col2, col3, col4, col5, col6, col7}); - int64_t const expected_null_counts[] = {4, 4, 4, 6, 4, 6, 4, 5, 11}; - std::vector const expected_def_hists[] = {{1, 1, 2, 3}, - {1, 3, 10}, - {1, 1, 2, 10}, - {1, 1, 2, 2, 8}, - {1, 1, 1, 1, 10}, - {1, 1, 1, 1, 2, 8}, - {1, 3, 9}, - {1, 3, 1, 8}, - {1, 0, 4, 1, 1, 4, 9}}; - std::vector const expected_rep_hists[] = {{4, 3}, - {4, 4, 6}, - {4, 4, 6}, - {4, 4, 6}, - {4, 4, 6}, - {4, 4, 6}, - {4, 4, 5}, - {4, 4, 5}, - {4, 6, 2, 8}}; + std::array expected_null_counts{4, 4, 4, 6, 4, 6, 4, 5, 11}; + std::vector> const expected_def_hists = {{1, 1, 2, 3}, + {1, 3, 10}, + {1, 1, 2, 10}, + {1, 1, 2, 2, 8}, + {1, 1, 1, 1, 10}, + {1, 1, 1, 1, 2, 8}, + {1, 3, 9}, + {1, 3, 1, 8}, + {1, 0, 4, 1, 1, 4, 9}}; + std::vector> const expected_rep_hists = {{4, 3}, + {4, 4, 6}, + {4, 4, 6}, + {4, 4, 6}, + {4, 4, 6}, + {4, 4, 6}, + {4, 4, 5}, + {4, 4, 5}, + {4, 6, 2, 8}}; auto const filepath = temp_env->get_temp_filepath("ColumnIndexListWithNulls.parquet"); auto out_opts = cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) diff --git a/cpp/tests/io/parquet_writer_test.cpp b/cpp/tests/io/parquet_writer_test.cpp index c8100038942..6c5e9cdf07a 100644 --- a/cpp/tests/io/parquet_writer_test.cpp +++ b/cpp/tests/io/parquet_writer_test.cpp @@ -31,6 +31,7 @@ #include #include +#include #include using cudf::test::iterators::no_nulls; @@ -289,7 +290,8 @@ class custom_test_data_sink : public cudf::io::data_sink { CUDF_EXPECTS(outfile_.is_open(), "Cannot open output file"); } - ~custom_test_data_sink() override { flush(); } + // Marked as NOLINT because we are calling a virtual method in the destructor + ~custom_test_data_sink() override { flush(); } // NOLINT void host_write(void const* data, size_t size) override { @@ -879,53 +881,52 @@ 
TEST_F(ParquetWriterTest, Decimal128Stats) TEST_F(ParquetWriterTest, CheckColumnIndexTruncation) { - char const* coldata[] = { - // in-range 7 bit. should truncate to "yyyyyyyz" - "yyyyyyyyy", - // max 7 bit. should truncate to "x7fx7fx7fx7fx7fx7fx7fx80", since it's - // considered binary, not UTF-8. If UTF-8 it should not truncate. - "\x7f\x7f\x7f\x7f\x7f\x7f\x7f\x7f\x7f", - // max binary. this should not truncate - "\xff\xff\xff\xff\xff\xff\xff\xff\xff", - // in-range 2-byte UTF8 (U+00E9). should truncate to "éééê" - "ééééé", - // max 2-byte UTF8 (U+07FF). should not truncate - "߿߿߿߿߿", - // in-range 3-byte UTF8 (U+0800). should truncate to "ࠀࠁ" - "ࠀࠀࠀ", - // max 3-byte UTF8 (U+FFFF). should not truncate - "\xef\xbf\xbf\xef\xbf\xbf\xef\xbf\xbf", - // in-range 4-byte UTF8 (U+10000). should truncate to "𐀀𐀁" - "𐀀𐀀𐀀", - // max unicode (U+10FFFF). should truncate to \xf4\x8f\xbf\xbf\xf4\x90\x80\x80, - // which is no longer valid unicode, but is still ok UTF-8??? - "\xf4\x8f\xbf\xbf\xf4\x8f\xbf\xbf\xf4\x8f\xbf\xbf", - // max 4-byte UTF8 (U+1FFFFF). should not truncate - "\xf7\xbf\xbf\xbf\xf7\xbf\xbf\xbf\xf7\xbf\xbf\xbf"}; + std::array coldata{// in-range 7 bit. should truncate to "yyyyyyyz" + "yyyyyyyyy", + // max 7 bit. should truncate to "x7fx7fx7fx7fx7fx7fx7fx80", since it's + // considered binary, not UTF-8. If UTF-8 it should not truncate. + "\x7f\x7f\x7f\x7f\x7f\x7f\x7f\x7f\x7f", + // max binary. this should not truncate + "\xff\xff\xff\xff\xff\xff\xff\xff\xff", + // in-range 2-byte UTF8 (U+00E9). should truncate to "éééê" + "ééééé", + // max 2-byte UTF8 (U+07FF). should not truncate + "߿߿߿߿߿", + // in-range 3-byte UTF8 (U+0800). should truncate to "ࠀࠁ" + "ࠀࠀࠀ", + // max 3-byte UTF8 (U+FFFF). should not truncate + "\xef\xbf\xbf\xef\xbf\xbf\xef\xbf\xbf", + // in-range 4-byte UTF8 (U+10000). should truncate to "𐀀𐀁" + "𐀀𐀀𐀀", + // max unicode (U+10FFFF). should truncate to \xf4\x8f\xbf\xbf\xf4\x90\x80\x80, + // which is no longer valid unicode, but is still ok UTF-8??? + "\xf4\x8f\xbf\xbf\xf4\x8f\xbf\xbf\xf4\x8f\xbf\xbf", + // max 4-byte UTF8 (U+1FFFFF). should not truncate + "\xf7\xbf\xbf\xbf\xf7\xbf\xbf\xbf\xf7\xbf\xbf\xbf"}; // NOTE: UTF8 min is initialized with 0xf7bfbfbf. Binary values larger // than that will not become minimum value (when written as UTF-8). 
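// The expected values below follow the usual column-index truncation rule: a minimum
// may simply be cut short, while a maximum must be cut short and then "incremented"
// so it still compares greater than or equal to every value in the column. A minimal
// byte-wise sketch of that increment step, assuming plain unsigned-byte ordering (the
// real implementation additionally respects UTF-8 code point boundaries, which is
// exactly what these cases exercise):
//
//   std::string increment_max(std::string s) {
//     while (!s.empty()) {
//       auto const last = static_cast<unsigned char>(s.back());
//       if (last != 0xFF) {
//         s.back() = static_cast<char>(last + 1);  // bump the last byte that can grow
//         return s;
//       }
//       s.pop_back();  // carry past 0xFF bytes, which cannot be incremented
//     }
//     return s;  // all bytes were 0xFF: no finite upper bound exists
//   }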
- char const* truncated_min[] = {"yyyyyyyy", - "\x7f\x7f\x7f\x7f\x7f\x7f\x7f\x7f", - "\xf7\xbf\xbf\xbf", - "éééé", - "߿߿߿߿", - "ࠀࠀ", - "\xef\xbf\xbf\xef\xbf\xbf", - "𐀀𐀀", - "\xf4\x8f\xbf\xbf\xf4\x8f\xbf\xbf", - "\xf7\xbf\xbf\xbf"}; - - char const* truncated_max[] = {"yyyyyyyz", - "\x7f\x7f\x7f\x7f\x7f\x7f\x7f\x80", - "\xff\xff\xff\xff\xff\xff\xff\xff\xff", - "éééê", - "߿߿߿߿߿", - "ࠀࠁ", - "\xef\xbf\xbf\xef\xbf\xbf\xef\xbf\xbf", - "𐀀𐀁", - "\xf4\x8f\xbf\xbf\xf4\x90\x80\x80", - "\xf7\xbf\xbf\xbf\xf7\xbf\xbf\xbf\xf7\xbf\xbf\xbf"}; + std::array truncated_min{"yyyyyyyy", + "\x7f\x7f\x7f\x7f\x7f\x7f\x7f\x7f", + "\xf7\xbf\xbf\xbf", + "éééé", + "߿߿߿߿", + "ࠀࠀ", + "\xef\xbf\xbf\xef\xbf\xbf", + "𐀀𐀀", + "\xf4\x8f\xbf\xbf\xf4\x8f\xbf\xbf", + "\xf7\xbf\xbf\xbf"}; + + std::array truncated_max{"yyyyyyyz", + "\x7f\x7f\x7f\x7f\x7f\x7f\x7f\x80", + "\xff\xff\xff\xff\xff\xff\xff\xff\xff", + "éééê", + "߿߿߿߿߿", + "ࠀࠁ", + "\xef\xbf\xbf\xef\xbf\xbf\xef\xbf\xbf", + "𐀀𐀁", + "\xf4\x8f\xbf\xbf\xf4\x90\x80\x80", + "\xf7\xbf\xbf\xbf\xf7\xbf\xbf\xbf\xf7\xbf\xbf\xbf"}; auto cols = [&]() { using string_wrapper = column_wrapper; @@ -981,13 +982,15 @@ TEST_F(ParquetWriterTest, CheckColumnIndexTruncation) TEST_F(ParquetWriterTest, BinaryColumnIndexTruncation) { - std::vector truncated_min[] = {{0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe}, - {0xfe, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, - {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}}; + std::array, 3> truncated_min{ + {{0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe}, + {0xfe, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}}}; - std::vector truncated_max[] = {{0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xff}, - {0xff}, - {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}}; + std::array, 3> truncated_max{ + {{0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xff}, + {0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}}}; cudf::test::lists_column_wrapper col0{ {0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe}}; diff --git a/cpp/tests/io/row_selection_test.cpp b/cpp/tests/io/row_selection_test.cpp index ebadd870091..c40d3bbd299 100644 --- a/cpp/tests/io/row_selection_test.cpp +++ b/cpp/tests/io/row_selection_test.cpp @@ -15,7 +15,6 @@ */ #include -#include #include #include diff --git a/cpp/tests/io/text/data_chunk_source_test.cpp b/cpp/tests/io/text/data_chunk_source_test.cpp index 6f46df20633..79ce908f3e0 100644 --- a/cpp/tests/io/text/data_chunk_source_test.cpp +++ b/cpp/tests/io/text/data_chunk_source_test.cpp @@ -15,14 +15,11 @@ */ #include -#include #include #include #include -#include - #include #include diff --git a/cpp/tests/io/text/multibyte_split_test.cpp b/cpp/tests/io/text/multibyte_split_test.cpp index 408d54bd5ff..60244462e2c 100644 --- a/cpp/tests/io/text/multibyte_split_test.cpp +++ b/cpp/tests/io/text/multibyte_split_test.cpp @@ -19,16 +19,12 @@ #include #include #include -#include -#include #include -#include #include #include #include #include -#include #include using cudf::test::strings_column_wrapper; @@ -145,7 +141,7 @@ TEST_F(MultibyteSplitTest, LargeInput) for (auto i = 0; i < (2 * 32 * 128 * 1024); i++) { host_input += "...:|"; - host_expected.emplace_back(std::string("...:|")); + host_expected.emplace_back("...:|"); } auto expected = strings_column_wrapper{host_expected.begin(), host_expected.end()}; diff --git a/cpp/tests/iterator/value_iterator.cpp b/cpp/tests/iterator/value_iterator.cpp index 22bc7475dbe..f7f7c0f2721 100644 --- a/cpp/tests/iterator/value_iterator.cpp +++ 
b/cpp/tests/iterator/value_iterator.cpp @@ -13,7 +13,6 @@ * the License. */ -#include #include CUDF_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/jit/parse_ptx_function.cpp b/cpp/tests/jit/parse_ptx_function.cpp index 6f9dfd06730..c9bb691907a 100644 --- a/cpp/tests/jit/parse_ptx_function.cpp +++ b/cpp/tests/jit/parse_ptx_function.cpp @@ -16,7 +16,6 @@ #include "jit/parser.hpp" -#include #include #include diff --git a/cpp/tests/join/cross_join_tests.cpp b/cpp/tests/join/cross_join_tests.cpp index d87f5e54153..971913443e5 100644 --- a/cpp/tests/join/cross_join_tests.cpp +++ b/cpp/tests/join/cross_join_tests.cpp @@ -15,7 +15,6 @@ */ #include -#include #include #include #include diff --git a/cpp/tests/join/distinct_join_tests.cpp b/cpp/tests/join/distinct_join_tests.cpp index 93754091b3f..9070efa38fe 100644 --- a/cpp/tests/join/distinct_join_tests.cpp +++ b/cpp/tests/join/distinct_join_tests.cpp @@ -15,12 +15,8 @@ */ #include -#include #include -#include #include -#include -#include #include #include @@ -31,7 +27,6 @@ #include #include -#include #include template @@ -314,7 +309,7 @@ TEST_F(DistinctJoinTest, EmptyBuildTableLeftJoin) auto distinct_join = cudf::distinct_hash_join{build.view(), probe.view()}; auto result = distinct_join.left_join(); - auto gather_map = std::pair{std::move(result), std::move(get_left_indices(result->size()))}; + auto gather_map = std::pair{std::move(result), get_left_indices(result->size())}; this->compare_to_reference( build.view(), probe.view(), gather_map, probe.view(), cudf::out_of_bounds_policy::NULLIFY); @@ -362,7 +357,7 @@ TEST_F(DistinctJoinTest, EmptyProbeTableLeftJoin) auto distinct_join = cudf::distinct_hash_join{build.view(), probe.view()}; auto result = distinct_join.left_join(); - auto gather_map = std::pair{std::move(result), std::move(get_left_indices(result->size()))}; + auto gather_map = std::pair{std::move(result), get_left_indices(result->size())}; this->compare_to_reference( build.view(), probe.view(), gather_map, probe.view(), cudf::out_of_bounds_policy::NULLIFY); @@ -398,7 +393,7 @@ TEST_F(DistinctJoinTest, LeftJoinNoNulls) auto distinct_join = cudf::distinct_hash_join{build.view(), probe.view()}; auto result = distinct_join.left_join(); - auto gather_map = std::pair{std::move(result), std::move(get_left_indices(result->size()))}; + auto gather_map = std::pair{std::move(result), get_left_indices(result->size())}; this->compare_to_reference( build.view(), probe.view(), gather_map, gold.view(), cudf::out_of_bounds_policy::NULLIFY); @@ -423,7 +418,7 @@ TEST_F(DistinctJoinTest, LeftJoinWithNulls) auto distinct_join = cudf::distinct_hash_join{build.view(), probe.view()}; auto result = distinct_join.left_join(); - auto gather_map = std::pair{std::move(result), std::move(get_left_indices(result->size()))}; + auto gather_map = std::pair{std::move(result), get_left_indices(result->size())}; column_wrapper col_gold_0{{3, 1, 2, 0, 2}, {true, true, true, true, true}}; strcol_wrapper col_gold_1({"s1", "s1", "", "s4", "s0"}, {true, true, false, true, true}); @@ -468,7 +463,7 @@ TEST_F(DistinctJoinTest, LeftJoinWithStructsAndNulls) auto distinct_join = cudf::distinct_hash_join{build.view(), probe.view()}; auto result = distinct_join.left_join(); - auto gather_map = std::pair{std::move(result), std::move(get_left_indices(result->size()))}; + auto gather_map = std::pair{std::move(result), get_left_indices(result->size())}; auto col0_gold_names_col = strcol_wrapper{ "Samuel Vimes", "Detritus", "Carrot Ironfoundersson", "Samuel Vimes", "Angua von Überwald"}; diff 
--git a/cpp/tests/join/join_tests.cpp b/cpp/tests/join/join_tests.cpp index 3431e941359..6a8a54c8465 100644 --- a/cpp/tests/join/join_tests.cpp +++ b/cpp/tests/join/join_tests.cpp @@ -20,17 +20,12 @@ #include #include #include -#include #include -#include #include #include -#include -#include #include #include -#include #include #include #include diff --git a/cpp/tests/join/mixed_join_tests.cu b/cpp/tests/join/mixed_join_tests.cu index 6c147c8a128..9041969bec7 100644 --- a/cpp/tests/join/mixed_join_tests.cu +++ b/cpp/tests/join/mixed_join_tests.cu @@ -778,6 +778,138 @@ TYPED_TEST(MixedLeftSemiJoinTest, BasicEquality) {1}); } +TYPED_TEST(MixedLeftSemiJoinTest, MixedLeftSemiJoinGatherMap) +{ + auto const col_ref_left_1 = cudf::ast::column_reference(0, cudf::ast::table_reference::LEFT); + auto const col_ref_right_1 = cudf::ast::column_reference(0, cudf::ast::table_reference::RIGHT); + auto left_one_greater_right_one = + cudf::ast::operation(cudf::ast::ast_operator::GREATER, col_ref_left_1, col_ref_right_1); + + this->test({{2, 3, 9, 0, 1, 7, 4, 6, 5, 8}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 0}}, + {{6, 5, 9, 8, 10, 32}, {0, 1, 2, 3, 4, 5}, {7, 8, 9, 0, 1, 2}}, + {0}, + {1}, + left_one_greater_right_one, + {2, 7, 8}); +} + +TYPED_TEST(MixedLeftSemiJoinTest, MixedLeftSemiJoinGatherMapLarge) +{ + using T1 = double; + + // Number of rows in each column + auto constexpr N = 10000; + + // Generate column data for left and right tables + auto const [left_col0, right_col0] = gen_random_nullable_repeated_columns(N, 200); + auto const [left_col1, right_col1] = gen_random_nullable_repeated_columns(N, 100); + + // Set up data and nulls for the left table + std::vector, std::vector>> lefts = { + {left_col0.first, left_col0.second}, {left_col1.first, left_col1.second}}; + std::vector> left_wrappers; + std::vector left_columns; + for (auto [data, valids] : lefts) { + left_wrappers.emplace_back( + cudf::test::fixed_width_column_wrapper(data.begin(), data.end(), valids.begin())); + left_columns.emplace_back(left_wrappers.back()); + }; + + // Set up data and nulls for the right table + std::vector, std::vector>> rights = { + {right_col0.first, right_col0.second}, {right_col1.first, right_col1.second}}; + std::vector> right_wrappers; + std::vector right_columns; + for (auto [data, valids] : rights) { + right_wrappers.emplace_back( + cudf::test::fixed_width_column_wrapper(data.begin(), data.end(), valids.begin())); + right_columns.emplace_back(right_wrappers.back()); + }; + + // Left and right table views. + auto const left_table = cudf::table_view{left_columns}; + auto const right_table = cudf::table_view{right_columns}; + + // Using the zeroth column for equality. + auto const left_equality = left_table.select({0}); + auto const right_equality = right_table.select({0}); + + // Column references for equality column.
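+ // cudf::ast expressions are built bottom-up: column_reference(i, side) names
+ // column i of the LEFT or RIGHT table, and operation(op, lhs, rhs) combines
+ // nodes, so the three statements below build the predicate left[0] == right[0].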
+ auto const col_ref_left_0 = cudf::ast::column_reference(0, cudf::ast::table_reference::LEFT); + auto const col_ref_right_0 = cudf::ast::column_reference(0, cudf::ast::table_reference::RIGHT); + auto left_zero_eq_right_zero = + cudf::ast::operation(cudf::ast::ast_operator::EQUAL, col_ref_left_0, col_ref_right_0); + + // Mixed semi join with zeroth column equality + { + // Expected left_semi_join result + auto const expected_mixed_semi_join = + cudf::conditional_left_semi_join(left_table, right_table, left_zero_eq_right_zero); + + // Actual mixed_left_semi_join result + auto const mixed_semi_join = cudf::mixed_left_semi_join(left_equality, + right_equality, + left_table, + right_table, + left_zero_eq_right_zero, + cudf::null_equality::UNEQUAL); + + // Copy data back to host for comparisons + auto expected_indices = cudf::detail::make_std_vector_async( + cudf::device_span(*expected_mixed_semi_join), cudf::get_default_stream()); + auto result_indices = cudf::detail::make_std_vector_sync( + cudf::device_span(*mixed_semi_join), cudf::get_default_stream()); + + // Sort the indices for 1-1 comparison + std::sort(expected_indices.begin(), expected_indices.end()); + std::sort(result_indices.begin(), result_indices.end()); + + // Expected and actual vectors must match. + EXPECT_EQ(expected_mixed_semi_join->size(), mixed_semi_join->size()); + EXPECT_TRUE( + std::equal(expected_indices.begin(), expected_indices.end(), result_indices.begin())); + } + + // Mixed semi join with zeroth column equality and first column GREATER conditional + { + // Column references for conditional column. + auto const col_ref_left_1 = cudf::ast::column_reference(1, cudf::ast::table_reference::LEFT); + auto const col_ref_right_1 = cudf::ast::column_reference(1, cudf::ast::table_reference::RIGHT); + auto left_one_gt_right_one = + cudf::ast::operation(cudf::ast::ast_operator::GREATER, col_ref_left_1, col_ref_right_1); + + // Expected left_semi_join result + auto const expected_mixed_semi_join = cudf::conditional_left_semi_join( + left_table, + right_table, + cudf::ast::operation( + cudf::ast::ast_operator::LOGICAL_AND, left_zero_eq_right_zero, left_one_gt_right_one)); + + // Actual left_semi_join result + auto const mixed_semi_join = cudf::mixed_left_semi_join(left_equality, + right_equality, + left_table, + right_table, + left_one_gt_right_one, + cudf::null_equality::UNEQUAL); + + // Copy data back to host for comparisons + auto expected_indices = cudf::detail::make_std_vector_async( + cudf::device_span(*expected_mixed_semi_join), cudf::get_default_stream()); + auto result_indices = cudf::detail::make_std_vector_sync( + cudf::device_span(*mixed_semi_join), cudf::get_default_stream()); + + // Sort the indices for 1-1 comparison + std::sort(expected_indices.begin(), expected_indices.end()); + std::sort(result_indices.begin(), result_indices.end()); + + // Expected and actual vectors must match. 
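+ // Join gather maps carry no ordering guarantee, which is why both index vectors
+ // are sorted above before this element-wise comparison.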
+ EXPECT_EQ(expected_mixed_semi_join->size(), mixed_semi_join->size()); + EXPECT_TRUE( + std::equal(expected_indices.begin(), expected_indices.end(), result_indices.begin())); + } +} + TYPED_TEST(MixedLeftSemiJoinTest, BasicEqualityDuplicates) { this->test({{0, 1, 2, 1}, {3, 4, 5, 6}, {10, 20, 30, 40}}, @@ -900,3 +1032,18 @@ TYPED_TEST(MixedLeftAntiJoinTest, AsymmetricLeftLargerEquality) left_zero_eq_right_zero, {0, 1, 3}); } + +TYPED_TEST(MixedLeftAntiJoinTest, MixedLeftAntiJoinGatherMap) +{ + auto const col_ref_left_1 = cudf::ast::column_reference(0, cudf::ast::table_reference::LEFT); + auto const col_ref_right_1 = cudf::ast::column_reference(0, cudf::ast::table_reference::RIGHT); + auto left_one_greater_right_one = + cudf::ast::operation(cudf::ast::ast_operator::GREATER, col_ref_left_1, col_ref_right_1); + + this->test({{2, 3, 9, 0, 1, 7, 4, 6, 5, 8}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 0}}, + {{6, 5, 9, 8, 10, 32}, {0, 1, 2, 3, 4, 5}, {7, 8, 9, 0, 1, 2}}, + {0}, + {1}, + left_one_greater_right_one, + {0, 1, 3, 4, 5, 6, 9}); +} diff --git a/cpp/tests/join/semi_anti_join_tests.cpp b/cpp/tests/join/semi_anti_join_tests.cpp index 554d5754e39..ddc65c3f379 100644 --- a/cpp/tests/join/semi_anti_join_tests.cpp +++ b/cpp/tests/join/semi_anti_join_tests.cpp @@ -22,7 +22,6 @@ #include #include -#include #include #include #include diff --git a/cpp/tests/json/json_tests.cpp b/cpp/tests/json/json_tests.cpp index a9186874e83..53166e04173 100644 --- a/cpp/tests/json/json_tests.cpp +++ b/cpp/tests/json/json_tests.cpp @@ -19,7 +19,6 @@ #include #include -#include #include #include @@ -652,7 +651,7 @@ TEST_F(JsonPathTests, MixedOutput) // various queries on: // clang-format off std::vector input_strings { - "{\"a\": {\"b\" : \"c\"}}", + R"({"a": {"b" : "c"}})", "{" "\"a\": {\"b\" : \"c\"}," @@ -827,7 +826,7 @@ TEST_F(JsonPathTests, AllowSingleQuotes) // various queries on: std::vector input_strings{ // clang-format off - "{\'a\': {\'b\' : \'c\'}}", + R"({'a': {'b' : 'c'}})", "{" "\'a\': {\'b\' : \"c\"}," @@ -902,7 +901,7 @@ TEST_F(JsonPathTests, StringsWithSpecialChars) { std::vector input_strings{ // clang-format off - "{\"item\" : [{\"key\" : \"value[\"}]}", + R"({"item" : [{"key" : "value["}]})", // clang-format on }; @@ -927,7 +926,7 @@ TEST_F(JsonPathTests, StringsWithSpecialChars) { std::vector input_strings{ // clang-format off - "{\"a\" : \"[}{}][][{[\\\"}}[\\\"]\"}", + R"({"a" : "[}{}][][{[\"}}[\"]"})", // clang-format on }; @@ -958,8 +957,8 @@ TEST_F(JsonPathTests, EscapeSequences) std::vector input_strings{ // clang-format off - "{\"a\" : \"\\\" \\\\ \\/ \\b \\f \\n \\r \\t\"}", - "{\"a\" : \"\\u1248 \\uacdf \\uACDF \\u10EF\"}" + R"({"a" : "\" \\ \/ \b \f \n \r \t"})", + R"({"a" : "\u1248 \uacdf \uACDF \u10EF"})" // clang-format on }; diff --git a/cpp/tests/large_strings/json_tests.cu b/cpp/tests/large_strings/json_tests.cu index 80bde168b75..0703fa72f67 100644 --- a/cpp/tests/large_strings/json_tests.cu +++ b/cpp/tests/large_strings/json_tests.cu @@ -15,6 +15,7 @@ */ #include "../io/json/json_utils.cuh" +#include "io/comp/comp.hpp" #include "large_strings_fixture.hpp" #include @@ -25,10 +26,19 @@ #include #include -struct JsonLargeReaderTest : public cudf::test::StringsLargeTest {}; +struct JsonLargeReaderTest : public cudf::test::StringsLargeTest, + public testing::WithParamInterface {}; -TEST_F(JsonLargeReaderTest, MultiBatch) +// Parametrize qualifying JSON tests for multiple compression types +INSTANTIATE_TEST_SUITE_P(JsonLargeReaderTest, + JsonLargeReaderTest, + 
::testing::Values(cudf::io::compression_type::GZIP, + cudf::io::compression_type::NONE)); + +TEST_P(JsonLargeReaderTest, MultiBatch) { + cudf::io::compression_type const comptype = GetParam(); + std::string json_string = R"( { "a": { "y" : 6}, "b" : [1, 2, 3], "c": 11 } { "a": { "y" : 6}, "b" : [4, 5 ], "c": 12 } @@ -48,11 +58,26 @@ TEST_F(JsonLargeReaderTest, MultiBatch) json_string += json_string; } + std::vector cdata; + if (comptype != cudf::io::compression_type::NONE) { + cdata = cudf::io::detail::compress( + comptype, + cudf::host_span(reinterpret_cast(json_string.data()), + json_string.size()), + cudf::get_default_stream()); + } else + cdata = std::vector( + reinterpret_cast(json_string.data()), + reinterpret_cast(json_string.data()) + json_string.size()); + constexpr int num_sources = 2; std::vector> hostbufs( num_sources, cudf::host_span(reinterpret_cast(json_string.data()), json_string.size())); + std::vector> chostbufs( + num_sources, + cudf::host_span(reinterpret_cast(cdata.data()), cdata.size())); // Initialize parsing options (reading json lines) cudf::io::json_reader_options json_lines_options = @@ -62,14 +87,20 @@ TEST_F(JsonLargeReaderTest, MultiBatch) .lines(true) .compression(cudf::io::compression_type::NONE) .recovery_mode(cudf::io::json_recovery_mode_t::FAIL); + cudf::io::json_reader_options cjson_lines_options = + cudf::io::json_reader_options::builder( + cudf::io::source_info{ + cudf::host_span>(chostbufs.data(), chostbufs.size())}) + .lines(true) + .compression(comptype) + .recovery_mode(cudf::io::json_recovery_mode_t::FAIL); // Read full test data via existing, nested JSON lines reader - cudf::io::table_with_metadata current_reader_table = cudf::io::read_json(json_lines_options); + cudf::io::table_with_metadata current_reader_table = cudf::io::read_json(cjson_lines_options); + + auto datasources = cudf::io::datasource::create(json_lines_options.get_source().host_buffers()); + auto cdatasources = cudf::io::datasource::create(cjson_lines_options.get_source().host_buffers()); - std::vector> datasources; - for (auto& hb : hostbufs) { - datasources.emplace_back(cudf::io::datasource::create(hb)); - } // Test for different chunk sizes std::vector chunk_sizes{batch_size_upper_bound / 4, batch_size_upper_bound / 2, @@ -79,7 +110,9 @@ TEST_F(JsonLargeReaderTest, MultiBatch) for (auto chunk_size : chunk_sizes) { auto const tables = split_byte_range_reading(datasources, + cdatasources, json_lines_options, + cjson_lines_options, chunk_size, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); @@ -96,5 +129,5 @@ TEST_F(JsonLargeReaderTest, MultiBatch) } // go back to normal batch_size - unsetenv("LIBCUDF_LARGE_STRINGS_THRESHOLD"); + unsetenv("LIBCUDF_JSON_BATCH_SIZE"); } diff --git a/cpp/tests/large_strings/large_strings_fixture.cpp b/cpp/tests/large_strings/large_strings_fixture.cpp index 249319da7f7..f1404990354 100644 --- a/cpp/tests/large_strings/large_strings_fixture.cpp +++ b/cpp/tests/large_strings/large_strings_fixture.cpp @@ -16,12 +16,10 @@ #include "large_strings_fixture.hpp" -#include #include #include #include -#include #include #include @@ -123,12 +121,9 @@ LargeStringsData* StringsLargeTest::g_ls_data = nullptr; int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); - auto const cmd_opts = parse_cudf_test_opts(argc, argv); - // hardcoding the CUDA memory resource to keep from exceeding the pool - auto mr = cudf::test::make_cuda(); - cudf::set_current_device_resource(mr.get()); - auto adaptor = make_stream_mode_adaptor(cmd_opts); 
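The MultiBatch changes above feed the JSON reader GZIP-compressed host buffers. A condensed sketch of that round trip, reusing the internal compress helper the test includes from io/comp/comp.hpp (buffer names are illustrative, and the helper's std::vector<uint8_t> return type is assumed from the test's usage):

  std::string jsonl = "{\"a\": 1}\n{\"a\": 2}\n";
  auto gz = cudf::io::detail::compress(
    cudf::io::compression_type::GZIP,
    cudf::host_span<uint8_t const>(reinterpret_cast<uint8_t const*>(jsonl.data()), jsonl.size()),
    cudf::get_default_stream());
  std::vector<cudf::host_span<std::byte>> bufs{
    cudf::host_span<std::byte>(reinterpret_cast<std::byte*>(gz.data()), gz.size())};
  auto opts = cudf::io::json_reader_options::builder(
                cudf::io::source_info{
                  cudf::host_span<cudf::host_span<std::byte>>(bufs.data(), bufs.size())})
                .lines(true)
                .compression(cudf::io::compression_type::GZIP)
                .build();
  auto result = cudf::io::read_json(opts);  // decompresses, then parses JSON lines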
-
+  cudf::test::config config;
+  config.rmm_mode = "cuda";
+  init_cudf_test(argc, argv, config);
   // create object to automatically be destroyed at the end of main()
   auto lsd = cudf::test::StringsLargeTest::get_ls_data();
diff --git a/cpp/tests/large_strings/parquet_tests.cpp b/cpp/tests/large_strings/parquet_tests.cpp
index 007c08ce0fb..39cd783de00 100644
--- a/cpp/tests/large_strings/parquet_tests.cpp
+++ b/cpp/tests/large_strings/parquet_tests.cpp
@@ -16,10 +16,9 @@
 #include "large_strings_fixture.hpp"
 
-#include
-#include
 #include
+#include
 #include
 #include
 #include
@@ -71,3 +70,76 @@ TEST_F(ParquetStringsTest, ReadLargeStrings)
   // go back to normal threshold
   unsetenv("LIBCUDF_LARGE_STRINGS_THRESHOLD");
 }
+
+// Disabled as the test is too brittle: it depends on an empirically set `pass_read_limit`,
+// the encoding type, and the currently used `ZSTD` scratch space size.
+TEST_F(ParquetStringsTest, DISABLED_ChunkedReadLargeStrings)
+{
+  // Construct a table with one large strings column > 2GB
+  auto const wide = this->wide_column();
+  auto input      = cudf::concatenate(std::vector<cudf::column_view>(120000, wide));  ///< 230MB
+
+  int constexpr multiplier = 12;
+  std::vector<cudf::column_view> input_cols(multiplier, input->view());
+  auto col0 = cudf::concatenate(input_cols);  ///< 2.70GB
+
+  // Expected table
+  auto const expected    = cudf::table_view{{col0->view()}};
+  auto expected_metadata = cudf::io::table_input_metadata{expected};
+
+  // Needed to get exactly 2 Parquet subpasses: the first with large strings and the second with
+  // regular ones. This may change in the future and lead to false failures.
+  expected_metadata.column_metadata[0].set_encoding(
+    cudf::io::column_encoding::DELTA_LENGTH_BYTE_ARRAY);
+
+  // Host buffer to write Parquet
+  std::vector<char> buffer;
+
+  // Writer options
+  cudf::io::parquet_writer_options out_opts =
+    cudf::io::parquet_writer_options::builder(cudf::io::sink_info{&buffer}, expected)
+      .metadata(expected_metadata);
+
+  // Needed to get exactly 2 Parquet subpasses: the first with large strings and the second with
+  // regular ones. This may change in the future and lead to false failures.
+  out_opts.set_compression(cudf::io::compression_type::ZSTD);
+
+  // Write to Parquet
+  cudf::io::write_parquet(out_opts);
+
+  // Empirically set pass_read_limit of 8GB so we read almost the entire table (>2GB strings) in
+  // the first subpass and only a small amount in the second subpass. This may change in the
+  // future and lead to false failures.
+ size_t constexpr pass_read_limit = size_t{8} * 1024 * 1024 * 1024; + + // Reader options + cudf::io::parquet_reader_options default_in_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info(buffer.data(), buffer.size())); + + // Chunked parquet reader + auto reader = cudf::io::chunked_parquet_reader(0, pass_read_limit, default_in_opts); + + // Read chunked + auto tables = std::vector>{}; + while (reader.has_next()) { + tables.emplace_back(reader.read_chunk().tbl); + } + auto table_views = std::vector{}; + std::transform(tables.begin(), tables.end(), std::back_inserter(table_views), [](auto& tbl) { + return tbl->view(); + }); + auto result = cudf::concatenate(table_views); + auto const result_view = result->view(); + + // Verify offsets + for (auto const& cv : result_view) { + auto const offsets = cudf::strings_column_view(cv).offsets(); + EXPECT_EQ(offsets.type(), cudf::data_type{cudf::type_id::INT64}); + } + + // Verify tables to be equal + CUDF_TEST_EXPECT_TABLES_EQUAL(result_view, expected); + + // Verify that we read exactly two table chunks + EXPECT_EQ(tables.size(), 2); +} diff --git a/cpp/tests/lists/contains_tests.cpp b/cpp/tests/lists/contains_tests.cpp index 8fb2b403051..7ae7a6a7414 100644 --- a/cpp/tests/lists/contains_tests.cpp +++ b/cpp/tests/lists/contains_tests.cpp @@ -22,7 +22,6 @@ #include #include -#include #include #include diff --git a/cpp/tests/lists/extract_tests.cpp b/cpp/tests/lists/extract_tests.cpp index 92dd5df5ec7..2c24f695c29 100644 --- a/cpp/tests/lists/extract_tests.cpp +++ b/cpp/tests/lists/extract_tests.cpp @@ -21,12 +21,8 @@ #include #include -#include -#include #include -#include - #include #include #include diff --git a/cpp/tests/lists/sequences_tests.cpp b/cpp/tests/lists/sequences_tests.cpp index 74545903eb3..dcb906cd2ef 100644 --- a/cpp/tests/lists/sequences_tests.cpp +++ b/cpp/tests/lists/sequences_tests.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include diff --git a/cpp/tests/lists/stream_compaction/apply_boolean_mask_tests.cpp b/cpp/tests/lists/stream_compaction/apply_boolean_mask_tests.cpp index 5625b47e7ea..18aa118bb81 100644 --- a/cpp/tests/lists/stream_compaction/apply_boolean_mask_tests.cpp +++ b/cpp/tests/lists/stream_compaction/apply_boolean_mask_tests.cpp @@ -20,8 +20,6 @@ #include #include -#include -#include #include namespace cudf::test { diff --git a/cpp/tests/merge/merge_dictionary_test.cpp b/cpp/tests/merge/merge_dictionary_test.cpp index dd528c19e4e..1d7a31fd797 100644 --- a/cpp/tests/merge/merge_dictionary_test.cpp +++ b/cpp/tests/merge/merge_dictionary_test.cpp @@ -17,9 +17,7 @@ #include #include #include -#include -#include #include #include #include diff --git a/cpp/tests/merge/merge_string_test.cpp b/cpp/tests/merge/merge_string_test.cpp index 97979e79010..d9fdb6099f0 100644 --- a/cpp/tests/merge/merge_string_test.cpp +++ b/cpp/tests/merge/merge_string_test.cpp @@ -17,10 +17,8 @@ #include #include #include -#include #include -#include #include #include #include @@ -30,10 +28,6 @@ #include -#include -#include -#include -#include #include #include @@ -97,7 +91,7 @@ TYPED_TEST(MergeStringTest, Merge1StringKeyColumns) "hi", "hj"}); - auto seq_out2 = cudf::detail::make_counting_transform_iterator(0, [outputRows](auto row) { + auto seq_out2 = cudf::detail::make_counting_transform_iterator(0, [](auto row) { if (cudf::type_to_id() == cudf::type_id::BOOL8) return 0; else @@ -296,7 +290,7 @@ TYPED_TEST(MergeStringTest, Merge1StringKeyNullColumns) true, false, false}); - auto seq_out2 = 
cudf::detail::make_counting_transform_iterator(0, [outputRows](auto row) { + auto seq_out2 = cudf::detail::make_counting_transform_iterator(0, [](auto row) { if (cudf::type_to_id() == cudf::type_id::BOOL8) return 0; else diff --git a/cpp/tests/merge/merge_test.cpp b/cpp/tests/merge/merge_test.cpp index 2e09f25b51f..fad390105d7 100644 --- a/cpp/tests/merge/merge_test.cpp +++ b/cpp/tests/merge/merge_test.cpp @@ -21,7 +21,6 @@ #include #include #include -#include #include #include @@ -34,7 +33,6 @@ #include #include -#include #include @@ -349,7 +347,7 @@ TYPED_TEST(MergeTest_, Merge1KeyColumns) cudf::test::fixed_width_column_wrapper expectedDataWrap1(seq_out1, seq_out1 + outputRows); - auto seq_out2 = cudf::detail::make_counting_transform_iterator(0, [outputRows](auto row) { + auto seq_out2 = cudf::detail::make_counting_transform_iterator(0, [](auto row) { if (cudf::type_to_id() == cudf::type_id::BOOL8) return 0; else @@ -452,7 +450,7 @@ TYPED_TEST(MergeTest_, Merge1KeyNullColumns) cudf::size_type inputRows = 40; // data: 0 2 4 6 | valid: 1 1 1 0 - auto sequence1 = cudf::detail::make_counting_transform_iterator(0, [inputRows](auto row) { + auto sequence1 = cudf::detail::make_counting_transform_iterator(0, [](auto row) { if (cudf::type_to_id() == cudf::type_id::BOOL8) { return 0; // <- no shortcut to this can avoid compiler errors } else { @@ -465,7 +463,7 @@ TYPED_TEST(MergeTest_, Merge1KeyNullColumns) leftColWrap1(sequence1, sequence1 + inputRows, valid_sequence1); // data: 1 3 5 7 | valid: 1 1 1 0 - auto sequence2 = cudf::detail::make_counting_transform_iterator(0, [inputRows](auto row) { + auto sequence2 = cudf::detail::make_counting_transform_iterator(0, [](auto row) { if (cudf::type_to_id() == cudf::type_id::BOOL8) { return 1; } else diff --git a/cpp/tests/partitioning/round_robin_test.cpp b/cpp/tests/partitioning/round_robin_test.cpp index 89d23c39dca..3693cfbcc72 100644 --- a/cpp/tests/partitioning/round_robin_test.cpp +++ b/cpp/tests/partitioning/round_robin_test.cpp @@ -17,10 +17,8 @@ #include #include #include -#include #include -#include #include #include #include @@ -30,12 +28,7 @@ #include -#include -#include -#include -#include #include -#include #include using cudf::test::fixed_width_column_wrapper; diff --git a/cpp/tests/quantiles/percentile_approx_test.cpp b/cpp/tests/quantiles/percentile_approx_test.cpp index 915717713df..37414eb3fba 100644 --- a/cpp/tests/quantiles/percentile_approx_test.cpp +++ b/cpp/tests/quantiles/percentile_approx_test.cpp @@ -371,8 +371,8 @@ struct PercentileApproxTest : public cudf::test::BaseFixture {}; TEST_F(PercentileApproxTest, EmptyInput) { - auto empty_ = cudf::tdigest::detail::make_empty_tdigest_column( - cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + auto empty_ = cudf::tdigest::detail::make_empty_tdigests_column( + 1, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); cudf::test::fixed_width_column_wrapper percentiles{0.0, 0.25, 0.3}; std::vector input; diff --git a/cpp/tests/quantiles/quantile_test.cpp b/cpp/tests/quantiles/quantile_test.cpp index 6e88365b6e8..23b58618fe1 100644 --- a/cpp/tests/quantiles/quantile_test.cpp +++ b/cpp/tests/quantiles/quantile_test.cpp @@ -22,7 +22,6 @@ #include #include -#include #include #include diff --git a/cpp/tests/quantiles/quantiles_test.cpp b/cpp/tests/quantiles/quantiles_test.cpp index 44d4ec61852..c7e11af8c85 100644 --- a/cpp/tests/quantiles/quantiles_test.cpp +++ b/cpp/tests/quantiles/quantiles_test.cpp @@ -16,7 +16,6 @@ #include #include -#include 
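The rename from make_empty_tdigest_column to make_empty_tdigests_column above comes with an explicit count of empty digests to create. As used by the updated test (the size comment is my reading of the new signature):

  // One tdigest row whose centroid lists are empty; concatenating several of
  // these still yields digests with no centroids.
  auto empty = cudf::tdigest::detail::make_empty_tdigests_column(
    1, cudf::get_default_stream(), cudf::get_current_device_resource_ref());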
#include #include diff --git a/cpp/tests/reductions/ewm_tests.cpp b/cpp/tests/reductions/ewm_tests.cpp index 09cec688509..1117b0d1acf 100644 --- a/cpp/tests/reductions/ewm_tests.cpp +++ b/cpp/tests/reductions/ewm_tests.cpp @@ -18,9 +18,7 @@ #include #include -#include -#include #include template diff --git a/cpp/tests/reductions/list_rank_test.cpp b/cpp/tests/reductions/list_rank_test.cpp index f5470f7d881..cb412f1e925 100644 --- a/cpp/tests/reductions/list_rank_test.cpp +++ b/cpp/tests/reductions/list_rank_test.cpp @@ -14,14 +14,9 @@ * limitations under the License. */ -#include - #include #include -#include -#include -#include #include struct ListRankScanTest : public cudf::test::BaseFixture { @@ -136,7 +131,7 @@ TEST_F(ListRankScanTest, ListOfStruct) false, false}}; auto col2 = cudf::test::strings_column_wrapper{ - {"x", "x", "a", "a", "b", "b", "a", "b", "a", "b", "a", "c", "a", "c", "a", "c", "b", "b"}, + {"x", "x", "a", "a", "b", "", "a", "b", "a", "b", "a", "c", "a", "c", "", "", "b", "b"}, {true, true, true, diff --git a/cpp/tests/reductions/rank_tests.cpp b/cpp/tests/reductions/rank_tests.cpp index 3ab1fc01eaa..130458548fc 100644 --- a/cpp/tests/reductions/rank_tests.cpp +++ b/cpp/tests/reductions/rank_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,7 +21,6 @@ #include #include -#include #include #include @@ -126,7 +125,7 @@ auto make_input_column() { if constexpr (std::is_same_v) { return cudf::test::strings_column_wrapper{ - {"0", "0", "4", "4", "4", "5", "7", "7", "7", "9", "9", "9"}, + {"0", "0", "4", "4", "4", "", "7", "7", "7", "9", "9", "9"}, cudf::test::iterators::null_at(5)}; } else { using fw_wrapper = cudf::test::fixed_width_column_wrapper; diff --git a/cpp/tests/reductions/reduction_tests.cpp b/cpp/tests/reductions/reduction_tests.cpp index 949ffcc26a6..67083f19b3a 100644 --- a/cpp/tests/reductions/reduction_tests.cpp +++ b/cpp/tests/reductions/reduction_tests.cpp @@ -22,9 +22,7 @@ #include #include -#include #include -#include #include #include #include @@ -35,7 +33,8 @@ #include -#include +#include +#include #include using aggregation = cudf::aggregation; @@ -766,6 +765,25 @@ TYPED_TEST(MultiStepReductionTest, Mean) expected_value_nulls); } +template +double calc_var(std::vector const& v, int ddof, std::vector const& mask = {}) +{ + auto const values = [&]() { + if (mask.empty()) { return v; } + std::vector masked{}; + thrust::copy_if( + v.begin(), v.end(), mask.begin(), std::back_inserter(masked), [](auto m) { return m; }); + return masked; + }(); + auto const valid_count = values.size(); + double const mean = std::accumulate(values.cbegin(), values.cend(), double{0}) / valid_count; + double const sq_sum_of_differences = + std::accumulate(values.cbegin(), values.cend(), double{0}, [mean](double acc, auto const v) { + return acc + std::pow(v - mean, 2); + }); + return sq_sum_of_differences / (valid_count - ddof); +} + // This test is disabled for only a Debug build because a compiler error // documented in cpp/src/reductions/std.cu and cpp/src/reductions/var.cu #ifdef NDEBUG @@ -778,25 +796,12 @@ TYPED_TEST(MultiStepReductionTest, DISABLED_var_std) std::vector int_values({-3, 2, 1, 0, 5, -3, -2, 28}); std::vector host_bools({true, true, false, true, true, true, false, true}); - auto calc_var = [](std::vector& v, cudf::size_type valid_count, int ddof) 
{ - double mean = std::accumulate(v.begin(), v.end(), double{0}); - mean /= valid_count; - - double sum_of_sq = std::accumulate( - v.begin(), v.end(), double{0}, [](double acc, TypeParam i) { return acc + i * i; }); - - cudf::size_type div = valid_count - ddof; - - double var = sum_of_sq / div - ((mean * mean) * valid_count) / div; - return var; - }; - // test without nulls std::vector v = convert_values(int_values); cudf::test::fixed_width_column_wrapper col(v.begin(), v.end()); auto const ddof = 1; - double var = calc_var(v, v.size(), ddof); + double var = calc_var(v, ddof); double std = std::sqrt(var); auto var_agg = cudf::make_variance_aggregation(ddof); auto std_agg = cudf::make_std_aggregation(ddof); @@ -812,23 +817,19 @@ TYPED_TEST(MultiStepReductionTest, DISABLED_var_std) // test with nulls cudf::test::fixed_width_column_wrapper col_nulls = construct_null_column(v, host_bools); - cudf::size_type valid_count = - cudf::column_view(col_nulls).size() - cudf::column_view(col_nulls).null_count(); - auto replaced_array = replace_nulls(v, host_bools, T{0}); - - double var_nulls = calc_var(replaced_array, valid_count, ddof); - double std_nulls = std::sqrt(var_nulls); + double var_nulls = calc_var(v, ddof, host_bools); + double std_nulls = std::sqrt(var_nulls); - EXPECT_EQ(this - ->template reduction_test( - col_nulls, *var_agg, cudf::data_type(cudf::type_id::FLOAT64)) - .first, - var_nulls); - EXPECT_EQ(this - ->template reduction_test( - col_nulls, *std_agg, cudf::data_type(cudf::type_id::FLOAT64)) - .first, - std_nulls); + EXPECT_DOUBLE_EQ(this + ->template reduction_test( + col_nulls, *var_agg, cudf::data_type(cudf::type_id::FLOAT64)) + .first, + var_nulls); + EXPECT_DOUBLE_EQ(this + ->template reduction_test( + col_nulls, *std_agg, cudf::data_type(cudf::type_id::FLOAT64)) + .first, + std_nulls); } // ---------------------------------------------------------------------------- @@ -1140,23 +1141,10 @@ TEST_P(ReductionParamTest, DISABLED_std_var) std::vector int_values({-3, 2, 1, 0, 5, -3, -2, 28}); std::vector host_bools({true, true, false, true, true, true, false, true}); - auto calc_var = [ddof](std::vector& v, cudf::size_type valid_count) { - double mean = std::accumulate(v.begin(), v.end(), double{0}); - mean /= valid_count; - - double sum_of_sq = std::accumulate( - v.begin(), v.end(), double{0}, [](double acc, double i) { return acc + i * i; }); - - cudf::size_type div = valid_count - ddof; - - double var = sum_of_sq / div - ((mean * mean) * valid_count) / div; - return var; - }; - // test without nulls cudf::test::fixed_width_column_wrapper col(int_values.begin(), int_values.end()); - double var = calc_var(int_values, int_values.size()); + double var = calc_var(int_values, ddof); double std = std::sqrt(var); auto var_agg = cudf::make_variance_aggregation(ddof); auto std_agg = cudf::make_std_aggregation(ddof); @@ -1173,23 +1161,19 @@ TEST_P(ReductionParamTest, DISABLED_std_var) // test with nulls cudf::test::fixed_width_column_wrapper col_nulls = construct_null_column(int_values, host_bools); - cudf::size_type valid_count = - cudf::column_view(col_nulls).size() - cudf::column_view(col_nulls).null_count(); - auto replaced_array = replace_nulls(int_values, host_bools, int{0}); - - double var_nulls = calc_var(replaced_array, valid_count); + double var_nulls = calc_var(int_values, ddof, host_bools); double std_nulls = std::sqrt(var_nulls); - EXPECT_EQ(this - ->template reduction_test( - col_nulls, *var_agg, cudf::data_type(cudf::type_id::FLOAT64)) - .first, - var_nulls); - EXPECT_EQ(this 
- ->template reduction_test( - col_nulls, *std_agg, cudf::data_type(cudf::type_id::FLOAT64)) - .first, - std_nulls); + EXPECT_DOUBLE_EQ(this + ->template reduction_test( + col_nulls, *var_agg, cudf::data_type(cudf::type_id::FLOAT64)) + .first, + var_nulls); + EXPECT_DOUBLE_EQ(this + ->template reduction_test( + col_nulls, *std_agg, cudf::data_type(cudf::type_id::FLOAT64)) + .first, + std_nulls); } //------------------------------------------------------------------- @@ -1254,7 +1238,7 @@ struct StringReductionTest : public cudf::test::BaseFixture, }; // ------------------------------------------------------------------------ -std::vector string_list[] = { +std::vector> string_list{{ {"one", "two", "three", "four", "five", "six", "seven", "eight", "nine"}, {"", "two", "three", "four", "five", "six", "seven", "eight", "nine"}, {"one", "", "three", "four", "five", "six", "seven", "eight", "nine"}, @@ -1264,13 +1248,19 @@ std::vector string_list[] = { {"\xF7\xBF\xBF\xBF", "", "", "", "", "", "", "", ""}, {"one", "two", "three", "four", "\xF7\xBF\xBF\xBF", "six", "seven", "eight", "nine"}, {"one", "two", "\xF7\xBF\xBF\xBF", "four", "five", "six", "seven", "eight", "nine"}, -}; +}}; INSTANTIATE_TEST_CASE_P(string_cases, StringReductionTest, testing::ValuesIn(string_list)); TEST_P(StringReductionTest, MinMax) { // data and valid arrays std::vector host_strings(GetParam()); std::vector host_bools({true, false, true, true, true, true, false, false, true}); + std::transform(thrust::counting_iterator(0), + thrust::counting_iterator(host_strings.size()), + host_strings.begin(), + [host_strings, host_bools](auto idx) { + return host_bools[idx] ? host_strings[idx] : std::string{}; + }); bool succeed(true); std::string initial_value = "init"; @@ -1397,7 +1387,7 @@ TEST_F(StringReductionTest, AllNull) std::vector host_strings( {"one", "two", "three", "four", "five", "six", "seven", "eight", "nine"}); std::vector host_bools(host_strings.size(), false); - auto initial_value = cudf::make_string_scalar("init"); + auto initial_value = cudf::make_string_scalar(""); initial_value->set_valid_async(false); // string column with nulls @@ -2235,7 +2225,7 @@ TYPED_TEST(ReductionTest, NthElement) struct DictionaryStringReductionTest : public StringReductionTest {}; -std::vector data_list[] = { +std::vector> data_list = { {"nine", "two", "five", "three", "five", "six", "two", "eight", "nine"}, }; INSTANTIATE_TEST_CASE_P(dictionary_cases, @@ -2472,21 +2462,11 @@ TYPED_TEST(DictionaryReductionTest, DISABLED_VarStd) std::vector v = convert_values(int_values); cudf::data_type output_type{cudf::type_to_id()}; - auto calc_var = [](std::vector const& v, cudf::size_type valid_count, cudf::size_type ddof) { - double mean = std::accumulate(v.cbegin(), v.cend(), double{0}); - mean /= valid_count; - double sum_of_sq = std::accumulate( - v.cbegin(), v.cend(), double{0}, [](double acc, TypeParam i) { return acc + i * i; }); - auto const div = valid_count - ddof; - double var = sum_of_sq / div - ((mean * mean) * valid_count) / div; - return var; - }; - // test without nulls cudf::test::dictionary_column_wrapper col(v.begin(), v.end()); cudf::size_type const ddof = 1; - double var = calc_var(v, v.size(), ddof); + double var = calc_var(v, ddof); double std = std::sqrt(var); auto var_agg = cudf::make_variance_aggregation(ddof); auto std_agg = cudf::make_std_aggregation(ddof); @@ -2498,15 +2478,13 @@ TYPED_TEST(DictionaryReductionTest, DISABLED_VarStd) std::vector validity({true, true, false, true, true, true, false, true}); 
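For reference, a self-contained restatement of the shared calc_var helper these tests now call, with the template parameter and container element types spelled out as inferred from its call sites, and the masked copy written as a plain loop. It computes variance in two passes (mean, then squared differences), which matches the device result only up to rounding; hence the switch from EXPECT_EQ to EXPECT_DOUBLE_EQ (4-ULP tolerance) in the assertions:

  #include <numeric>
  #include <vector>

  template <typename T>
  double calc_var(std::vector<T> const& v, int ddof, std::vector<bool> const& mask = {})
  {
    // Keep only entries whose mask bit is set (an empty mask means all valid).
    std::vector<T> values;
    for (std::size_t i = 0; i < v.size(); ++i) {
      if (mask.empty() || mask[i]) { values.push_back(v[i]); }
    }
    auto const n      = static_cast<double>(values.size());
    double const mean = std::accumulate(values.cbegin(), values.cend(), double{0}) / n;
    double sq_diffs   = 0.0;
    for (auto const& x : values) {
      sq_diffs += (static_cast<double>(x) - mean) * (static_cast<double>(x) - mean);
    }
    return sq_diffs / (n - ddof);
  }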
cudf::test::dictionary_column_wrapper col_nulls(v.begin(), v.end(), validity.begin()); - cudf::size_type const valid_count = std::count(validity.begin(), validity.end(), true); - - double var_nulls = calc_var(replace_nulls(v, validity, T{0}), valid_count, ddof); + double var_nulls = calc_var(v, ddof, validity); double std_nulls = std::sqrt(var_nulls); - EXPECT_EQ(this->template reduction_test(col_nulls, *var_agg, output_type).first, - var_nulls); - EXPECT_EQ(this->template reduction_test(col_nulls, *std_agg, output_type).first, - std_nulls); + EXPECT_DOUBLE_EQ(this->template reduction_test(col_nulls, *var_agg, output_type).first, + var_nulls); + EXPECT_DOUBLE_EQ(this->template reduction_test(col_nulls, *std_agg, output_type).first, + std_nulls); } TYPED_TEST(DictionaryReductionTest, NthElement) @@ -3110,21 +3088,28 @@ TEST_F(StructReductionTest, StructReductionMinMaxWithNulls) using cudf::test::iterators::null_at; using cudf::test::iterators::nulls_at; - // `null` means null at child column. - // `NULL` means null at parent column. auto const input = [] { auto child1 = STRINGS_CW{{"año", "bit", - "₹1" /*null*/, - "aaa" /*NULL*/, + "", // child null + "aaa", // parent null "zit", "bat", "aab", - "$1" /*null*/, - "€1" /*NULL*/, + "", // child null + "€1", // parent null "wut"}, nulls_at({2, 7})}; - auto child2 = INTS_CW{{1, 2, 3 /*null*/, 4 /*NULL*/, 5, 6, 7, 8 /*null*/, 9 /*NULL*/, 10}, + auto child2 = INTS_CW{{1, + 2, + 0, // child null + 4, // parent null + 5, + 6, + 7, + 0, // child null + 9, // parent NULL + 10}, nulls_at({2, 7})}; return STRUCTS_CW{{child1, child2}, nulls_at({3, 8})}; }(); diff --git a/cpp/tests/reductions/scan_tests.cpp b/cpp/tests/reductions/scan_tests.cpp index 76dbbaef491..5f911597b02 100644 --- a/cpp/tests/reductions/scan_tests.cpp +++ b/cpp/tests/reductions/scan_tests.cpp @@ -20,13 +20,11 @@ #include #include -#include #include #include #include #include -#include #include #include @@ -414,12 +412,13 @@ TEST_F(ScanStringsTest, MoreStringsMinMax) { int row_count = 512; - auto data_begin = cudf::detail::make_counting_transform_iterator(0, [](auto idx) { - char const s[] = {static_cast('a' + (idx % 26)), 0}; - return std::string(s); - }); - auto validity = cudf::detail::make_counting_transform_iterator( + auto validity = cudf::detail::make_counting_transform_iterator( 0, [](auto idx) -> bool { return (idx % 23) != 22; }); + auto data_begin = cudf::detail::make_counting_transform_iterator(0, [validity](auto idx) { + if (validity[idx] == 0) return std::string{}; + char const s = static_cast('a' + (idx % 26)); + return std::string{1, s}; + }); cudf::test::strings_column_wrapper col(data_begin, data_begin + row_count, validity); thrust::host_vector v(data_begin, data_begin + row_count); @@ -622,21 +621,28 @@ TEST_F(StructScanTest, StructScanMinMaxWithNulls) using cudf::test::iterators::null_at; using cudf::test::iterators::nulls_at; - // `null` means null at child column. - // `NULL` means null at parent column. 
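The inline annotations replacing the removed comment convention distinguish two kinds of null: a null child value inside a valid struct row, and a null struct row whose child payloads are don't-cares. A small sketch of how each arises with the test wrappers (values are illustrative):

  // Row 1: valid struct with a null child. Row 2: null struct; the child
  // payload stored there is ignored, which is why the tests now keep 0 / ""
  // in such slots instead of meaningful-looking values.
  auto child   = cudf::test::fixed_width_column_wrapper<int32_t>{{1, 0, 3}, {true, false, true}};
  auto structs = cudf::test::structs_column_wrapper{{child}, {true, true, false}};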
auto const input = [] { auto child1 = STRINGS_CW{{"año", "bit", - "₹1" /*null*/, - "aaa" /*NULL*/, + "", // child null + "aaa", // parent null "zit", "bat", "aab", - "$1" /*null*/, - "€1" /*NULL*/, + "", // child null + "€1", // parent null "wut"}, nulls_at({2, 7})}; - auto child2 = INTS_CW{{1, 2, 3 /*null*/, 4 /*NULL*/, 5, 6, 7, 8 /*null*/, 9 /*NULL*/, 10}, + auto child2 = INTS_CW{{1, + 2, + 0, // child null + 4, // parent null + 5, + 6, + 7, + 0, // child null + 9, // parent null + 10}, nulls_at({2, 7})}; return STRUCTS_CW{{child1, child2}, nulls_at({3, 8})}; }(); @@ -694,25 +700,25 @@ TEST_F(StructScanTest, StructScanMinMaxWithNulls) auto const expected = [] { auto child1 = STRINGS_CW{{"año", "año", - "" /*null*/, - "" /*NULL*/, - "" /*NULL*/, - "" /*NULL*/, - "" /*NULL*/, - "" /*NULL*/, - "" /*NULL*/, - "" /*NULL*/}, + "", // child null + "", // parent null + "", // parent null + "", // parent null + "", // parent null + "", // parent null + "", // parent null + ""}, // parent null null_at(2)}; auto child2 = INTS_CW{{1, 1, - 0 /*null*/, - 0 /*NULL*/, - 0 /*NULL*/, - 0 /*NULL*/, - 0 /*NULL*/, - 0 /*NULL*/, - 0 /*NULL*/, - 0 /*NULL*/}, + 0, // child null + 0, // parent null + 0, // parent null + 0, // parent null + 0, // parent null + 0, // parent null + 0, // parent null + 0}, // parent null null_at(2)}; return STRUCTS_CW{{child1, child2}, nulls_at({3, 4, 5, 6, 7, 8, 9})}; }(); diff --git a/cpp/tests/reductions/scan_tests.hpp b/cpp/tests/reductions/scan_tests.hpp index 858697d8ef5..c2cce4bbbfa 100644 --- a/cpp/tests/reductions/scan_tests.hpp +++ b/cpp/tests/reductions/scan_tests.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
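Related: the MoreStringsMinMax rewrite earlier in scan_tests.cpp makes the data generator consult the validity iterator, so null positions carry empty payloads and host-side reference results cannot depend on unspecified null slots. The pattern in standalone form (row count from the test; note std::string(1, c), the count/char constructor, rather than brace-init, which would build a two-character string {1, c}):

  auto validity = cudf::detail::make_counting_transform_iterator(
    0, [](auto idx) -> bool { return (idx % 23) != 22; });
  auto data_begin = cudf::detail::make_counting_transform_iterator(0, [validity](auto idx) {
    return validity[idx] ? std::string(1, static_cast<char>('a' + (idx % 26))) : std::string{};
  });
  cudf::test::strings_column_wrapper col(data_begin, data_begin + 512, validity);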
@@ -20,9 +20,7 @@ #include #include -#include #include -#include #include #include @@ -30,7 +28,6 @@ #include #include -#include template struct TypeParam_to_host_type { diff --git a/cpp/tests/reductions/segmented_reduction_tests.cpp b/cpp/tests/reductions/segmented_reduction_tests.cpp index 19996f827cf..bc0321bd40a 100644 --- a/cpp/tests/reductions/segmented_reduction_tests.cpp +++ b/cpp/tests/reductions/segmented_reduction_tests.cpp @@ -1092,11 +1092,10 @@ TEST_F(SegmentedReductionTestUntyped, EmptyInputWithOffsets) auto aggregates = std::vector>>(); - aggregates.push_back(std::move(cudf::make_max_aggregation())); - aggregates.push_back(std::move(cudf::make_min_aggregation())); - aggregates.push_back(std::move(cudf::make_sum_aggregation())); - aggregates.push_back( - std::move(cudf::make_product_aggregation())); + aggregates.push_back(cudf::make_max_aggregation()); + aggregates.push_back(cudf::make_min_aggregation()); + aggregates.push_back(cudf::make_sum_aggregation()); + aggregates.push_back(cudf::make_product_aggregation()); auto output_type = cudf::data_type{cudf::type_to_id()}; for (auto&& agg : aggregates) { diff --git a/cpp/tests/replace/clamp_test.cpp b/cpp/tests/replace/clamp_test.cpp index 239c9ce6ddd..e972ea35ed0 100644 --- a/cpp/tests/replace/clamp_test.cpp +++ b/cpp/tests/replace/clamp_test.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include diff --git a/cpp/tests/replace/normalize_replace_tests.cpp b/cpp/tests/replace/normalize_replace_tests.cpp index 2de17388ee8..c35f385329a 100644 --- a/cpp/tests/replace/normalize_replace_tests.cpp +++ b/cpp/tests/replace/normalize_replace_tests.cpp @@ -19,7 +19,6 @@ #include #include -#include #include // This is the main test fixture diff --git a/cpp/tests/replace/replace_nans_tests.cpp b/cpp/tests/replace/replace_nans_tests.cpp index 35232204db7..1b9fe92066a 100644 --- a/cpp/tests/replace/replace_nans_tests.cpp +++ b/cpp/tests/replace/replace_nans_tests.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include diff --git a/cpp/tests/replace/replace_nulls_tests.cpp b/cpp/tests/replace/replace_nulls_tests.cpp index fcee27305f2..0c8ccea52a6 100644 --- a/cpp/tests/replace/replace_nulls_tests.cpp +++ b/cpp/tests/replace/replace_nulls_tests.cpp @@ -20,13 +20,11 @@ #include #include #include -#include #include #include #include #include -#include #include #include #include diff --git a/cpp/tests/replace/replace_tests.cpp b/cpp/tests/replace/replace_tests.cpp index 1858cd7782e..ae4041bcfaf 100644 --- a/cpp/tests/replace/replace_tests.cpp +++ b/cpp/tests/replace/replace_tests.cpp @@ -20,20 +20,16 @@ #include #include #include -#include #include #include #include #include -#include -#include #include #include #include #include -#include #include @@ -356,7 +352,7 @@ void test_replace(cudf::host_span input_column, for (size_t i = 0; i < values_to_replace_column.size(); i++) { size_t k = 0; - auto pred = [=, &k, &reference_result, &expected_valid, &isReplaced](T element) { + auto pred = [=, &k, &expected_valid, &isReplaced](T element) { bool toBeReplaced = false; if (!isReplaced[k]) { if (!input_has_nulls || expected_valid[k]) { @@ -503,7 +499,7 @@ TYPED_TEST(ReplaceTest, LargeScaleReplaceTest) const size_t REPLACE_SIZE = 10000; thrust::host_vector input_column(DATA_SIZE); - std::generate(std::begin(input_column), std::end(input_column), [REPLACE_SIZE]() { + std::generate(std::begin(input_column), std::end(input_column), []() { return std::rand() % (REPLACE_SIZE); }); diff --git 
a/cpp/tests/reshape/byte_cast_tests.cpp b/cpp/tests/reshape/byte_cast_tests.cpp index b3d9b2e2f5f..59585c0e947 100644 --- a/cpp/tests/reshape/byte_cast_tests.cpp +++ b/cpp/tests/reshape/byte_cast_tests.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include diff --git a/cpp/tests/reshape/tile_tests.cpp b/cpp/tests/reshape/tile_tests.cpp index ed76b9d2ea5..25cfc5c5108 100644 --- a/cpp/tests/reshape/tile_tests.cpp +++ b/cpp/tests/reshape/tile_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,7 +15,6 @@ */ #include -#include #include #include #include diff --git a/cpp/tests/rolling/collect_ops_test.cpp b/cpp/tests/rolling/collect_ops_test.cpp index f702dc78371..e8a36d9ab48 100644 --- a/cpp/tests/rolling/collect_ops_test.cpp +++ b/cpp/tests/rolling/collect_ops_test.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include @@ -214,7 +213,7 @@ TYPED_TEST(TypedCollectListTest, RollingWindowHonoursMinPeriods) *cudf::make_collect_list_aggregation()); auto expected_result_2 = cudf::test::lists_column_wrapper{ {{}, {0, 1, 2, 3}, {1, 2, 3, 4}, {2, 3, 4, 5}, {}, {}}, - cudf::detail::make_counting_transform_iterator(0, [num_elements](auto i) { + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 0 && i < 4; })}.release(); @@ -338,7 +337,7 @@ TYPED_TEST(TypedCollectListTest, RollingWindowWithNullInputsHonoursMinPeriods) cudf::test::fixed_width_column_wrapper{0, 0, 4, 8, 12, 12, 12}.release(); auto expected_num_rows = expected_offsets->size() - 1; auto null_mask_iter = cudf::detail::make_counting_transform_iterator( - cudf::size_type{0}, [expected_num_rows](auto i) { return i > 0 && i < 4; }); + cudf::size_type{0}, [](auto i) { return i > 0 && i < 4; }); auto [null_mask, null_count] = cudf::test::detail::make_null_mask(null_mask_iter, null_mask_iter + expected_num_rows); @@ -373,7 +372,7 @@ TYPED_TEST(TypedCollectListTest, RollingWindowWithNullInputsHonoursMinPeriods) cudf::test::fixed_width_column_wrapper{0, 0, 3, 5, 8, 8, 8}.release(); auto expected_num_rows = expected_offsets->size() - 1; auto null_mask_iter = cudf::detail::make_counting_transform_iterator( - cudf::size_type{0}, [expected_num_rows](auto i) { return i > 0 && i < 4; }); + cudf::size_type{0}, [](auto i) { return i > 0 && i < 4; }); auto [null_mask, null_count] = cudf::test::detail::make_null_mask(null_mask_iter, null_mask_iter + expected_num_rows); @@ -1499,7 +1498,7 @@ TYPED_TEST(TypedCollectSetTest, RollingWindowHonoursMinPeriods) *cudf::make_collect_set_aggregation()); auto expected_result_2 = cudf::test::lists_column_wrapper{ {{}, {0, 1, 2}, {1, 2, 4}, {2, 4, 5}, {}, {}}, - cudf::detail::make_counting_transform_iterator(0, [num_elements](auto i) { + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 0 && i < 4; })}.release(); diff --git a/cpp/tests/rolling/empty_input_test.cpp b/cpp/tests/rolling/empty_input_test.cpp index e7d1e3f0b10..2e1815671a9 100644 --- a/cpp/tests/rolling/empty_input_test.cpp +++ b/cpp/tests/rolling/empty_input_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
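The RollingWindowHonoursMinPeriods cases above assert that windows holding fewer than min_periods observations produce null rows. A quick illustration of the API (inputs and expected shape are mine, not from the patch):

  auto const input = cudf::test::fixed_width_column_wrapper<int32_t>{10, 20, 30};
  auto const agg   = cudf::make_collect_list_aggregation<cudf::rolling_aggregation>();
  // preceding=2 (includes the current row), following=0, min_periods=2:
  // row 0 sees only itself, so it is null; rows 1 and 2 yield {10, 20} and {20, 30}.
  auto const out = cudf::rolling_window(input, 2, 0, 2, *agg);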
@@ -15,9 +15,7 @@ */ #include -#include #include -#include #include #include diff --git a/cpp/tests/rolling/grouped_rolling_range_test.cpp b/cpp/tests/rolling/grouped_rolling_range_test.cpp index fcfbd0eee78..2cb9b60000b 100644 --- a/cpp/tests/rolling/grouped_rolling_range_test.cpp +++ b/cpp/tests/rolling/grouped_rolling_range_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,21 +17,16 @@ #include #include #include -#include #include #include #include #include -#include -#include #include #include #include #include -#include -#include #include #include diff --git a/cpp/tests/rolling/grouped_rolling_test.cpp b/cpp/tests/rolling/grouped_rolling_test.cpp index 78d5daf7e83..78b444bcd93 100644 --- a/cpp/tests/rolling/grouped_rolling_test.cpp +++ b/cpp/tests/rolling/grouped_rolling_test.cpp @@ -19,7 +19,6 @@ #include #include #include -#include #include #include diff --git a/cpp/tests/rolling/lead_lag_test.cpp b/cpp/tests/rolling/lead_lag_test.cpp index de057e96320..6519b0ed4ee 100644 --- a/cpp/tests/rolling/lead_lag_test.cpp +++ b/cpp/tests/rolling/lead_lag_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,7 +16,6 @@ #include #include #include -#include #include #include @@ -26,7 +25,6 @@ #include #include #include -#include #include #include diff --git a/cpp/tests/rolling/nth_element_test.cpp b/cpp/tests/rolling/nth_element_test.cpp index 9cc8b6dec81..5f2b383ed55 100644 --- a/cpp/tests/rolling/nth_element_test.cpp +++ b/cpp/tests/rolling/nth_element_test.cpp @@ -17,22 +17,15 @@ #include #include #include -#include #include #include #include -#include -#include #include -#include - #include #include -#include - #include #include @@ -83,7 +76,7 @@ class rolling_exec { return *this; } - std::unique_ptr test_grouped_nth_element( + [[nodiscard]] std::unique_ptr test_grouped_nth_element( cudf::size_type n, std::optional null_handling = std::nullopt) const { return cudf::grouped_rolling_window( @@ -96,7 +89,7 @@ class rolling_exec { n, null_handling.value_or(_null_handling))); } - std::unique_ptr test_nth_element( + [[nodiscard]] std::unique_ptr test_nth_element( cudf::size_type n, std::optional null_handling = std::nullopt) const { return cudf::rolling_window(_input, diff --git a/cpp/tests/rolling/offset_row_window_test.cpp b/cpp/tests/rolling/offset_row_window_test.cpp index ec726878b34..dcaa47e722b 100644 --- a/cpp/tests/rolling/offset_row_window_test.cpp +++ b/cpp/tests/rolling/offset_row_window_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -17,14 +17,10 @@ #include #include #include -#include #include #include -#include -#include #include -#include template using fwcw = cudf::test::fixed_width_column_wrapper; @@ -41,6 +37,11 @@ using cudf::test::iterators::nulls_at; auto constexpr null = int32_t{0}; // NULL representation for int32_t; +// clang-tidy doesn't think std::transform can handle a +// thrust::constant_iterator, so this is a workaround that uses nulls_at +// instead of no_nulls +auto no_nulls_list() { return nulls_at({}); } + struct OffsetRowWindowTest : public cudf::test::BaseFixture { static ints_column const _keys; // {0, 0, 0, 0, 0, 0, 1, 1, 1, 1}; static ints_column const _values; // {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; @@ -210,7 +211,8 @@ TEST_F(OffsetRowWindowTest, OffsetRowWindow_Grouped_0_to_2) CUDF_TEST_EXPECT_COLUMNS_EQUAL( *run_rolling(*AGG_COLLECT_LIST), - lists_column{{{1, 2}, {2, 3}, {3, 4}, {4, 5}, {5}, {}, {7, 8}, {8, 9}, {9}, {}}, no_nulls}); + lists_column{{{1, 2}, {2, 3}, {3, 4}, {4, 5}, {5}, {}, {7, 8}, {8, 9}, {9}, {}}, + no_nulls_list()}); } TEST_F(OffsetRowWindowTest, OffsetRowWindow_Ungrouped_0_to_2) @@ -250,7 +252,7 @@ TEST_F(OffsetRowWindowTest, OffsetRowWindow_Ungrouped_0_to_2) CUDF_TEST_EXPECT_COLUMNS_EQUAL( *run_rolling(*AGG_COLLECT_LIST), lists_column{{{1, 2}, {2, 3}, {3, 4}, {4, 5}, {5, 6}, {6, 7}, {7, 8}, {8, 9}, {9}, {}}, - no_nulls}); + no_nulls_list()}); } // To test that preceding bounds are clamped correctly at group boundaries. diff --git a/cpp/tests/rolling/range_rolling_window_test.cpp b/cpp/tests/rolling/range_rolling_window_test.cpp index 461c41025e9..daf5fcc1d96 100644 --- a/cpp/tests/rolling/range_rolling_window_test.cpp +++ b/cpp/tests/rolling/range_rolling_window_test.cpp @@ -17,22 +17,17 @@ #include #include #include -#include #include #include -#include #include #include -#include -#include #include #include #include #include -#include #include #include diff --git a/cpp/tests/rolling/range_window_bounds_test.cpp b/cpp/tests/rolling/range_window_bounds_test.cpp index b77451bf0bc..a67555280f4 100644 --- a/cpp/tests/rolling/range_window_bounds_test.cpp +++ b/cpp/tests/rolling/range_window_bounds_test.cpp @@ -15,9 +15,6 @@ */ #include -#include -#include -#include #include #include @@ -25,8 +22,6 @@ #include -#include - struct RangeWindowBoundsTest : public cudf::test::BaseFixture {}; template diff --git a/cpp/tests/rolling/rolling_test.cpp b/cpp/tests/rolling/rolling_test.cpp index c2c22986975..72a511fd5f1 100644 --- a/cpp/tests/rolling/rolling_test.cpp +++ b/cpp/tests/rolling/rolling_test.cpp @@ -19,7 +19,6 @@ #include #include #include -#include #include #include #include @@ -30,7 +29,6 @@ #include #include #include -#include #include #include @@ -541,7 +539,7 @@ class RollingTest : public cudf::test::BaseFixture { agg_op op; for (cudf::size_type i = 0; i < num_rows; i++) { - OutputType val = agg_op::template identity(); + auto val = agg_op::template identity(); // load sizes min_periods = std::max(min_periods, 1); // at least one observation is required diff --git a/cpp/tests/scalar/factories_test.cpp b/cpp/tests/scalar/factories_test.cpp index 5f132f3ace9..26987ea1b7b 100644 --- a/cpp/tests/scalar/factories_test.cpp +++ b/cpp/tests/scalar/factories_test.cpp @@ -22,11 +22,8 @@ #include #include -#include #include -#include - class ScalarFactoryTest : public cudf::test::BaseFixture {}; template diff --git a/cpp/tests/scalar/scalar_test.cpp b/cpp/tests/scalar/scalar_test.cpp index 2d37de920d5..2b79911a95a 100644 --- a/cpp/tests/scalar/scalar_test.cpp +++ 
b/cpp/tests/scalar/scalar_test.cpp @@ -190,7 +190,7 @@ TEST_F(ListScalarTest, MoveConstructorNonNested) EXPECT_EQ(mask_ptr, s2.validity_data()); EXPECT_EQ(data_ptr, s2.view().data()); - EXPECT_EQ(s.view().data(), nullptr); + EXPECT_EQ(s.view().data(), nullptr); // NOLINT } TEST_F(ListScalarTest, MoveConstructorNested) @@ -205,8 +205,8 @@ TEST_F(ListScalarTest, MoveConstructorNested) EXPECT_EQ(mask_ptr, s2.validity_data()); EXPECT_EQ(offset_ptr, s2.view().child(0).data()); EXPECT_EQ(data_ptr, s2.view().child(1).data()); - EXPECT_EQ(s.view().data(), nullptr); - EXPECT_EQ(s.view().num_children(), 0); + EXPECT_EQ(s.view().data(), nullptr); // NOLINT + EXPECT_EQ(s.view().num_children(), 0); // NOLINT } struct StructScalarTest : public cudf::test::BaseFixture {}; diff --git a/cpp/tests/search/search_dictionary_test.cpp b/cpp/tests/search/search_dictionary_test.cpp index 78f79ccc648..a3bb1dfda10 100644 --- a/cpp/tests/search/search_dictionary_test.cpp +++ b/cpp/tests/search/search_dictionary_test.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include diff --git a/cpp/tests/search/search_list_test.cpp b/cpp/tests/search/search_list_test.cpp index 48711c21715..fb5d0fcc889 100644 --- a/cpp/tests/search/search_list_test.cpp +++ b/cpp/tests/search/search_list_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,7 +20,6 @@ #include #include -#include #include #include #include @@ -35,7 +34,6 @@ using strings_col = cudf::test::strings_column_wrapper; constexpr cudf::test::debug_output_level verbosity{cudf::test::debug_output_level::FIRST_ERROR}; constexpr int32_t null{0}; // Mark for null child elements at the current level -constexpr int32_t XXX{0}; // Mark for null elements at all levels using TestTypes = cudf::test::Concat #include -#include #include #include #include diff --git a/cpp/tests/search/search_test.cpp b/cpp/tests/search/search_test.cpp index 7550cc27161..8d750be5677 100644 --- a/cpp/tests/search/search_test.cpp +++ b/cpp/tests/search/search_test.cpp @@ -20,7 +20,6 @@ #include #include -#include #include #include diff --git a/cpp/tests/sort/is_sorted_tests.cpp b/cpp/tests/sort/is_sorted_tests.cpp index 109095192f9..e3c9f8d349e 100644 --- a/cpp/tests/sort/is_sorted_tests.cpp +++ b/cpp/tests/sort/is_sorted_tests.cpp @@ -15,7 +15,6 @@ */ #include -#include #include #include #include diff --git a/cpp/tests/sort/rank_test.cpp b/cpp/tests/sort/rank_test.cpp index e08a2105aea..ded46cb1f31 100644 --- a/cpp/tests/sort/rank_test.cpp +++ b/cpp/tests/sort/rank_test.cpp @@ -18,10 +18,8 @@ #include #include #include -#include #include -#include #include #include #include diff --git a/cpp/tests/sort/sort_nested_types_tests.cpp b/cpp/tests/sort/sort_nested_types_tests.cpp index 8ab23936ceb..ce4148a941e 100644 --- a/cpp/tests/sort/sort_nested_types_tests.cpp +++ b/cpp/tests/sort/sort_nested_types_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
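The NOLINT markers added in scalar_test.cpp exist because those tests read a moved-from scalar on purpose, to prove its device buffers were handed to the new owner; clang-tidy's bugprone-use-after-move would otherwise flag them. The shape of the check:

  auto data = cudf::test::fixed_width_column_wrapper<int32_t>{1, 2, 3};
  cudf::list_scalar s(data);
  cudf::list_scalar s2(std::move(s));
  // Deliberate use-after-move: the source must have released its data.
  EXPECT_EQ(s.view().data<int32_t>(), nullptr);  // NOLINT(bugprone-use-after-move)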
@@ -17,7 +17,6 @@ #include #include #include -#include #include #include diff --git a/cpp/tests/sort/sort_test.cpp b/cpp/tests/sort/sort_test.cpp index e84275f41ef..e1505c7a474 100644 --- a/cpp/tests/sort/sort_test.cpp +++ b/cpp/tests/sort/sort_test.cpp @@ -28,7 +28,6 @@ #include #include -#include #include #include @@ -1087,7 +1086,7 @@ TEST_F(SortCornerTest, WithEmptyStructColumn) child_columns2.push_back(std::move(child_col_1)); int_col col4{{5, 4, 3, 2, 1, 0}}; std::vector> grand_child; - grand_child.push_back(std::move(col4.release())); + grand_child.push_back(col4.release()); auto child_col_2 = cudf::make_structs_column(6, std::move(grand_child), 0, rmm::device_buffer{}); child_columns2.push_back(std::move(child_col_2)); auto struct_col3 = diff --git a/cpp/tests/sort/stable_sort_tests.cpp b/cpp/tests/sort/stable_sort_tests.cpp index 655166e0d62..88de9d51523 100644 --- a/cpp/tests/sort/stable_sort_tests.cpp +++ b/cpp/tests/sort/stable_sort_tests.cpp @@ -25,9 +25,6 @@ #include #include -#include -#include - #include #include diff --git a/cpp/tests/stream_compaction/apply_boolean_mask_tests.cpp b/cpp/tests/stream_compaction/apply_boolean_mask_tests.cpp index 6c0582fb846..1204b019739 100644 --- a/cpp/tests/stream_compaction/apply_boolean_mask_tests.cpp +++ b/cpp/tests/stream_compaction/apply_boolean_mask_tests.cpp @@ -20,9 +20,7 @@ #include #include #include -#include -#include #include #include #include @@ -31,8 +29,6 @@ #include #include -#include -#include #include struct ApplyBooleanMask : public cudf::test::BaseFixture {}; diff --git a/cpp/tests/stream_compaction/distinct_count_tests.cpp b/cpp/tests/stream_compaction/distinct_count_tests.cpp index a2dab649961..ee1bb3ead92 100644 --- a/cpp/tests/stream_compaction/distinct_count_tests.cpp +++ b/cpp/tests/stream_compaction/distinct_count_tests.cpp @@ -15,16 +15,11 @@ */ #include -#include #include #include -#include #include -#include -#include #include -#include #include #include diff --git a/cpp/tests/stream_compaction/distinct_tests.cpp b/cpp/tests/stream_compaction/distinct_tests.cpp index 14d7d8789ac..c618ff68cbb 100644 --- a/cpp/tests/stream_compaction/distinct_tests.cpp +++ b/cpp/tests/stream_compaction/distinct_tests.cpp @@ -15,7 +15,6 @@ */ #include -#include #include #include #include @@ -27,8 +26,6 @@ #include #include -#include - auto constexpr null{0}; // null at current level auto constexpr XXX{0}; // null pushed down from parent level auto constexpr NaN = std::numeric_limits::quiet_NaN(); diff --git a/cpp/tests/stream_compaction/drop_nans_tests.cpp b/cpp/tests/stream_compaction/drop_nans_tests.cpp index bf72da5c840..71321361564 100644 --- a/cpp/tests/stream_compaction/drop_nans_tests.cpp +++ b/cpp/tests/stream_compaction/drop_nans_tests.cpp @@ -15,12 +15,9 @@ */ #include -#include #include #include -#include -#include #include #include #include diff --git a/cpp/tests/stream_compaction/drop_nulls_tests.cpp b/cpp/tests/stream_compaction/drop_nulls_tests.cpp index dbac1d58195..d3b45c2323e 100644 --- a/cpp/tests/stream_compaction/drop_nulls_tests.cpp +++ b/cpp/tests/stream_compaction/drop_nulls_tests.cpp @@ -15,12 +15,10 @@ */ #include -#include #include #include #include -#include #include #include #include diff --git a/cpp/tests/stream_compaction/stable_distinct_tests.cpp b/cpp/tests/stream_compaction/stable_distinct_tests.cpp index 6c6c53331d4..cc847da6340 100644 --- a/cpp/tests/stream_compaction/stable_distinct_tests.cpp +++ b/cpp/tests/stream_compaction/stable_distinct_tests.cpp @@ -15,20 +15,16 @@ */ #include -#include 
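The sort_test change above drops a std::move wrapped around column_wrapper::release(), which already returns a prvalue unique_ptr; the extra move is redundant and only obscures the ownership transfer. Sketch:

  std::vector<std::unique_ptr<cudf::column>> children;
  cudf::test::fixed_width_column_wrapper<int32_t> col{{5, 4, 3, 2, 1, 0}};
  children.push_back(col.release());  // already an rvalue; no std::move needed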
#include #include #include #include -#include #include #include #include #include -#include - auto constexpr null{0}; // null at current level auto constexpr XXX{0}; // null pushed down from parent level auto constexpr NaN = std::numeric_limits::quiet_NaN(); diff --git a/cpp/tests/stream_compaction/unique_count_tests.cpp b/cpp/tests/stream_compaction/unique_count_tests.cpp index 640d159fc4f..bad93e92712 100644 --- a/cpp/tests/stream_compaction/unique_count_tests.cpp +++ b/cpp/tests/stream_compaction/unique_count_tests.cpp @@ -15,16 +15,11 @@ */ #include -#include #include #include -#include #include -#include -#include #include -#include #include #include diff --git a/cpp/tests/stream_compaction/unique_tests.cpp b/cpp/tests/stream_compaction/unique_tests.cpp index 4d7d23dc881..e2b32b898b3 100644 --- a/cpp/tests/stream_compaction/unique_tests.cpp +++ b/cpp/tests/stream_compaction/unique_tests.cpp @@ -15,22 +15,16 @@ */ #include -#include #include #include #include -#include #include -#include #include #include #include #include -#include -#include - using cudf::nan_policy; using cudf::null_equality; using cudf::null_policy; @@ -43,7 +37,6 @@ auto constexpr KEEP_ANY = cudf::duplicate_keep_option::KEEP_ANY; auto constexpr KEEP_FIRST = cudf::duplicate_keep_option::KEEP_FIRST; auto constexpr KEEP_LAST = cudf::duplicate_keep_option::KEEP_LAST; auto constexpr KEEP_NONE = cudf::duplicate_keep_option::KEEP_NONE; -auto constexpr NULL_EQUAL = cudf::null_equality::EQUAL; auto constexpr NULL_UNEQUAL = cudf::null_equality::UNEQUAL; using int32s_col = cudf::test::fixed_width_column_wrapper; diff --git a/cpp/tests/streams/binaryop_test.cpp b/cpp/tests/streams/binaryop_test.cpp index 2a7b52b1b6b..3dcc6f9e632 100644 --- a/cpp/tests/streams/binaryop_test.cpp +++ b/cpp/tests/streams/binaryop_test.cpp @@ -21,7 +21,6 @@ #include #include -#include #include class BinaryopTest : public cudf::test::BaseFixture {}; diff --git a/cpp/tests/streams/datetime_test.cpp b/cpp/tests/streams/datetime_test.cpp index 82629156fa6..29b302c3637 100644 --- a/cpp/tests/streams/datetime_test.cpp +++ b/cpp/tests/streams/datetime_test.cpp @@ -35,52 +35,62 @@ class DatetimeTest : public cudf::test::BaseFixture { TEST_F(DatetimeTest, ExtractYear) { - cudf::datetime::extract_year(timestamps, cudf::test::get_default_stream()); + cudf::datetime::extract_datetime_component( + timestamps, cudf::datetime::datetime_component::YEAR, cudf::test::get_default_stream()); } TEST_F(DatetimeTest, ExtractMonth) { - cudf::datetime::extract_month(timestamps, cudf::test::get_default_stream()); + cudf::datetime::extract_datetime_component( + timestamps, cudf::datetime::datetime_component::MONTH, cudf::test::get_default_stream()); } TEST_F(DatetimeTest, ExtractDay) { - cudf::datetime::extract_day(timestamps, cudf::test::get_default_stream()); + cudf::datetime::extract_datetime_component( + timestamps, cudf::datetime::datetime_component::DAY, cudf::test::get_default_stream()); } TEST_F(DatetimeTest, ExtractWeekday) { - cudf::datetime::extract_weekday(timestamps, cudf::test::get_default_stream()); + cudf::datetime::extract_datetime_component( + timestamps, cudf::datetime::datetime_component::WEEKDAY, cudf::test::get_default_stream()); } TEST_F(DatetimeTest, ExtractHour) { - cudf::datetime::extract_hour(timestamps, cudf::test::get_default_stream()); + cudf::datetime::extract_datetime_component( + timestamps, cudf::datetime::datetime_component::HOUR, cudf::test::get_default_stream()); } TEST_F(DatetimeTest, ExtractMinute) { - 
cudf::datetime::extract_minute(timestamps, cudf::test::get_default_stream()); + cudf::datetime::extract_datetime_component( + timestamps, cudf::datetime::datetime_component::MINUTE, cudf::test::get_default_stream()); } TEST_F(DatetimeTest, ExtractSecond) { - cudf::datetime::extract_second(timestamps, cudf::test::get_default_stream()); + cudf::datetime::extract_datetime_component( + timestamps, cudf::datetime::datetime_component::SECOND, cudf::test::get_default_stream()); } TEST_F(DatetimeTest, ExtractMillisecondFraction) { - cudf::datetime::extract_millisecond_fraction(timestamps, cudf::test::get_default_stream()); + cudf::datetime::extract_datetime_component( + timestamps, cudf::datetime::datetime_component::MILLISECOND, cudf::test::get_default_stream()); } TEST_F(DatetimeTest, ExtractMicrosecondFraction) { - cudf::datetime::extract_microsecond_fraction(timestamps, cudf::test::get_default_stream()); + cudf::datetime::extract_datetime_component( + timestamps, cudf::datetime::datetime_component::MICROSECOND, cudf::test::get_default_stream()); } TEST_F(DatetimeTest, ExtractNanosecondFraction) { - cudf::datetime::extract_nanosecond_fraction(timestamps, cudf::test::get_default_stream()); + cudf::datetime::extract_datetime_component( + timestamps, cudf::datetime::datetime_component::NANOSECOND, cudf::test::get_default_stream()); } TEST_F(DatetimeTest, LastDayOfMonth) diff --git a/cpp/tests/streams/io/csv_test.cpp b/cpp/tests/streams/io/csv_test.cpp index 42894a0ebcb..a74ee64f8de 100644 --- a/cpp/tests/streams/io/csv_test.cpp +++ b/cpp/tests/streams/io/csv_test.cpp @@ -17,13 +17,9 @@ #include #include #include -#include #include -#include -#include #include -#include #include #include diff --git a/cpp/tests/streams/io/json_test.cpp b/cpp/tests/streams/io/json_test.cpp index f98e685ed0c..d352c6c3b2a 100644 --- a/cpp/tests/streams/io/json_test.cpp +++ b/cpp/tests/streams/io/json_test.cpp @@ -19,9 +19,7 @@ #include #include -#include #include -#include #include #include diff --git a/cpp/tests/streams/io/multibyte_split_test.cpp b/cpp/tests/streams/io/multibyte_split_test.cpp index b0eff1d3340..5bb17226029 100644 --- a/cpp/tests/streams/io/multibyte_split_test.cpp +++ b/cpp/tests/streams/io/multibyte_split_test.cpp @@ -17,7 +17,6 @@ #include #include -#include #include #include diff --git a/cpp/tests/streams/io/orc_test.cpp b/cpp/tests/streams/io/orc_test.cpp index cc43bf15b5d..10722557e6a 100644 --- a/cpp/tests/streams/io/orc_test.cpp +++ b/cpp/tests/streams/io/orc_test.cpp @@ -17,19 +17,11 @@ #include #include #include -#include -#include #include #include -#include #include -#include -#include -#include -#include -#include #include #include diff --git a/cpp/tests/streams/io/parquet_test.cpp b/cpp/tests/streams/io/parquet_test.cpp index 9d2dec2d697..18bb80e64af 100644 --- a/cpp/tests/streams/io/parquet_test.cpp +++ b/cpp/tests/streams/io/parquet_test.cpp @@ -17,13 +17,9 @@ #include #include #include -#include -#include #include #include -#include -#include #include #include diff --git a/cpp/tests/streams/join_test.cpp b/cpp/tests/streams/join_test.cpp index 2811bb676fa..27bd7e080c9 100644 --- a/cpp/tests/streams/join_test.cpp +++ b/cpp/tests/streams/join_test.cpp @@ -19,11 +19,9 @@ #include #include -#include #include #include #include -#include #include #include diff --git a/cpp/tests/streams/null_mask_test.cpp b/cpp/tests/streams/null_mask_test.cpp index e96224003f4..ed37a72545f 100644 --- a/cpp/tests/streams/null_mask_test.cpp +++ b/cpp/tests/streams/null_mask_test.cpp @@ -14,15 +14,12 @@ * 
limitations under the License. */ -#include - #include #include #include #include #include -#include class NullMaskTest : public cudf::test::BaseFixture {}; diff --git a/cpp/tests/streams/partitioning_test.cpp b/cpp/tests/streams/partitioning_test.cpp new file mode 100644 index 00000000000..636c5c1f1f9 --- /dev/null +++ b/cpp/tests/streams/partitioning_test.cpp @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +using cudf::test::fixed_width_column_wrapper; +using cudf::test::strings_column_wrapper; + +class PartitionTest : public cudf::test::BaseFixture {}; + +TEST_F(PartitionTest, Struct) +{ + fixed_width_column_wrapper A({1, 2}, {0, 1}); + auto struct_col = cudf::test::structs_column_wrapper({A}, {0, 1}).release(); + auto table_to_partition = cudf::table_view{{*struct_col}}; + fixed_width_column_wrapper map{9, 2}; + + auto num_partitions = 12; + auto result = + cudf::partition(table_to_partition, map, num_partitions, cudf::test::get_default_stream()); +} + +TEST_F(PartitionTest, EmptyInput) +{ + auto const empty_column = fixed_width_column_wrapper{}; + auto const num_partitions = 5; + auto const start_partition = 0; + auto const [out_table, out_offsets] = + cudf::round_robin_partition(cudf::table_view{{empty_column}}, + num_partitions, + start_partition, + cudf::test::get_default_stream()); +} + +TEST_F(PartitionTest, ZeroPartitions) +{ + fixed_width_column_wrapper floats({1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f}); + fixed_width_column_wrapper integers({1, 2, 3, 4, 5, 6, 7, 8}); + strings_column_wrapper strings({"a", "bb", "ccc", "d", "ee", "fff", "gg", "h"}); + auto input = cudf::table_view({floats, integers, strings}); + + auto columns_to_hash = std::vector({2}); + + cudf::size_type const num_partitions = 0; + auto [output, offsets] = cudf::hash_partition(input, + columns_to_hash, + num_partitions, + cudf::hash_id::HASH_MURMUR3, + cudf::DEFAULT_HASH_SEED, + cudf::test::get_default_stream()); +} diff --git a/cpp/tests/streams/quantile_test.cpp b/cpp/tests/streams/quantile_test.cpp new file mode 100644 index 00000000000..4f4f16a9e70 --- /dev/null +++ b/cpp/tests/streams/quantile_test.cpp @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
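
For reference, the datetime_test.cpp hunk above funnels all of the removed per-component wrappers (extract_year, extract_month, ..., extract_nanosecond_fraction) into a single entry point. A minimal sketch of the consolidated call, assuming the cudf 24.12 headers (the wrapper function name here is illustrative only):

#include <cudf/column/column.hpp>
#include <cudf/column/column_view.hpp>
#include <cudf/datetime.hpp>

#include <rmm/cuda_stream_view.hpp>

// One function now covers every component; the enum selects YEAR, MONTH, DAY,
// WEEKDAY, HOUR, MINUTE, SECOND, MILLISECOND, MICROSECOND or NANOSECOND.
std::unique_ptr<cudf::column> extract_year_of(cudf::column_view const& timestamps,
                                              rmm::cuda_stream_view stream)
{
  return cudf::datetime::extract_datetime_component(
    timestamps, cudf::datetime::datetime_component::YEAR, stream);
}
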
+ */ + +#include +#include + +#include +#include +#include +#include +#include + +#include + +struct QuantileTest : public cudf::test::BaseFixture {}; + +TEST_F(QuantileTest, TestMultiColumnUnsorted) +{ + auto input_a = cudf::test::strings_column_wrapper( + {"C", "B", "A", "A", "D", "B", "D", "B", "D", "C", "C", "C", + "D", "B", "D", "B", "C", "C", "A", "D", "B", "A", "A", "A"}, + {true, true, true, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true, true}); + + cudf::test::fixed_width_column_wrapper input_b( + {4, 3, 5, 0, 1, 0, 4, 1, 5, 3, 0, 5, 2, 4, 3, 2, 1, 2, 3, 0, 5, 1, 4, 2}, + {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); + + auto input = cudf::table_view({input_a, input_b}); + + auto actual = cudf::quantiles(input, + {0.0f, 0.5f, 0.7f, 0.25f, 1.0f}, + cudf::interpolation::NEAREST, + cudf::sorted::NO, + {cudf::order::ASCENDING, cudf::order::DESCENDING}, + {}, + cudf::test::get_default_stream()); +} + +TEST_F(QuantileTest, TestEmpty) +{ + auto input = cudf::test::fixed_width_column_wrapper({}); + cudf::quantile( + input, {0.5, 0.25}, cudf::interpolation::LINEAR, {}, true, cudf::test::get_default_stream()); +} + +TEST_F(QuantileTest, EmptyInput) +{ + auto empty_ = cudf::tdigest::detail::make_empty_tdigests_column( + 1, cudf::test::get_default_stream(), cudf::get_current_device_resource_ref()); + cudf::test::fixed_width_column_wrapper percentiles{0.0, 0.25, 0.3}; + + std::vector input; + input.push_back(*empty_); + input.push_back(*empty_); + input.push_back(*empty_); + auto empty = cudf::concatenate(input, cudf::test::get_default_stream()); + + cudf::tdigest::tdigest_column_view tdv(*empty); + auto result = cudf::percentile_approx(tdv, percentiles, cudf::test::get_default_stream()); +} diff --git a/cpp/tests/streams/reduction_test.cpp b/cpp/tests/streams/reduction_test.cpp index b4f013fc960..9ab972302e4 100644 --- a/cpp/tests/streams/reduction_test.cpp +++ b/cpp/tests/streams/reduction_test.cpp @@ -17,11 +17,8 @@ #include #include #include -#include -#include #include -#include #include #include diff --git a/cpp/tests/streams/rolling_test.cpp b/cpp/tests/streams/rolling_test.cpp index b352ad2c0d2..4d9899870b4 100644 --- a/cpp/tests/streams/rolling_test.cpp +++ b/cpp/tests/streams/rolling_test.cpp @@ -17,12 +17,10 @@ #include #include #include -#include #include #include #include -#include class RollingTest : public cudf::test::BaseFixture {}; diff --git a/cpp/tests/streams/round_test.cpp b/cpp/tests/streams/round_test.cpp new file mode 100644 index 00000000000..b8fda022db8 --- /dev/null +++ b/cpp/tests/streams/round_test.cpp @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
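
The quantile_test.cpp file added above exercises the stream-accepting quantile overloads. A condensed sketch of the column-level call with hypothetical values; per the cudf API, the `{}` argument is the optional ordered_indices column (empty means the input is used in its existing order) and the trailing `true` requests exact double-typed results:

#include <cudf/quantiles.hpp>
#include <cudf_test/column_wrapper.hpp>

void quantile_example()
{
  // 25th and 50th percentiles with linear interpolation between neighbors.
  auto input  = cudf::test::fixed_width_column_wrapper<double>({1.0, 2.0, 3.0, 4.0});
  auto result = cudf::quantile(input, {0.25, 0.5}, cudf::interpolation::LINEAR, {}, true);
}
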
+ */ + +#include +#include +#include + +#include +#include + +#include + +class RoundTest : public cudf::test::BaseFixture {}; + +TEST_F(RoundTest, RoundHalfToEven) +{ + std::vector vals = {1.729, 17.29, 172.9, 1729}; + cudf::test::fixed_width_column_wrapper input(vals.begin(), vals.end()); + cudf::round(input, 0, cudf::rounding_method::HALF_UP, cudf::test::get_default_stream()); +} + +TEST_F(RoundTest, RoundHalfAwayFromEven) +{ + std::vector vals = {1.5, 2.5, 1.35, 1.45, 15, 25}; + cudf::test::fixed_width_column_wrapper input(vals.begin(), vals.end()); + cudf::round(input, -1, cudf::rounding_method::HALF_EVEN, cudf::test::get_default_stream()); +} diff --git a/cpp/tests/streams/stream_compaction_test.cpp b/cpp/tests/streams/stream_compaction_test.cpp index 443f4548b2c..e7b282601e1 100644 --- a/cpp/tests/streams/stream_compaction_test.cpp +++ b/cpp/tests/streams/stream_compaction_test.cpp @@ -15,22 +15,16 @@ */ #include -#include #include #include #include -#include #include #include #include #include #include -#include - -auto constexpr null{0}; // null at current level -auto constexpr XXX{0}; // null pushed down from parent level auto constexpr NaN = std::numeric_limits::quiet_NaN(); auto constexpr KEEP_ANY = cudf::duplicate_keep_option::KEEP_ANY; auto constexpr KEEP_FIRST = cudf::duplicate_keep_option::KEEP_FIRST; diff --git a/cpp/tests/streams/strings/factory_test.cpp b/cpp/tests/streams/strings/factory_test.cpp new file mode 100644 index 00000000000..449e0830b0c --- /dev/null +++ b/cpp/tests/streams/strings/factory_test.cpp @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
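
The round_test.cpp file added above only verifies that the stream overloads compile and run; note that the rounding_method argument, not the test name, determines the behavior. A small sketch with hypothetical values, assuming the usual cudf semantics (HALF_UP rounds halfway cases away from zero, HALF_EVEN rounds them to the nearest even digit):

#include <cudf/round.hpp>
#include <cudf_test/column_wrapper.hpp>

void round_example()
{
  auto input = cudf::test::fixed_width_column_wrapper<double>({1.5, 2.5, -2.5});
  // HALF_UP:   1.5 -> 2,  2.5 -> 3,  -2.5 -> -3  (ties away from zero)
  auto away = cudf::round(input, 0, cudf::rounding_method::HALF_UP);
  // HALF_EVEN: 1.5 -> 2,  2.5 -> 2,  -2.5 -> -2  (banker's rounding)
  auto even = cudf::round(input, 0, cudf::rounding_method::HALF_EVEN);
}
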
+ */ + +#include +#include + +#include +#include + +#include + +#include + +#include +#include + +class StringsFactoryTest : public cudf::test::BaseFixture {}; + +using string_pair = thrust::pair; + +TEST_F(StringsFactoryTest, StringConstructionFromPairs) +{ + auto const stream = cudf::test::get_default_stream(); + + auto const h_data = std::vector{'a', 'b', 'c'}; + auto const d_data = cudf::detail::make_device_uvector_async( + h_data, stream, cudf::get_current_device_resource_ref()); + + auto const h_input = + std::vector{{d_data.data(), 1}, {d_data.data() + 1, 1}, {d_data.data() + 2, 1}}; + auto const d_input = cudf::detail::make_device_uvector_async( + h_input, stream, cudf::get_current_device_resource_ref()); + auto const input = cudf::device_span{d_input.data(), d_input.size()}; + cudf::make_strings_column(input, stream); +} + +TEST_F(StringsFactoryTest, StringBatchConstruction) +{ + auto const stream = cudf::test::get_default_stream(); + + auto const h_data = std::vector{'a', 'b', 'c'}; + auto const d_data = cudf::detail::make_device_uvector_async( + h_data, stream, cudf::get_current_device_resource_ref()); + + auto const h_input = + std::vector{{d_data.data(), 1}, {d_data.data() + 1, 1}, {d_data.data() + 2, 1}}; + auto const d_input = cudf::detail::make_device_uvector_async( + h_input, stream, cudf::get_current_device_resource_ref()); + + std::vector> input( + 10, cudf::device_span{d_input.data(), d_input.size()}); + cudf::make_strings_column_batch(input, stream); +} diff --git a/cpp/tests/streams/strings/find_test.cpp b/cpp/tests/streams/strings/find_test.cpp index 52839c6fc9f..e5a1ee0988c 100644 --- a/cpp/tests/streams/strings/find_test.cpp +++ b/cpp/tests/streams/strings/find_test.cpp @@ -46,4 +46,5 @@ TEST_F(StringsFindTest, Find) auto const pattern = std::string("[a-z]"); auto const prog = cudf::strings::regex_program::create(pattern); cudf::strings::findall(view, *prog, cudf::test::get_default_stream()); + cudf::strings::find_re(view, *prog, cudf::test::get_default_stream()); } diff --git a/cpp/tests/streams/strings/reverse_test.cpp b/cpp/tests/streams/strings/reverse_test.cpp index 4b4d0a7aff5..154e1c1b715 100644 --- a/cpp/tests/streams/strings/reverse_test.cpp +++ b/cpp/tests/streams/strings/reverse_test.cpp @@ -21,7 +21,6 @@ #include #include -#include class StringsReverseTest : public cudf::test::BaseFixture {}; diff --git a/cpp/tests/streams/text/subword_tokenize_test.cpp b/cpp/tests/streams/text/subword_tokenize_test.cpp new file mode 100644 index 00000000000..9474e6b269c --- /dev/null +++ b/cpp/tests/streams/text/subword_tokenize_test.cpp @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
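
The factory_test.cpp file above builds strings columns from device arrays of (pointer, length) pairs. A condensed sketch of the single-column path, assuming the pair data already lives on the device (the helper name is illustrative; a {nullptr, 0} entry becomes a null row):

#include <cudf/column/column_factories.hpp>
#include <cudf/utilities/span.hpp>

#include <rmm/cuda_stream_view.hpp>

#include <thrust/pair.h>

using string_pair = thrust::pair<char const*, cudf::size_type>;

// Each pair references device-resident characters; the character buffer must
// stay valid until make_strings_column returns.
std::unique_ptr<cudf::column> strings_from_pairs(cudf::device_span<string_pair const> pairs,
                                                 rmm::cuda_stream_view stream)
{
  return cudf::make_strings_column(pairs, stream);
}

The batch variant tested above, cudf::make_strings_column_batch, takes a vector of such spans and amortizes the kernel and allocation overhead across many small columns.
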
+ */ + +#include +#include +#include +#include + +#include +#include + +#include + +#include +#include + +// Global environment for temporary files +auto const temp_env = static_cast( + ::testing::AddGlobalTestEnvironment(new cudf::test::TempDirTestEnvironment)); + +struct TextSubwordTest : public cudf::test::BaseFixture {}; + +// Create a fake hashed vocab text file for the tests in this source file. +// The vocab only includes the following words: +// 'this', 'is', 'a', 'test', 'tést' +// The period '.' character also has a token id. +void create_hashed_vocab(std::string const& hash_file) +{ + constexpr size_t coefsize = 23; + std::vector> coefficients(coefsize, {65559, 0}); + std::ofstream outfile(hash_file, std::ofstream::out); + outfile << "1\n0\n" << coefficients.size() << "\n"; + for (auto c : coefficients) { + outfile << c.first << " " << c.second << "\n"; + } + std::vector hash_table(coefsize, 0); + outfile << hash_table.size() << "\n"; + hash_table[0] = 3015668L; // based on values + hash_table[1] = 6205475701751155871L; // from the + hash_table[5] = 6358029; // bert_hash_table.txt + hash_table[16] = 451412625363L; // file for the test + hash_table[20] = 6206321707968235495L; // words above + for (auto h : hash_table) { + outfile << h << "\n"; + } + outfile << "100\n101\n102\n\n"; +} + +TEST(TextSubwordTest, Tokenize) +{ + uint32_t const nrows = 100; + std::vector h_strings(nrows, "This is a test. A test this is."); + cudf::test::strings_column_wrapper strings(h_strings.cbegin(), h_strings.cend()); + std::string const hash_file = temp_env->get_temp_filepath("hashed_vocab.txt"); + create_hashed_vocab(hash_file); + auto vocab = nvtext::load_vocabulary_file(hash_file, cudf::test::get_default_stream()); + + uint32_t const max_sequence_length = 16; + uint32_t const stride = 16; + + auto result = nvtext::subword_tokenize(cudf::strings_column_view{strings}, + *vocab, + max_sequence_length, + stride, + true, // do_lower_case + false, // do_truncate + cudf::test::get_default_stream()); +} diff --git a/cpp/tests/streams/transform_test.cpp b/cpp/tests/streams/transform_test.cpp index 9187672221c..9f168abcb31 100644 --- a/cpp/tests/streams/transform_test.cpp +++ b/cpp/tests/streams/transform_test.cpp @@ -15,24 +15,18 @@ */ #include -#include #include #include -#include -#include #include -#include -#include #include -#include #include #include class TransformTest : public cudf::test::BaseFixture {}; template -void test_udf(char const udf[], Data data_init, cudf::size_type size, bool is_ptx) +void test_udf(char const* udf, Data data_init, cudf::size_type size, bool is_ptx) { auto all_valid = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); auto data_iter = cudf::detail::make_counting_transform_iterator(0, data_init); diff --git a/cpp/tests/streams/transpose_test.cpp b/cpp/tests/streams/transpose_test.cpp new file mode 100644 index 00000000000..44e785435a5 --- /dev/null +++ b/cpp/tests/streams/transpose_test.cpp @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
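
For orientation, nvtext::subword_tokenize as driven above returns its tensors as flattened cudf columns. A hedged sketch of consuming the result, assuming the tokenizer_result members declared in nvtext/subword_tokenize.hpp:

#include <nvtext/subword_tokenize.hpp>

// Each logical input row occupies max_sequence_length slots in the output:
//   tensor_token_ids      - one vocabulary id per token slot
//   tensor_attention_mask - 1 where a real token is present, 0 for padding
//   tensor_metadata       - per-row bookkeeping (row id, first and last slot)
void inspect(nvtext::tokenizer_result const& result)
{
  auto ids  = result.tensor_token_ids->view();
  auto mask = result.tensor_attention_mask->view();
  auto meta = result.tensor_metadata->view();
}
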
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +class TransposeTest : public cudf::test::BaseFixture {}; + +TEST_F(TransposeTest, StringTest) +{ + using ColumnWrapper = cudf::test::strings_column_wrapper; + size_t ncols = 10; + size_t nrows = 10; + + std::mt19937 rng(1); + + auto const values = [&rng, nrows, ncols]() { + std::vector> values(ncols); + std::for_each(values.begin(), values.end(), [&rng, nrows](std::vector& col) { + col.resize(nrows); + std::generate(col.begin(), col.end(), [&rng]() { return std::to_string(rng()); }); + }); + return values; + }(); + + auto input_cols = [&values]() { + std::vector columns; + columns.reserve(values.size()); + for (auto const& value_col : values) { + columns.emplace_back(value_col.begin(), value_col.end()); + } + return columns; + }(); + + auto input_view = [&input_cols]() { + std::vector views(input_cols.size()); + std::transform(input_cols.begin(), input_cols.end(), views.begin(), [](auto const& col) { + return static_cast(col); + }); + return cudf::table_view(views); + }(); + + auto result = transpose(input_view, cudf::test::get_default_stream()); +} + +CUDF_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/strings/array_tests.cpp b/cpp/tests/strings/array_tests.cpp index 9c0ecaa52c0..06b9c2fa3c1 100644 --- a/cpp/tests/strings/array_tests.cpp +++ b/cpp/tests/strings/array_tests.cpp @@ -23,10 +23,8 @@ #include #include #include -#include #include #include -#include #include #include diff --git a/cpp/tests/strings/chars_types_tests.cpp b/cpp/tests/strings/chars_types_tests.cpp index 7e530b2a34d..5923f8dee5a 100644 --- a/cpp/tests/strings/chars_types_tests.cpp +++ b/cpp/tests/strings/chars_types_tests.cpp @@ -24,6 +24,7 @@ #include +#include #include struct StringsCharsTest : public cudf::test::BaseFixture {}; @@ -50,20 +51,20 @@ TEST_P(CharsTypes, AllTypes) "de", "\t\r\n\f "}; - bool expecteds[] = {false, false, false, false, false, false, false, false, - false, false, false, false, false, true, false, false, // decimal - false, false, false, false, false, false, false, false, - false, true, false, true, false, true, false, false, // numeric - false, false, false, false, false, false, false, false, - false, false, false, true, false, true, false, false, // digit - true, true, false, true, false, false, false, false, - false, false, false, false, false, false, true, false, // alpha - false, false, false, false, false, false, false, false, - false, false, false, false, false, false, false, true, // space - false, false, false, true, false, false, false, false, - false, false, false, false, false, false, false, false, // upper - false, true, false, false, false, false, false, false, - false, false, false, false, false, false, true, false}; // lower + std::array expecteds{false, false, false, false, false, false, false, false, + false, false, false, false, false, true, false, false, // decimal + false, false, false, false, false, false, false, false, + false, true, false, true, false, true, false, false, // numeric + false, false, false, false, false, false, false, false, + false, false, false, true, false, true, false, false, // digit + true, true, false, true, false, false, false, false, + false, false, false, false, false, false, true, false, // alpha + false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, true, // space + false, false, false, 
true, false, false, false, false, + false, false, false, false, false, false, false, false, // upper + false, true, false, false, false, false, false, false, + false, false, false, false, false, false, true, false}; // lower auto is_parm = GetParam(); diff --git a/cpp/tests/strings/combine/concatenate_tests.cpp b/cpp/tests/strings/combine/concatenate_tests.cpp index bb57d6f5e8a..e53adcf373a 100644 --- a/cpp/tests/strings/combine/concatenate_tests.cpp +++ b/cpp/tests/strings/combine/concatenate_tests.cpp @@ -22,7 +22,6 @@ #include #include #include -#include #include #include diff --git a/cpp/tests/strings/combine/join_list_elements_tests.cpp b/cpp/tests/strings/combine/join_list_elements_tests.cpp index 00317146088..c92f1cfc8f8 100644 --- a/cpp/tests/strings/combine/join_list_elements_tests.cpp +++ b/cpp/tests/strings/combine/join_list_elements_tests.cpp @@ -22,7 +22,6 @@ #include #include #include -#include using namespace cudf::test::iterators; diff --git a/cpp/tests/strings/concatenate_tests.cpp b/cpp/tests/strings/concatenate_tests.cpp index 5cf4015b9e9..51dcc60d95e 100644 --- a/cpp/tests/strings/concatenate_tests.cpp +++ b/cpp/tests/strings/concatenate_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,7 +20,6 @@ #include #include -#include #include diff --git a/cpp/tests/strings/contains_tests.cpp b/cpp/tests/strings/contains_tests.cpp index acf850c7a66..cceec1d3537 100644 --- a/cpp/tests/strings/contains_tests.cpp +++ b/cpp/tests/strings/contains_tests.cpp @@ -32,6 +32,7 @@ #include #include +#include #include struct StringsContainsTests : public cudf::test::BaseFixture {}; @@ -167,10 +168,8 @@ TEST_F(StringsContainsTests, MatchesTest) auto strings_view = cudf::strings_column_view(strings); { auto const pattern = std::string("lazy"); - bool h_expected[] = {false, false, true, false, false, false, false}; cudf::test::fixed_width_column_wrapper expected( - h_expected, - h_expected + h_strings.size(), + {false, false, true, false, false, false, false}, thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); auto prog = cudf::strings::regex_program::create(pattern); auto results = cudf::strings::matches_re(strings_view, *prog); @@ -178,10 +177,8 @@ TEST_F(StringsContainsTests, MatchesTest) } { auto const pattern = std::string("\\d+"); - bool h_expected[] = {false, false, false, true, true, false, false}; cudf::test::fixed_width_column_wrapper expected( - h_expected, - h_expected + h_strings.size(), + {false, false, false, true, true, false, false}, thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); auto prog = cudf::strings::regex_program::create(pattern); auto results = cudf::strings::matches_re(strings_view, *prog); @@ -189,10 +186,8 @@ TEST_F(StringsContainsTests, MatchesTest) } { auto const pattern = std::string("@\\w+"); - bool h_expected[] = {false, false, false, false, false, false, false}; cudf::test::fixed_width_column_wrapper expected( - h_expected, - h_expected + h_strings.size(), + {false, false, false, false, false, false, false}, thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); auto prog = cudf::strings::regex_program::create(pattern); auto results = cudf::strings::matches_re(strings_view, *prog); @@ -200,10 +195,8 @@ 
TEST_F(StringsContainsTests, MatchesTest) } { auto const pattern = std::string(".*"); - bool h_expected[] = {true, true, true, true, true, false, true}; cudf::test::fixed_width_column_wrapper expected( - h_expected, - h_expected + h_strings.size(), + {true, true, true, true, true, false, true}, thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); auto prog = cudf::strings::regex_program::create(pattern); auto results = cudf::strings::matches_re(strings_view, *prog); @@ -335,9 +328,9 @@ TEST_F(StringsContainsTests, EmbeddedNullCharacter) { std::vector data(10); std::generate(data.begin(), data.end(), [n = 0]() mutable { - char first = static_cast('A' + n++); - char raw_data[] = {first, '\0', 'B'}; - return std::string{raw_data, 3}; + char first = static_cast('A' + n++); + std::array raw_data = {first, '\0', 'B'}; + return std::string{raw_data.data(), 3}; }); cudf::test::strings_column_wrapper input(data.begin(), data.end()); auto strings_view = cudf::strings_column_view(input); @@ -481,6 +474,54 @@ TEST_F(StringsContainsTests, FixedQuantifier) } } +TEST_F(StringsContainsTests, ZeroRangeQuantifier) +{ + auto input = cudf::test::strings_column_wrapper({"a", "", "abc", "XYAZ", "ABC", "ZYXA"}); + auto sv = cudf::strings_column_view(input); + + auto pattern = std::string("A{0,}"); // should match everything + auto prog = cudf::strings::regex_program::create(pattern); + + { + auto expected = cudf::test::fixed_width_column_wrapper({1, 1, 1, 1, 1, 1}); + auto results = cudf::strings::contains_re(sv, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + } + { + auto expected = cudf::test::fixed_width_column_wrapper({2, 1, 4, 5, 4, 5}); + auto results = cudf::strings::count_re(sv, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + } + + pattern = std::string("(?:ab){0,3}"); + prog = cudf::strings::regex_program::create(pattern); + + { + auto expected = cudf::test::fixed_width_column_wrapper({1, 1, 1, 1, 1, 1}); + auto results = cudf::strings::contains_re(sv, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + } + { + auto expected = cudf::test::fixed_width_column_wrapper({2, 1, 3, 5, 4, 5}); + auto results = cudf::strings::count_re(sv, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + } +} + +TEST_F(StringsContainsTests, NestedQuantifier) +{ + auto input = cudf::test::strings_column_wrapper({"TEST12 1111 2222 3333 4444 5555", + "0000 AAAA 9999 BBBB 8888", + "7777 6666 4444 3333", + "12345 3333 4444 1111 ABCD"}); + auto sv = cudf::strings_column_view(input); + auto pattern = std::string(R"((\d{4}\s){4})"); + cudf::test::fixed_width_column_wrapper expected({true, false, false, true}); + auto prog = cudf::strings::regex_program::create(pattern); + auto results = cudf::strings::contains_re(sv, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); +} + TEST_F(StringsContainsTests, QuantifierErrors) { EXPECT_THROW(cudf::strings::regex_program::create("^+"), cudf::logic_error); @@ -749,11 +790,11 @@ TEST_F(StringsContainsTests, ASCII) auto input = cudf::test::strings_column_wrapper({"abc \t\f\r 12", "áé  ❽❽", "aZ ❽4", "XYZ 8"}); auto view = cudf::strings_column_view(input); - std::string patterns[] = {R"(\w+[\s]+\d+)", - R"([^\W]+\s+[^\D]+)", - R"([\w]+[^\S]+[\d]+)", - R"([\w]+\s+[\d]+)", - R"(\w+\s+\d+)"}; + std::array patterns = {R"(\w+[\s]+\d+)", + R"([^\W]+\s+[^\D]+)", + R"([\w]+[^\S]+[\d]+)", + R"([\w]+\s+[\d]+)", + R"(\w+\s+\d+)"}; for (auto ptn : patterns) { auto expected_contains =
cudf::test::fixed_width_column_wrapper({1, 0, 0, 0}); @@ -787,24 +828,18 @@ TEST_F(StringsContainsTests, MediumRegex) auto strings_view = cudf::strings_column_view(strings); { - auto results = cudf::strings::contains_re(strings_view, *prog); - bool h_expected[] = {true, false, false}; - cudf::test::fixed_width_column_wrapper expected(h_expected, - h_expected + h_strings.size()); + auto results = cudf::strings::contains_re(strings_view, *prog); + cudf::test::fixed_width_column_wrapper expected({true, false, false}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); } { - auto results = cudf::strings::matches_re(strings_view, *prog); - bool h_expected[] = {true, false, false}; - cudf::test::fixed_width_column_wrapper expected(h_expected, - h_expected + h_strings.size()); + auto results = cudf::strings::matches_re(strings_view, *prog); + cudf::test::fixed_width_column_wrapper expected({true, false, false}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); } { - auto results = cudf::strings::count_re(strings_view, *prog); - int32_t h_expected[] = {1, 0, 0}; - cudf::test::fixed_width_column_wrapper expected(h_expected, - h_expected + h_strings.size()); + auto results = cudf::strings::count_re(strings_view, *prog); + cudf::test::fixed_width_column_wrapper expected({1, 0, 0}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); } } @@ -828,24 +863,18 @@ TEST_F(StringsContainsTests, LargeRegex) auto strings_view = cudf::strings_column_view(strings); { - auto results = cudf::strings::contains_re(strings_view, *prog); - bool h_expected[] = {true, false, false}; - cudf::test::fixed_width_column_wrapper expected(h_expected, - h_expected + h_strings.size()); + auto results = cudf::strings::contains_re(strings_view, *prog); + cudf::test::fixed_width_column_wrapper expected({true, false, false}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); } { - auto results = cudf::strings::matches_re(strings_view, *prog); - bool h_expected[] = {true, false, false}; - cudf::test::fixed_width_column_wrapper expected(h_expected, - h_expected + h_strings.size()); + auto results = cudf::strings::matches_re(strings_view, *prog); + cudf::test::fixed_width_column_wrapper expected({true, false, false}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); } { - auto results = cudf::strings::count_re(strings_view, *prog); - int32_t h_expected[] = {1, 0, 0}; - cudf::test::fixed_width_column_wrapper expected(h_expected, - h_expected + h_strings.size()); + auto results = cudf::strings::count_re(strings_view, *prog); + cudf::test::fixed_width_column_wrapper expected({1, 0, 0}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); } } diff --git a/cpp/tests/strings/datetime_tests.cpp b/cpp/tests/strings/datetime_tests.cpp index b3dc3010c67..da0db0fc056 100644 --- a/cpp/tests/strings/datetime_tests.cpp +++ b/cpp/tests/strings/datetime_tests.cpp @@ -19,7 +19,6 @@ #include #include -#include #include #include #include diff --git a/cpp/tests/strings/durations_tests.cpp b/cpp/tests/strings/durations_tests.cpp index 86189b29981..f2e31339035 100644 --- a/cpp/tests/strings/durations_tests.cpp +++ b/cpp/tests/strings/durations_tests.cpp @@ -24,6 +24,7 @@ #include +#include #include struct StringsDurationsTest : public cudf::test::BaseFixture {}; @@ -403,17 +404,17 @@ TEST_F(StringsDurationsTest, ParseSingle) "01", ""}; // error auto size = cudf::column_view(string_src).size(); - int32_t expected_v[]{0, 0, 1, -1, 23, -23, 59, -59, 99, -99, 0, 1, 0}; - auto it1 = - 
thrust::make_transform_iterator(expected_v, [](auto i) { return cudf::duration_s{i * 3600}; }); + std::array expected_v{0, 0, 1, -1, 23, -23, 59, -59, 99, -99, 0, 1, 0}; + auto it1 = thrust::make_transform_iterator(expected_v.data(), + [](auto i) { return cudf::duration_s{i * 3600}; }); cudf::test::fixed_width_column_wrapper expected_s1(it1, it1 + size); auto results = cudf::strings::to_durations(cudf::strings_column_view(string_src), cudf::data_type(cudf::type_to_id()), "%H"); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected_s1); - auto it2 = - thrust::make_transform_iterator(expected_v, [](auto i) { return cudf::duration_s{i * 60}; }); + auto it2 = thrust::make_transform_iterator(expected_v.data(), + [](auto i) { return cudf::duration_s{i * 60}; }); cudf::test::fixed_width_column_wrapper expected_s2(it2, it2 + size); results = cudf::strings::to_durations(cudf::strings_column_view(string_src), cudf::data_type(cudf::type_to_id()), @@ -421,14 +422,14 @@ TEST_F(StringsDurationsTest, ParseSingle) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected_s2); auto it3 = - thrust::make_transform_iterator(expected_v, [](auto i) { return cudf::duration_s{i}; }); + thrust::make_transform_iterator(expected_v.data(), [](auto i) { return cudf::duration_s{i}; }); cudf::test::fixed_width_column_wrapper expected_s3(it3, it3 + size); results = cudf::strings::to_durations(cudf::strings_column_view(string_src), cudf::data_type(cudf::type_to_id()), "%S"); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected_s3); - auto it4 = thrust::make_transform_iterator(expected_v, + auto it4 = thrust::make_transform_iterator(expected_v.data(), [](auto i) { return cudf::duration_ms{i * 60000}; }); cudf::test::fixed_width_column_wrapper expected_ms(it4, it4 + size); results = cudf::strings::to_durations(cudf::strings_column_view(string_src), @@ -454,21 +455,21 @@ TEST_F(StringsDurationsTest, ParseMultiple) "01:01:01", ""}; // error auto size = cudf::column_view(string_src).size(); - int32_t expected_v[]{0, - 0, - -1, - -(3600 + 60 + 1), - 23 * 3600 + 1, - -(23 * 3600 + 1), - 59 * 3600, - -59 * 3600, - 99 * 3600, - -99 * 3600, - 0, - 3661, - 0}; + std::array expected_v{0, + 0, + -1, + -(3600 + 60 + 1), + 23 * 3600 + 1, + -(23 * 3600 + 1), + 59 * 3600, + -59 * 3600, + 99 * 3600, + -99 * 3600, + 0, + 3661, + 0}; auto it1 = - thrust::make_transform_iterator(expected_v, [](auto i) { return cudf::duration_s{i}; }); + thrust::make_transform_iterator(expected_v.data(), [](auto i) { return cudf::duration_s{i}; }); cudf::test::fixed_width_column_wrapper expected_s1(it1, it1 + size); auto results = cudf::strings::to_durations(cudf::strings_column_view(string_src), cudf::data_type(cudf::type_to_id()), @@ -476,7 +477,7 @@ TEST_F(StringsDurationsTest, ParseMultiple) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected_s1); auto it2 = thrust::make_transform_iterator( - expected_v, [](auto i) { return cudf::duration_D{i / (24 * 3600)}; }); + expected_v.data(), [](auto i) { return cudf::duration_D{i / (24 * 3600)}; }); cudf::test::fixed_width_column_wrapper expected_D2(it2, it2 + size); results = cudf::strings::to_durations(cudf::strings_column_view(string_src), cudf::data_type(cudf::type_to_id()), @@ -508,28 +509,28 @@ TEST_F(StringsDurationsTest, ParseSubsecond) "01:01:01", ""}; // error auto size = cudf::column_view(string_src).size(); - int64_t expected_v[]{0, - -123456789L, - -1000666999L, - -((3600 + 60 + 1) * 1000000000L + 100000000L), - (23 * 3600 + 1) * 1000000000L + 80L, - -((23 * 3600 + 1) * 1000000000L + 123000000L), - (59 * 3600) * 
1000000000L, - -(59 * 3600) * 1000000000L, - (99 * 3600) * 1000000000L, - -(99 * 3600) * 1000000000L, - 0, - (3661) * 1000000000L, - 0}; + std::array expected_v{0, + -123456789L, + -1000666999L, + -((3600 + 60 + 1) * 1000000000L + 100000000L), + (23 * 3600 + 1) * 1000000000L + 80L, + -((23 * 3600 + 1) * 1000000000L + 123000000L), + (59 * 3600) * 1000000000L, + -(59 * 3600) * 1000000000L, + (99 * 3600) * 1000000000L, + -(99 * 3600) * 1000000000L, + 0, + (3661) * 1000000000L, + 0}; auto it1 = - thrust::make_transform_iterator(expected_v, [](auto i) { return cudf::duration_ns{i}; }); + thrust::make_transform_iterator(expected_v.data(), [](auto i) { return cudf::duration_ns{i}; }); cudf::test::fixed_width_column_wrapper expected_ns1(it1, it1 + size); auto results = cudf::strings::to_durations(cudf::strings_column_view(string_src), cudf::data_type(cudf::type_to_id()), "%H:%M:%S"); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected_ns1); - auto it2 = thrust::make_transform_iterator(expected_v, + auto it2 = thrust::make_transform_iterator(expected_v.data(), [](auto i) { return cudf::duration_ms{i / 1000000}; }); cudf::test::fixed_width_column_wrapper expected_ms2(it2, it2 + size); results = cudf::strings::to_durations(cudf::strings_column_view(string_src), @@ -559,25 +560,25 @@ TEST_F(StringsDurationsTest, ParseAMPM) "01:01:01", // error ""}; // error auto size = cudf::column_view(string_src).size(); - int32_t expected_v[]{0, - 0 + 12 * 3600, - 0, - 0 - 12 * 3600, - -1, - -1 - 12 * 3600, - -(3600 + 60 + 1), - -(3600 + 60 + 1) - 12 * 3600, - 11 * 3600 + 59 * 60 + 59, - 11 * 3600 + 59 * 60 + 59 + 12 * 3600, - -(11 * 3600 + 59 * 60 + 59), - -(11 * 3600 + 59 * 60 + 59 + 12 * 3600), - 0, - 0, - 0, - 0, - 0}; + std::array expected_v{0, + 0 + 12 * 3600, + 0, + 0 - 12 * 3600, + -1, + -1 - 12 * 3600, + -(3600 + 60 + 1), + -(3600 + 60 + 1) - 12 * 3600, + 11 * 3600 + 59 * 60 + 59, + 11 * 3600 + 59 * 60 + 59 + 12 * 3600, + -(11 * 3600 + 59 * 60 + 59), + -(11 * 3600 + 59 * 60 + 59 + 12 * 3600), + 0, + 0, + 0, + 0, + 0}; auto it1 = - thrust::make_transform_iterator(expected_v, [](auto i) { return cudf::duration_s{i}; }); + thrust::make_transform_iterator(expected_v.data(), [](auto i) { return cudf::duration_s{i}; }); cudf::test::fixed_width_column_wrapper expected_s1(it1, it1 + size); auto results = cudf::strings::to_durations(cudf::strings_column_view(string_src), cudf::data_type(cudf::type_to_id()), @@ -585,7 +586,7 @@ TEST_F(StringsDurationsTest, ParseAMPM) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected_s1); auto it2 = thrust::make_transform_iterator( - expected_v, [](auto i) { return cudf::duration_D{i / (24 * 3600)}; }); + expected_v.data(), [](auto i) { return cudf::duration_D{i / (24 * 3600)}; }); cudf::test::fixed_width_column_wrapper expected_D2(it2, it2 + size); results = cudf::strings::to_durations(cudf::strings_column_view(string_src), cudf::data_type(cudf::type_to_id()), @@ -616,20 +617,20 @@ TEST_F(StringsDurationsTest, ParseCompoundSpecifier) "01:01:01", // error ""}; // error auto size = cudf::column_view(string_src).size(); - int32_t expected_v[]{0, - 0 + 12 * 3600, - 1, - 1 + 12 * 3600, - (3600 + 60 + 1), - (3600 + 60 + 1) + 12 * 3600, - 11 * 3600 + 59 * 60 + 59, - 11 * 3600 + 59 * 60 + 59 + 12 * 3600, - 0, - 0, - 0, - 0}; + std::array expected_v{0, + 0 + 12 * 3600, + 1, + 1 + 12 * 3600, + (3600 + 60 + 1), + (3600 + 60 + 1) + 12 * 3600, + 11 * 3600 + 59 * 60 + 59, + 11 * 3600 + 59 * 60 + 59 + 12 * 3600, + 0, + 0, + 0, + 0}; auto it1 = - thrust::make_transform_iterator(expected_v, [](auto i) 
{ return cudf::duration_s{i}; }); + thrust::make_transform_iterator(expected_v.data(), [](auto i) { return cudf::duration_s{i}; }); cudf::test::fixed_width_column_wrapper expected_s1(it1, it1 + size); auto results = cudf::strings::to_durations(cudf::strings_column_view(string_src), cudf::data_type(cudf::type_to_id()), @@ -641,8 +642,8 @@ TEST_F(StringsDurationsTest, ParseCompoundSpecifier) "%OI:%OM:%OS %p"); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected_s1); - auto it2 = - thrust::make_transform_iterator(expected_v, [](auto i) { return cudf::duration_ms{i * 1000}; }); + auto it2 = thrust::make_transform_iterator(expected_v.data(), + [](auto i) { return cudf::duration_ms{i * 1000}; }); cudf::test::fixed_width_column_wrapper expected_s2(it2, it2 + size); results = cudf::strings::to_durations(cudf::strings_column_view(string_src), cudf::data_type(cudf::type_to_id()), diff --git a/cpp/tests/strings/extract_tests.cpp b/cpp/tests/strings/extract_tests.cpp index 1491da758d5..37b25d9b287 100644 --- a/cpp/tests/strings/extract_tests.cpp +++ b/cpp/tests/strings/extract_tests.cpp @@ -19,10 +19,8 @@ #include #include #include -#include #include -#include #include #include #include @@ -240,6 +238,21 @@ TEST_F(StringsExtractTests, SpecialNewLines) CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view().column(0), expected); } +TEST_F(StringsExtractTests, NestedQuantifier) +{ + auto input = cudf::test::strings_column_wrapper({"TEST12 1111 2222 3333 4444 5555", + "0000 AAAA 9999 BBBB 8888", + "7777 6666 4444 3333", + "12345 3333 4444 1111 ABCD"}); + auto sv = cudf::strings_column_view(input); + auto pattern = std::string(R"((\d{4}\s){4})"); + auto prog = cudf::strings::regex_program::create(pattern); + auto results = cudf::strings::extract(sv, *prog); + // fixed quantifier on capture group only honors the last group + auto expected = cudf::test::strings_column_wrapper({"4444 ", "", "", "1111 "}, {1, 0, 0, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view().column(0), expected); +} + TEST_F(StringsExtractTests, EmptyExtractTest) { std::vector h_strings{nullptr, "AAA", "AAA_A", "AAA_AAA_", "A__", ""}; @@ -275,8 +288,8 @@ TEST_F(StringsExtractTests, ExtractAllTest) auto pattern = std::string("(\\d+) (\\w+)"); - bool valids[] = {true, true, true, false, false, false, true}; - using LCW = cudf::test::lists_column_wrapper; + std::array valids{true, true, true, false, false, false, true}; + using LCW = cudf::test::lists_column_wrapper; LCW expected({LCW{"123", "banana", "7", "eleven"}, LCW{"41", "apple"}, LCW{"6", "péar", "0", "pair"}, @@ -284,7 +297,7 @@ TEST_F(StringsExtractTests, ExtractAllTest) LCW{}, LCW{}, LCW{"4", "paré"}}, - valids); + valids.data()); auto prog = cudf::strings::regex_program::create(pattern); auto results = cudf::strings::extract_all_record(sv, *prog); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected); diff --git a/cpp/tests/strings/factories_test.cu b/cpp/tests/strings/factories_test.cu index 90054e41d36..7eb429da7d9 100644 --- a/cpp/tests/strings/factories_test.cu +++ b/cpp/tests/strings/factories_test.cu @@ -37,6 +37,7 @@ #include #include #include +#include #include #include @@ -44,6 +45,8 @@ struct StringsFactoriesTest : public cudf::test::BaseFixture {}; +using string_pair = thrust::pair; + TEST_F(StringsFactoriesTest, CreateColumnFromPair) { std::vector h_test_strings{"the quick brown fox jumps over the lazy dog", @@ -61,7 +64,7 @@ TEST_F(StringsFactoriesTest, CreateColumnFromPair) cudf::size_type count = (cudf::size_type)h_test_strings.size(); thrust::host_vector 
h_buffer(memsize); rmm::device_uvector d_buffer(memsize, cudf::get_default_stream()); - thrust::host_vector> strings(count); + thrust::host_vector strings(count); thrust::host_vector h_offsets(count + 1); cudf::size_type offset = 0; cudf::size_type nulls = 0; @@ -69,12 +72,12 @@ TEST_F(StringsFactoriesTest, CreateColumnFromPair) for (cudf::size_type idx = 0; idx < count; ++idx) { char const* str = h_test_strings[idx]; if (!str) { - strings[idx] = thrust::pair{nullptr, 0}; + strings[idx] = string_pair{nullptr, 0}; nulls++; } else { auto length = (cudf::size_type)strlen(str); memcpy(h_buffer.data() + offset, str, length); - strings[idx] = thrust::pair{d_buffer.data() + offset, length}; + strings[idx] = string_pair{d_buffer.data() + offset, length}; offset += length; } h_offsets[idx + 1] = offset; @@ -201,14 +204,13 @@ TEST_F(StringsFactoriesTest, EmptyStringsColumn) cudf::make_strings_column(0, std::move(d_offsets), d_chars.release(), 0, d_nulls.release()); cudf::test::expect_column_empty(results->view()); - rmm::device_uvector> d_strings{ - 0, cudf::get_default_stream()}; + rmm::device_uvector d_strings{0, cudf::get_default_stream()}; results = cudf::make_strings_column(d_strings); cudf::test::expect_column_empty(results->view()); } namespace { -using string_pair = thrust::pair; + struct string_view_to_pair { __device__ string_pair operator()(thrust::pair const& p) { @@ -234,3 +236,198 @@ TEST_F(StringsFactoriesTest, StringPairWithNullsAndEmpty) auto result = cudf::make_strings_column(pairs); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), data); } + +struct StringsBatchConstructionTest : public cudf::test::BaseFixture {}; + +TEST_F(StringsBatchConstructionTest, EmptyColumns) +{ + auto constexpr num_columns = 10; + auto const stream = cudf::get_default_stream(); + + auto const d_string_pairs = rmm::device_uvector{0, stream}; + auto const input = std::vector>( + num_columns, {d_string_pairs.data(), d_string_pairs.size()}); + auto const output = cudf::make_strings_column_batch(input, stream); + + auto const expected_col = cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}); + for (auto const& col : output) { + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_col->view(), col->view()); + } +} + +TEST_F(StringsBatchConstructionTest, AllNullsColumns) +{ + auto constexpr num_columns = 10; + auto constexpr num_rows = 100; + auto const stream = cudf::get_default_stream(); + + auto d_string_pairs = rmm::device_uvector{num_rows, stream}; + thrust::uninitialized_fill_n(rmm::exec_policy(stream), + d_string_pairs.data(), + d_string_pairs.size(), + string_pair{nullptr, 0}); + auto const input = std::vector>( + num_columns, {d_string_pairs.data(), d_string_pairs.size()}); + auto const output = cudf::make_strings_column_batch(input, stream); + + auto const expected_col = cudf::make_strings_column(d_string_pairs); + for (auto const& col : output) { + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_col->view(), col->view()); + } +} + +namespace { + +struct index_to_pair { + int const num_test_strings; + char const* d_chars; + std::size_t const* d_offsets; + int const* is_null; + + __device__ string_pair operator()(cudf::size_type idx) + { + auto const data_idx = idx % num_test_strings; + return {is_null[data_idx] ? 
nullptr : d_chars + d_offsets[data_idx], + static_cast(d_offsets[data_idx + 1] - d_offsets[data_idx])}; + } +}; + +} // namespace + +TEST_F(StringsBatchConstructionTest, CreateColumnsFromPairs) +{ + auto constexpr num_columns = 10; + auto constexpr max_num_rows = 1000; + auto const stream = cudf::get_default_stream(); + auto const mr = cudf::get_current_device_resource_ref(); + + std::vector h_test_strings{"the quick brown fox jumps over the lazy dog", + "the fat cat lays next to the other accénted cat", + "a slow moving turtlé cannot catch the bird", + "which can be composéd together to form a more complete", + "thé result does not include the value in the sum in", + "", + nullptr, + "absent stop words"}; + auto const num_test_strings = static_cast(h_test_strings.size()); + + std::vector h_offsets(num_test_strings + 1, 0); + for (int i = 0; i < num_test_strings; ++i) { + h_offsets[i + 1] = h_offsets[i] + (h_test_strings[i] ? strlen(h_test_strings[i]) : 0); + } + + std::vector h_chars(h_offsets.back()); + std::vector is_null(num_test_strings, 0); + for (int i = 0; i < num_test_strings; ++i) { + if (h_test_strings[i]) { + memcpy(h_chars.data() + h_offsets[i], h_test_strings[i], strlen(h_test_strings[i])); + } else { + is_null[i] = 1; + } + } + + auto const d_offsets = cudf::detail::make_device_uvector_async(h_offsets, stream, mr); + auto const d_chars = cudf::detail::make_device_uvector_async(h_chars, stream, mr); + auto const d_is_null = cudf::detail::make_device_uvector_async(is_null, stream, mr); + + std::vector> d_input; + std::vector> input; + d_input.reserve(num_columns); + input.reserve(num_columns); + + for (int col_idx = 0; col_idx < num_columns; ++col_idx) { + // Column sizes increase from `max_num_rows / num_columns` to `max_num_rows`. + auto const num_rows = + static_cast(static_cast(col_idx + 1) / num_columns * max_num_rows); + + auto string_pairs = rmm::device_uvector(num_rows, stream); + thrust::tabulate( + rmm::exec_policy_nosync(stream), + string_pairs.begin(), + string_pairs.end(), + index_to_pair{num_test_strings, d_chars.begin(), d_offsets.begin(), d_is_null.begin()}); + + d_input.emplace_back(std::move(string_pairs)); + input.emplace_back(d_input.back()); + } + + auto const output = cudf::make_strings_column_batch(input, stream, mr); + + for (std::size_t i = 0; i < num_columns; ++i) { + auto const string_pairs = input[i]; + auto const expected = cudf::make_strings_column(string_pairs, stream, mr); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected->view(), output[i]->view()); + } +} + +// The test below requires a huge amount of memory, thus it is disabled by default. +TEST_F(StringsBatchConstructionTest, DISABLED_CreateLongStringsColumns) +{ + auto constexpr num_columns = 2; + auto const stream = cudf::get_default_stream(); + auto const mr = cudf::get_current_device_resource_ref(); + + std::vector h_test_strings{"the quick brown fox jumps over the lazy dog", + "the fat cat lays next to the other accénted cat", + "a slow moving turtlé cannot catch the bird", + "which can be composéd together to form a more complete", + "thé result does not include the value in the sum in", + "", + nullptr, + "absent stop words"}; + auto const num_test_strings = static_cast(h_test_strings.size()); + + std::vector h_offsets(num_test_strings + 1, 0); + for (int i = 0; i < num_test_strings; ++i) { + h_offsets[i + 1] = h_offsets[i] + (h_test_strings[i] ?
strlen(h_test_strings[i]) : 0); + } + + std::vector h_chars(h_offsets.back()); + std::vector is_null(num_test_strings, 0); + for (int i = 0; i < num_test_strings; ++i) { + if (h_test_strings[i]) { + memcpy(h_chars.data() + h_offsets[i], h_test_strings[i], strlen(h_test_strings[i])); + } else { + is_null[i] = 1; + } + } + + auto const d_offsets = cudf::detail::make_device_uvector_async(h_offsets, stream, mr); + auto const d_chars = cudf::detail::make_device_uvector_async(h_chars, stream, mr); + auto const d_is_null = cudf::detail::make_device_uvector_async(is_null, stream, mr); + + // If we create a column by repeating h_test_strings by `max_cycles` times, + // its total size will be around (1.5*INT_MAX) bytes. + auto const max_cycles = static_cast(static_cast(std::numeric_limits::max()) * + 1.5 / h_offsets.back()); + + std::vector> d_input; + std::vector> input; + d_input.reserve(num_columns); + input.reserve(num_columns); + + for (int col_idx = 0; col_idx < num_columns; ++col_idx) { + // Column sizes increase from `max_cycles * num_test_strings / num_columns` to + // `max_cycles * num_test_strings`. + auto const num_rows = static_cast(static_cast(col_idx + 1) / num_columns * + max_cycles * num_test_strings); + + auto string_pairs = rmm::device_uvector(num_rows, stream); + thrust::tabulate( + rmm::exec_policy_nosync(stream), + string_pairs.begin(), + string_pairs.end(), + index_to_pair{num_test_strings, d_chars.begin(), d_offsets.begin(), d_is_null.begin()}); + + d_input.emplace_back(std::move(string_pairs)); + input.emplace_back(d_input.back()); + } + + auto const output = cudf::make_strings_column_batch(input, stream, mr); + + for (std::size_t i = 0; i < num_columns; ++i) { + auto const string_pairs = input[i]; + auto const expected = cudf::make_strings_column(string_pairs, stream, mr); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected->view(), output[i]->view()); + } +} diff --git a/cpp/tests/strings/find_multiple_tests.cpp b/cpp/tests/strings/find_multiple_tests.cpp index 41a5940c880..3c8483b153d 100644 --- a/cpp/tests/strings/find_multiple_tests.cpp +++ b/cpp/tests/strings/find_multiple_tests.cpp @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -75,8 +76,158 @@ TEST_F(StringsFindMultipleTest, ErrorTest) auto const zero_size_strings_column = cudf::make_empty_column(cudf::type_id::STRING)->view(); auto empty_view = cudf::strings_column_view(zero_size_strings_column); // targets must have at least one string - EXPECT_THROW(cudf::strings::find_multiple(strings_view, empty_view), cudf::logic_error); + EXPECT_THROW(cudf::strings::find_multiple(strings_view, empty_view), std::invalid_argument); + EXPECT_THROW(cudf::strings::contains_multiple(strings_view, empty_view), std::invalid_argument); // targets cannot have nulls - EXPECT_THROW(cudf::strings::find_multiple(strings_view, strings_view), cudf::logic_error); + EXPECT_THROW(cudf::strings::find_multiple(strings_view, strings_view), std::invalid_argument); + EXPECT_THROW(cudf::strings::contains_multiple(strings_view, strings_view), std::invalid_argument); +} + +TEST_F(StringsFindMultipleTest, MultiContains) +{ + constexpr int num_rows = 1024 + 1; + // replicate the following 9 rows: + std::vector s = { + "Héllo, there world and goodbye", + "quick brown fox jumped over the lazy brown dog; the fat cats jump in place without moving", + "the following code snippet demonstrates how to use search for values in an ordered range", + "it returns the last position where value could be inserted without violating the ordering", +
"algorithms execution is parallelized as determined by an execution policy. t", + "he this is a continuation of previous row to make sure string boundaries are honored", + "abcdefghijklmnopqrstuvwxyz 0123456789 ABCDEFGHIJKLMNOPQRSTUVWXYZ !@#$%^&*()~", + "", + ""}; + + // replicate strings + auto string_itr = + cudf::detail::make_counting_transform_iterator(0, [&](auto i) { return s[i % s.size()]; }); + + // nulls: 8, 8 + 1 * 9, 8 + 2 * 9 ...... + auto string_v = cudf::detail::make_counting_transform_iterator( + 0, [&](auto i) { return (i + 1) % s.size() != 0; }); + + auto const strings = + cudf::test::strings_column_wrapper(string_itr, string_itr + num_rows, string_v); + auto strings_view = cudf::strings_column_view(strings); + std::vector match_targets({" the ", "a", "", "é"}); + cudf::test::strings_column_wrapper multi_targets_column(match_targets.begin(), + match_targets.end()); + auto results = + cudf::strings::contains_multiple(strings_view, cudf::strings_column_view(multi_targets_column)); + + std::vector ret_0 = {0, 1, 0, 1, 0, 0, 0, 0, 0}; + std::vector ret_1 = {1, 1, 1, 1, 1, 1, 1, 0, 0}; + std::vector ret_2 = {1, 1, 1, 1, 1, 1, 1, 1, 0}; + std::vector ret_3 = {1, 0, 0, 0, 0, 0, 0, 0, 0}; + + auto make_bool_col_fn = [&string_v, &num_rows](std::vector bools) { + auto iter = cudf::detail::make_counting_transform_iterator( + 0, [&](auto i) { return bools[i % bools.size()]; }); + return cudf::test::fixed_width_column_wrapper(iter, iter + num_rows, string_v); + }; + + auto expected_0 = make_bool_col_fn(ret_0); + auto expected_1 = make_bool_col_fn(ret_1); + auto expected_2 = make_bool_col_fn(ret_2); + auto expected_3 = make_bool_col_fn(ret_3); + + auto expected = cudf::table_view({expected_0, expected_1, expected_2, expected_3}); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(results->view(), expected); +} + +TEST_F(StringsFindMultipleTest, MultiContainsMoreTargets) +{ + auto const strings = cudf::test::strings_column_wrapper{ + "quick brown fox jumped over the lazy brown dog; the fat cats jump in place without moving " + "quick brown fox jumped", + "the following code snippet demonstrates how to use search for values in an ordered rangethe " + "following code snippet", + "thé it returns the last position where value could be inserted without violating ordering thé " + "it returns the last position"}; + auto strings_view = cudf::strings_column_view(strings); + std::vector targets({"lazy brown", "non-exist", ""}); + + std::vector> expects; + expects.push_back(cudf::test::fixed_width_column_wrapper({1, 0, 0})); + expects.push_back(cudf::test::fixed_width_column_wrapper({0, 0, 0})); + expects.push_back(cudf::test::fixed_width_column_wrapper({1, 1, 1})); + + std::vector match_targets; + int max_num_targets = 50; + + for (int num_targets = 1; num_targets < max_num_targets; num_targets++) { + match_targets.clear(); + for (int i = 0; i < num_targets; i++) { + match_targets.push_back(targets[i % targets.size()]); + } + + cudf::test::strings_column_wrapper multi_targets_column(match_targets.begin(), + match_targets.end()); + auto results = cudf::strings::contains_multiple( + strings_view, cudf::strings_column_view(multi_targets_column)); + EXPECT_EQ(results->num_columns(), num_targets); + for (int i = 0; i < num_targets; i++) { + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->get_column(i), expects[i % expects.size()]); + } + } +} + +TEST_F(StringsFindMultipleTest, MultiContainsLongStrings) +{ + constexpr int num_rows = 1024 + 1; + // replicate the following 7 rows: + std::vector s = { + "quick brown fox 
jumped over the lazy brown dog; the fat cats jump in place without moving " + "quick brown fox jumped", + "the following code snippet demonstrates how to use search for values in an ordered rangethe " + "following code snippet", + "thé it returns the last position where value could be inserted without violating ordering thé " + "it returns the last position", + "algorithms execution is parallelized as determined by an execution policy. t algorithms " + "execution is parallelized as ", + "he this is a continuation of previous row to make sure string boundaries are honored he this " + "is a continuation of previous row", + "abcdefghijklmnopqrstuvwxyz 0123456789 ABCDEFGHIJKLMNOPQRSTUVWXYZ " + "!@#$%^&*()~abcdefghijklmnopqrstuvwxyz 0123456789 ABCDEFGHIJKL", + ""}; + + // replicate strings + auto string_itr = + cudf::detail::make_counting_transform_iterator(0, [&](auto i) { return s[i % s.size()]; }); + + // nulls: 6, 6 + 1 * 7, 6 + 2 * 7 ...... + auto string_v = cudf::detail::make_counting_transform_iterator( + 0, [&](auto i) { return (i + 1) % s.size() != 0; }); + + auto const strings = + cudf::test::strings_column_wrapper(string_itr, string_itr + num_rows, string_v); + + auto sv = cudf::strings_column_view(strings); + auto targets = cudf::test::strings_column_wrapper({" the ", "search", "", "string", "ox", "é "}); + auto results = cudf::strings::contains_multiple(sv, cudf::strings_column_view(targets)); + + std::vector ret_0 = {1, 0, 1, 0, 0, 0, 0}; + std::vector ret_1 = {0, 1, 0, 0, 0, 0, 0}; + std::vector ret_2 = {1, 1, 1, 1, 1, 1, 0}; + std::vector ret_3 = {0, 0, 0, 0, 1, 0, 0}; + std::vector ret_4 = {1, 0, 0, 0, 0, 0, 0}; + std::vector ret_5 = {0, 0, 1, 0, 0, 0, 0}; + + auto make_bool_col_fn = [&string_v, &num_rows](std::vector bools) { + auto iter = cudf::detail::make_counting_transform_iterator( + 0, [&](auto i) { return bools[i % bools.size()]; }); + return cudf::test::fixed_width_column_wrapper(iter, iter + num_rows, string_v); + }; + + auto expected_0 = make_bool_col_fn(ret_0); + auto expected_1 = make_bool_col_fn(ret_1); + auto expected_2 = make_bool_col_fn(ret_2); + auto expected_3 = make_bool_col_fn(ret_3); + auto expected_4 = make_bool_col_fn(ret_4); + auto expected_5 = make_bool_col_fn(ret_5); + + auto expected = + cudf::table_view({expected_0, expected_1, expected_2, expected_3, expected_4, expected_5}); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(results->view(), expected); } diff --git a/cpp/tests/strings/find_tests.cpp b/cpp/tests/strings/find_tests.cpp index 2da95ba5c27..a3066c40650 100644 --- a/cpp/tests/strings/find_tests.cpp +++ b/cpp/tests/strings/find_tests.cpp @@ -17,16 +17,14 @@ #include #include #include +#include -#include #include #include #include #include #include -#include - #include struct StringsFindTest : public cudf::test::BaseFixture {}; diff --git a/cpp/tests/strings/findall_tests.cpp b/cpp/tests/strings/findall_tests.cpp index 47606b9b3ed..7eb4b32d078 100644 --- a/cpp/tests/strings/findall_tests.cpp +++ b/cpp/tests/strings/findall_tests.cpp @@ -19,7 +19,7 @@ #include #include #include -#include +#include #include #include @@ -27,16 +27,14 @@ #include -#include - struct StringsFindallTests : public cudf::test::BaseFixture {}; TEST_F(StringsFindallTests, FindallTest) { - bool valids[] = {true, true, true, true, true, false, true, true}; + std::array valids{true, true, true, true, true, false, true, true}; cudf::test::strings_column_wrapper input( {"3-A", "4-May 5-Day 6-Hay", "12-Dec-2021-Jan", "Feb-March", "4 ABC", "", "", "25-9000-Hal"}, - valids); + 
valids.data()); auto sv = cudf::strings_column_view(input); auto pattern = std::string("(\\d+)-(\\w+)"); @@ -50,7 +48,7 @@ TEST_F(StringsFindallTests, FindallTest) LCW{}, LCW{}, LCW{"25-9000"}}, - valids); + valids.data()); auto prog = cudf::strings::regex_program::create(pattern); auto results = cudf::strings::findall(sv, *prog); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected); @@ -148,3 +146,53 @@ TEST_F(StringsFindallTests, LargeRegex) LCW expected({LCW{large_regex.c_str()}, LCW{}, LCW{}}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected); } + +TEST_F(StringsFindallTests, FindTest) +{ + auto const valids = cudf::test::iterators::null_at(5); + cudf::test::strings_column_wrapper input( + {"3A", "May4", "Jan2021", "March", "A9BC", "", "", "abcdef ghijklm 12345"}, valids); + auto sv = cudf::strings_column_view(input); + + auto pattern = std::string("\\d+"); + + auto prog = cudf::strings::regex_program::create(pattern); + auto results = cudf::strings::find_re(sv, *prog); + auto expected = + cudf::test::fixed_width_column_wrapper<cudf::size_type>({0, 3, 3, -1, 1, 0, -1, 15}, valids); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected); +} + +TEST_F(StringsFindallTests, NoMatches) +{ + cudf::test::strings_column_wrapper input({"abc\nfff\nabc", "fff\nabc\nlll", "abc", "", "abc\n"}); + auto sv = cudf::strings_column_view(input); + + auto pattern = std::string("(^zzz$)"); + using LCW = cudf::test::lists_column_wrapper<cudf::string_view>; + LCW expected({LCW{}, LCW{}, LCW{}, LCW{}, LCW{}}); + auto prog = cudf::strings::regex_program::create(pattern); + auto results = cudf::strings::findall(sv, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected); +} + +TEST_F(StringsFindallTests, EmptyTest) +{ + std::string pattern = R"(\w+)"; + + auto prog = cudf::strings::regex_program::create(pattern); + + cudf::test::strings_column_wrapper input; + auto sv = cudf::strings_column_view(input); + { + auto results = cudf::strings::findall(sv, *prog); + using LCW = cudf::test::lists_column_wrapper<cudf::string_view>; + LCW expected; + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected); + } + { + auto results = cudf::strings::find_re(sv, *prog); + auto expected = cudf::test::fixed_width_column_wrapper<cudf::size_type>{}; + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected); + } +} diff --git a/cpp/tests/strings/fixed_point_tests.cpp b/cpp/tests/strings/fixed_point_tests.cpp index 79054551498..b788c05c152 100644 --- a/cpp/tests/strings/fixed_point_tests.cpp +++ b/cpp/tests/strings/fixed_point_tests.cpp @@ -23,8 +23,6 @@ #include #include -#include - struct StringsConvertTest : public cudf::test::BaseFixture {}; template diff --git a/cpp/tests/strings/integers_tests.cpp b/cpp/tests/strings/integers_tests.cpp index ce5f68de3c9..c08effdb969 100644 --- a/cpp/tests/strings/integers_tests.cpp +++ b/cpp/tests/strings/integers_tests.cpp @@ -24,12 +24,10 @@ #include #include -#include -#include - #include #include +#include #include #include @@ -425,7 +423,7 @@ TYPED_TEST(StringsIntegerConvertTest, IntegerToHex) if (v == 0) { return std::string("00"); } // special handling for single-byte types if constexpr (std::is_same_v<TypeParam, int8_t> || std::is_same_v<TypeParam, uint8_t>) { - char const hex_digits[16] = { + std::array const hex_digits = { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'}; std::string str; str += hex_digits[(v & 0xF0) >> 4]; diff --git a/cpp/tests/strings/replace_regex_tests.cpp b/cpp/tests/strings/replace_regex_tests.cpp index 9847d8d6bb5..abc12b00a81 100644 --- 
a/cpp/tests/strings/replace_regex_tests.cpp +++ b/cpp/tests/strings/replace_regex_tests.cpp @@ -200,6 +200,34 @@ TEST_F(StringsReplaceRegexTest, ZeroLengthMatch) CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); } +TEST_F(StringsReplaceRegexTest, ZeroRangeQuantifier) +{ + auto input = cudf::test::strings_column_wrapper({"a", "", "123", "XYAZ", "abc", "zéyab"}); + auto sv = cudf::strings_column_view(input); + + auto pattern = std::string("A{0,5}"); + auto prog = cudf::strings::regex_program::create(pattern); + auto repl = cudf::string_scalar("_"); + auto expected = cudf::test::strings_column_wrapper( + {"_a_", "_", "_1_2_3_", "_X_Y__Z_", "_a_b_c_", "_z_é_y_a_b_"}); + auto results = cudf::strings::replace_re(sv, *prog, repl); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + pattern = std::string("[a0-9]{0,2}"); + prog = cudf::strings::regex_program::create(pattern); + expected = + cudf::test::strings_column_wrapper({"__", "_", "___", "_X_Y_A_Z_", "__b_c_", "_z_é_y__b_"}); + results = cudf::strings::replace_re(sv, *prog, repl); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + pattern = std::string("(?:ab){0,3}"); + prog = cudf::strings::regex_program::create(pattern); + expected = + cudf::test::strings_column_wrapper({"_a_", "_", "_1_2_3_", "_X_Y_A_Z_", "__c_", "_z_é_y__"}); + results = cudf::strings::replace_re(sv, *prog, repl); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); +} + TEST_F(StringsReplaceRegexTest, Multiline) { auto const multiline = cudf::strings::regex_flags::MULTILINE; diff --git a/cpp/tests/structs/structs_column_tests.cpp b/cpp/tests/structs/structs_column_tests.cpp index f0010fc1ed9..a34ff25cb69 100644 --- a/cpp/tests/structs/structs_column_tests.cpp +++ b/cpp/tests/structs/structs_column_tests.cpp @@ -17,28 +17,18 @@ #include #include #include -#include #include #include #include -#include #include -#include #include -#include -#include -#include -#include #include #include #include -#include #include -#include -#include #include #include @@ -635,9 +625,8 @@ TEST_F(StructColumnWrapperTest, TestStructsColumnWithEmptyChild) auto mask_vec = std::vector{true, false, false}; auto [null_mask, null_count] = cudf::test::detail::make_null_mask(mask_vec.begin(), mask_vec.end()); - auto structs_col = - cudf::make_structs_column(num_rows, std::move(cols), null_count, std::move(null_mask)); - EXPECT_NO_THROW(structs_col->view()); + EXPECT_NO_THROW(auto structs_col = cudf::make_structs_column( + num_rows, std::move(cols), null_count, std::move(null_mask))); } CUDF_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/structs/utilities_tests.cpp b/cpp/tests/structs/utilities_tests.cpp index c33eedf9bd9..c0df2f01a63 100644 --- a/cpp/tests/structs/utilities_tests.cpp +++ b/cpp/tests/structs/utilities_tests.cpp @@ -14,21 +14,15 @@ * limitations under the License. 
*/ -#include "cudf_test/default_stream.hpp" - #include #include #include -#include #include #include #include -#include -#include #include #include -#include #include #include diff --git a/cpp/tests/table/row_operators_tests.cpp b/cpp/tests/table/row_operators_tests.cpp index 5fa63c47cf0..216c4d7b6bb 100644 --- a/cpp/tests/table/row_operators_tests.cpp +++ b/cpp/tests/table/row_operators_tests.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include diff --git a/cpp/tests/table/table_tests.cpp b/cpp/tests/table/table_tests.cpp index 1637ba7d7d3..363f1a0ba5d 100644 --- a/cpp/tests/table/table_tests.cpp +++ b/cpp/tests/table/table_tests.cpp @@ -17,17 +17,14 @@ #include #include #include -#include #include #include #include -#include #include #include #include -#include template using column_wrapper = cudf::test::fixed_width_column_wrapper; diff --git a/cpp/tests/text/minhash_tests.cpp b/cpp/tests/text/minhash_tests.cpp index e23f3f6e7d8..042ac44621e 100644 --- a/cpp/tests/text/minhash_tests.cpp +++ b/cpp/tests/text/minhash_tests.cpp @@ -21,166 +21,176 @@ #include #include -#include #include -#include -#include - #include struct MinHashTest : public cudf::test::BaseFixture {}; -TEST_F(MinHashTest, Basic) +TEST_F(MinHashTest, Permuted) { - auto validity = cudf::test::iterators::null_at(1); auto input = cudf::test::strings_column_wrapper({"doc 1", - "", "this is doc 2", - "", "doc 3", "d", - "The quick brown fox jumpéd over the lazy brown dog."}, - validity); + "The quick brown fox jumpéd over the lazy brown dog.", + "line six", + "line seven", + "line eight", + "line nine", + "line ten"}); auto view = cudf::strings_column_view(input); - auto results = nvtext::minhash(view); + auto first = thrust::counting_iterator(10); + auto params = cudf::test::fixed_width_column_wrapper(first, first + 3); + auto results = + nvtext::minhash_permuted(view, 0, cudf::column_view(params), cudf::column_view(params), 4); - auto expected = cudf::test::fixed_width_column_wrapper( - {1207251914u, 0u, 21141582u, 0u, 1207251914u, 655955059u, 86520422u}, validity); + using LCW32 = cudf::test::lists_column_wrapper; + // clang-format off + LCW32 expected({ + LCW32{1392101586u, 394869177u, 811528444u}, + LCW32{ 211415830u, 187088503u, 130291444u}, + LCW32{2098117052u, 394869177u, 799753544u}, + LCW32{2264583304u, 2920538364u, 3576493424u}, + LCW32{ 253327882u, 41747273u, 302030804u}, + LCW32{2109809594u, 1017470651u, 326988172u}, + LCW32{1303819864u, 850676747u, 147107852u}, + LCW32{ 736021564u, 720812292u, 1405158760u}, + LCW32{ 902780242u, 134064807u, 1613944636u}, + LCW32{ 547084870u, 1748895564u, 656501844u} + }); + // clang-format on CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - auto results64 = nvtext::minhash64(view); - auto expected64 = cudf::test::fixed_width_column_wrapper({774489391575805754ul, - 0ul, - 3232308021562742685ul, - 0ul, - 13145552576991307582ul, - 14660046701545912182ul, - 398062025280761388ul}, - validity); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results64, expected64); -} + auto params64 = cudf::test::fixed_width_column_wrapper(first, first + 3); + auto results64 = nvtext::minhash64_permuted( + view, 0, cudf::column_view(params64), cudf::column_view(params64), 4); -TEST_F(MinHashTest, LengthEqualsWidth) -{ - auto input = cudf::test::strings_column_wrapper({"abcdé", "fghjk", "lmnop", "qrstu", "vwxyz"}); - auto view = cudf::strings_column_view(input); - auto results = nvtext::minhash(view, 0, 5); - auto expected = cudf::test::fixed_width_column_wrapper( - {3825281041u, 
2728681928u, 1984332911u, 3965004915u, 192452857u}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + using LCW64 = cudf::test::lists_column_wrapper; + // clang-format off + LCW64 expected64({ + LCW64{ 827364888116975697ul, 1601854279692781452ul, 70500662054893256ul}, + LCW64{ 18312093741021833ul, 133793446674258329ul, 21974512489226198ul}, + LCW64{ 22474244732520567ul, 1638811775655358395ul, 949306297364502264ul}, + LCW64{1332357434996402861ul, 2157346081260151330ul, 676491718310205848ul}, + LCW64{ 65816830624808020ul, 43323600380520789ul, 63511816333816345ul}, + LCW64{ 629657184954525200ul, 49741036507643002ul, 97466271004074331ul}, + LCW64{ 301611977846331113ul, 101188874709594830ul, 97466271004074331ul}, + LCW64{ 121498891461700668ul, 171065800427907402ul, 97466271004074331ul}, + LCW64{ 54617739511834072ul, 231454301607238929ul, 97466271004074331ul}, + LCW64{ 576418665851990314ul, 231454301607238929ul, 97466271004074331ul} + }); + // clang-format on + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results64, expected64); } -TEST_F(MinHashTest, MultiSeed) +TEST_F(MinHashTest, PermutedWide) { - auto input = - cudf::test::strings_column_wrapper({"doc 1", - "this is doc 2", - "doc 3", - "d", - "The quick brown fox jumpéd over the lazy brown dog."}); - - auto view = cudf::strings_column_view(input); + std::string const small(2 << 10, 'x'); // below wide_string_threshold + std::string const wide(2 << 19, 'y'); // above wide_string_threshold + auto input = cudf::test::strings_column_wrapper({small, wide}); + auto view = cudf::strings_column_view(input); - auto seeds = cudf::test::fixed_width_column_wrapper({0, 1, 2}); - auto results = nvtext::minhash(view, cudf::column_view(seeds)); + auto first = thrust::counting_iterator(20); + auto params = cudf::test::fixed_width_column_wrapper(first, first + 3); + auto results = + nvtext::minhash_permuted(view, 0, cudf::column_view(params), cudf::column_view(params), 4); - using LCW = cudf::test::lists_column_wrapper; + using LCW32 = cudf::test::lists_column_wrapper; // clang-format off - LCW expected({LCW{1207251914u, 1677652962u, 1061355987u}, - LCW{ 21141582u, 580916568u, 1258052021u}, - LCW{1207251914u, 943567174u, 1109272887u}, - LCW{ 655955059u, 488346356u, 2394664816u}, - LCW{ 86520422u, 236622901u, 102546228u}}); + LCW32 expected({ + LCW32{1731998032u, 315359380u, 3193688024u}, + LCW32{1293098788u, 2860992281u, 133918478u} + }); // clang-format on CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - auto seeds64 = cudf::test::fixed_width_column_wrapper({0, 1, 2}); - auto results64 = nvtext::minhash64(view, cudf::column_view(seeds64)); + auto params64 = cudf::test::fixed_width_column_wrapper(first, first + 3); + auto results64 = nvtext::minhash64_permuted( + view, 0, cudf::column_view(params64), cudf::column_view(params64), 4); using LCW64 = cudf::test::lists_column_wrapper; // clang-format off - LCW64 expected64({LCW64{ 774489391575805754ul, 10435654231793485448ul, 1188598072697676120ul}, - LCW64{ 3232308021562742685ul, 4445611509348165860ul, 1188598072697676120ul}, - LCW64{13145552576991307582ul, 6846192680998069919ul, 1188598072697676120ul}, - LCW64{14660046701545912182ul, 17106501326045553694ul, 17713478494106035784ul}, - LCW64{ 398062025280761388ul, 377720198157450084ul, 984941365662009329ul}}); + LCW64 expected64({ + LCW64{1818322427062143853ul, 641024893347719371ul, 1769570368846988848ul}, + LCW64{1389920339306667795ul, 421787002125838902ul, 1759496674158703968ul} + }); // clang-format on CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results64, expected64); } 
-TEST_F(MinHashTest, MultiSeedWithNullInputRow) +TEST_F(MinHashTest, PermutedManyParameters) { - auto validity = cudf::test::iterators::null_at(1); - auto input = cudf::test::strings_column_wrapper({"abcdéfgh", "", "", "stuvwxyz"}, validity); - auto view = cudf::strings_column_view(input); + std::string const small(2 << 10, 'x'); + std::string const wide(2 << 19, 'y'); + auto input = cudf::test::strings_column_wrapper({small, wide}); + auto view = cudf::strings_column_view(input); - auto seeds = cudf::test::fixed_width_column_wrapper({1, 2}); - auto results = nvtext::minhash(view, cudf::column_view(seeds)); + auto first = thrust::counting_iterator(20); + // more than params_per_thread + auto params = cudf::test::fixed_width_column_wrapper(first, first + 31); + auto results = + nvtext::minhash_permuted(view, 0, cudf::column_view(params), cudf::column_view(params), 4); - using LCW = cudf::test::lists_column_wrapper; - LCW expected({LCW{484984072u, 1074168784u}, LCW{}, LCW{0u, 0u}, LCW{571652169u, 173528385u}}, - validity); + using LCW32 = cudf::test::lists_column_wrapper; + // clang-format off + LCW32 expected({ + LCW32{1731998032u, 315359380u, 3193688024u, 1777049372u, 360410720u, 3238739364u, 1822100712u, 405462060u, + 3283790704u, 1867152052u, 450513400u, 3328842044u, 1912203392u, 495564740u, 3373893384u, 1957254732u, + 540616080u, 3418944724u, 2002306072u, 585667420u, 3463996064u, 2047357412u, 630718760u, 3509047404u, + 2092408752u, 675770100u, 3554098744u, 2137460092u, 720821440u, 3599150084u, 2182511432u}, + LCW32{1293098788u, 2860992281u, 133918478u, 1701811971u, 3269705464u, 542631661u, 2110525154u, 3678418647u, + 951344844u, 2519238337u, 4087131830u, 1360058027u, 2927951520u, 200877717u, 1768771210u, 3336664703u, + 609590900u, 2177484393u, 3745377886u, 1018304083u, 2586197576u, 4154091069u, 1427017266u, 2994910759u, + 267836956u, 1835730449u, 3403623942u, 676550139u, 2244443632u, 3812337125u, 1085263322u} + }); + // clang-format on CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - auto seeds64 = cudf::test::fixed_width_column_wrapper({11, 22}); - auto results64 = nvtext::minhash64(view, cudf::column_view(seeds64)); + // more than params_per_thread + auto params64 = cudf::test::fixed_width_column_wrapper(first, first + 31); + auto results64 = nvtext::minhash64_permuted( + view, 0, cudf::column_view(params64), cudf::column_view(params64), 4); using LCW64 = cudf::test::lists_column_wrapper; - LCW64 expected64({LCW64{2597399324547032480ul, 4461410998582111052ul}, - LCW64{}, - LCW64{0ul, 0ul}, - LCW64{2717781266371273264ul, 6977325820868387259ul}}, - validity); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results64, expected64); -} - -TEST_F(MinHashTest, WordsMinHash) -{ - using LCWS = cudf::test::lists_column_wrapper; - auto validity = cudf::test::iterators::null_at(1); - - LCWS input( - {LCWS({"hello", "abcdéfgh"}), - LCWS{}, - LCWS({"rapids", "moré", "test", "text"}), - LCWS({"The", "quick", "brown", "fox", "jumpéd", "over", "the", "lazy", "brown", "dog"})}, - validity); - - auto view = cudf::lists_column_view(input); - - auto seeds = cudf::test::fixed_width_column_wrapper({1, 2}); - auto results = nvtext::word_minhash(view, cudf::column_view(seeds)); - using LCW32 = cudf::test::lists_column_wrapper; - LCW32 expected({LCW32{2069617641u, 1975382903u}, - LCW32{}, - LCW32{657297235u, 1010955999u}, - LCW32{644643885u, 310002789u}}, - validity); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - - auto seeds64 = cudf::test::fixed_width_column_wrapper({11, 22}); - auto results64 = 
nvtext::word_minhash64(view, cudf::column_view(seeds64)); - using LCW64 = cudf::test::lists_column_wrapper; - LCW64 expected64({LCW64{1940333969930105370ul, 272615362982418219ul}, - LCW64{}, - LCW64{5331949571924938590ul, 2088583894581919741ul}, - LCW64{3400468157617183341ul, 2398577492366130055ul}}, - validity); + // clang-format off + LCW64 expected64({ + LCW64{1818322427062143853, 641024893347719371, 1769570368846988848, 592272835132564366, + 1720818310631833835, 543520776917409353, 1672066252416678822, 494768718702254348, + 1623314194201523817, 446016660487099335, 1574562135986368804, 397264602271944322, + 1525810077771213799, 348512544056789317, 1477058019556058786, 299760485841634304, + 1428305961340903773, 251008427626479291, 1379553903125748768, 202256369411324286, + 1330801844910593755, 153504311196169273, 1282049786695438742, 104752252981014268, + 1233297728480283737, 56000194765859255, 1184545670265128724, 7248136550704242, + 1135793612049973719, 2264339087549243188, 1087041553834818706}, + LCW64{1389920339306667795, 421787002125838902, 1759496674158703968, 791363336977875075, + 2129073009010740141, 1160939671829911248, 192806334649082363, 1530516006681947421, + 562382669501118536, 1900092341533983602, 931959004353154709, 2269668676386019775, + 1301535339205190882, 333402002024361997, 1671111674057227055, 702978336876398170, + 2040688008909263228, 1072554671728434343, 104421334547605450, 1442131006580470516, + 473997669399641631, 1811707341432506689, 843574004251677804, 2181283676284542862, + 1213150339103713977, 245017001922885084, 1582726673955750150, 614593336774921257, + 1952303008807786323, 984169671626957438, 16036334446128545} + }); + // clang-format on CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results64, expected64); } TEST_F(MinHashTest, EmptyTest) { - auto input = cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}); - auto view = cudf::strings_column_view(input->view()); - auto results = nvtext::minhash(view); + auto input = cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}); + auto view = cudf::strings_column_view(input->view()); + auto params = cudf::test::fixed_width_column_wrapper({1, 2, 3}); + auto results = + nvtext::minhash_permuted(view, 0, cudf::column_view(params), cudf::column_view(params), 4); EXPECT_EQ(results->size(), 0); - results = nvtext::minhash64(view); + auto params64 = cudf::test::fixed_width_column_wrapper({1, 2, 3}); + results = nvtext::minhash64_permuted( + view, 0, cudf::column_view(params64), cudf::column_view(params64), 4); EXPECT_EQ(results->size(), 0); } @@ -188,20 +198,39 @@ TEST_F(MinHashTest, ErrorsTest) { auto input = cudf::test::strings_column_wrapper({"this string intentionally left blank"}); auto view = cudf::strings_column_view(input); - EXPECT_THROW(nvtext::minhash(view, 0, 0), std::invalid_argument); - EXPECT_THROW(nvtext::minhash64(view, 0, 0), std::invalid_argument); - auto seeds = cudf::test::fixed_width_column_wrapper(); - EXPECT_THROW(nvtext::minhash(view, cudf::column_view(seeds)), std::invalid_argument); - auto seeds64 = cudf::test::fixed_width_column_wrapper(); - EXPECT_THROW(nvtext::minhash64(view, cudf::column_view(seeds64)), std::invalid_argument); + auto empty = cudf::test::fixed_width_column_wrapper(); + EXPECT_THROW( + nvtext::minhash_permuted(view, 0, cudf::column_view(empty), cudf::column_view(empty), 0), + std::invalid_argument); + auto empty64 = cudf::test::fixed_width_column_wrapper(); + EXPECT_THROW( + nvtext::minhash64_permuted(view, 0, cudf::column_view(empty64), cudf::column_view(empty64), 0), 
+ std::invalid_argument); + EXPECT_THROW( + nvtext::minhash_permuted(view, 0, cudf::column_view(empty), cudf::column_view(empty), 4), + std::invalid_argument); + EXPECT_THROW( + nvtext::minhash64_permuted(view, 0, cudf::column_view(empty64), cudf::column_view(empty64), 4), + std::invalid_argument); std::vector h_input(50000, ""); input = cudf::test::strings_column_wrapper(h_input.begin(), h_input.end()); view = cudf::strings_column_view(input); auto const zeroes = thrust::constant_iterator(0); - seeds = cudf::test::fixed_width_column_wrapper(zeroes, zeroes + 50000); - EXPECT_THROW(nvtext::minhash(view, cudf::column_view(seeds)), std::overflow_error); - seeds64 = cudf::test::fixed_width_column_wrapper(zeroes, zeroes + 50000); - EXPECT_THROW(nvtext::minhash64(view, cudf::column_view(seeds64)), std::overflow_error); + auto params = cudf::test::fixed_width_column_wrapper(zeroes, zeroes + 50000); + EXPECT_THROW( + nvtext::minhash_permuted(view, 0, cudf::column_view(params), cudf::column_view(params), 4), + std::overflow_error); + auto params64 = cudf::test::fixed_width_column_wrapper(zeroes, zeroes + 50000); + EXPECT_THROW(nvtext::minhash64_permuted( + view, 0, cudf::column_view(params64), cudf::column_view(params64), 4), + std::overflow_error); + + EXPECT_THROW( + nvtext::minhash_permuted(view, 0, cudf::column_view(params), cudf::column_view(empty), 4), + std::invalid_argument); + EXPECT_THROW( + nvtext::minhash64_permuted(view, 0, cudf::column_view(params64), cudf::column_view(empty64), 4), + std::invalid_argument); } diff --git a/cpp/tests/text/ngrams_tests.cpp b/cpp/tests/text/ngrams_tests.cpp index 1acb4fc4265..c72c7cfc80e 100644 --- a/cpp/tests/text/ngrams_tests.cpp +++ b/cpp/tests/text/ngrams_tests.cpp @@ -28,8 +28,6 @@ #include -#include - struct TextGenerateNgramsTest : public cudf::test::BaseFixture {}; TEST_F(TextGenerateNgramsTest, Ngrams) diff --git a/cpp/tests/text/normalize_tests.cpp b/cpp/tests/text/normalize_tests.cpp index b0d41004e7e..2515cc917fa 100644 --- a/cpp/tests/text/normalize_tests.cpp +++ b/cpp/tests/text/normalize_tests.cpp @@ -20,7 +20,6 @@ #include #include -#include #include #include diff --git a/cpp/tests/text/stemmer_tests.cpp b/cpp/tests/text/stemmer_tests.cpp index a343913411c..82c4bf53cfc 100644 --- a/cpp/tests/text/stemmer_tests.cpp +++ b/cpp/tests/text/stemmer_tests.cpp @@ -19,7 +19,6 @@ #include #include -#include #include #include diff --git a/cpp/tests/text/subword_tests.cpp b/cpp/tests/text/subword_tests.cpp index a615780c02a..782551ad66e 100644 --- a/cpp/tests/text/subword_tests.cpp +++ b/cpp/tests/text/subword_tests.cpp @@ -19,13 +19,11 @@ #include #include -#include #include #include #include -#include #include // Global environment for temporary files diff --git a/cpp/tests/transform/bools_to_mask_test.cpp b/cpp/tests/transform/bools_to_mask_test.cpp index 215ca158f37..9437440f34d 100644 --- a/cpp/tests/transform/bools_to_mask_test.cpp +++ b/cpp/tests/transform/bools_to_mask_test.cpp @@ -20,10 +20,8 @@ #include #include -#include #include #include -#include #include @@ -32,7 +30,7 @@ struct MaskToNullTest : public cudf::test::BaseFixture { { cudf::test::fixed_width_column_wrapper input_column( input.begin(), input.end(), val.begin()); - std::transform(val.begin(), val.end(), input.begin(), input.begin(), std::logical_and()); + std::transform(val.begin(), val.end(), input.begin(), input.begin(), std::logical_and<>()); auto sample = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i; }); diff --git 
a/cpp/tests/transform/integration/unary_transform_test.cpp b/cpp/tests/transform/integration/unary_transform_test.cpp index 5fa02d9978a..0bdf5b321ac 100644 --- a/cpp/tests/transform/integration/unary_transform_test.cpp +++ b/cpp/tests/transform/integration/unary_transform_test.cpp @@ -30,7 +30,7 @@ namespace transformation { struct UnaryOperationIntegrationTest : public cudf::test::BaseFixture {}; template -void test_udf(char const udf[], Op op, Data data_init, cudf::size_type size, bool is_ptx) +void test_udf(char const* udf, Op op, Data data_init, cudf::size_type size, bool is_ptx) { auto all_valid = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); auto data_iter = cudf::detail::make_counting_transform_iterator(0, data_init); @@ -47,7 +47,7 @@ void test_udf(char const udf[], Op op, Data data_init, cudf::size_type size, boo TEST_F(UnaryOperationIntegrationTest, Transform_FP32_FP32) { // c = a*a*a*a - char const* cuda = + std::string const cuda = R"***( __device__ inline void fdsf ( float* C, @@ -58,7 +58,7 @@ __device__ inline void fdsf ( } )***"; - char const* ptx = + std::string const ptx = R"***( // // Generated by NVIDIA NVVM Compiler @@ -101,17 +101,17 @@ __device__ inline void fdsf ( auto op = [](dtype a) { return a * a * a * a; }; auto data_init = [](cudf::size_type row) { return row % 3; }; - test_udf(cuda, op, data_init, 500, false); - test_udf(ptx, op, data_init, 500, true); + test_udf(cuda.c_str(), op, data_init, 500, false); + test_udf(ptx.c_str(), op, data_init, 500, true); } TEST_F(UnaryOperationIntegrationTest, Transform_INT32_INT32) { // c = a * a - a - char const cuda[] = + std::string const cuda = "__device__ inline void f(int* output,int input){*output = input*input - input;}"; - char const* ptx = + std::string const ptx = R"***( .func _Z1fPii( .param .b64 _Z1fPii_param_0, @@ -136,8 +136,8 @@ TEST_F(UnaryOperationIntegrationTest, Transform_INT32_INT32) auto op = [](dtype a) { return a * a - a; }; auto data_init = [](cudf::size_type row) { return row % 78; }; - test_udf(cuda, op, data_init, 500, false); - test_udf(ptx, op, data_init, 500, true); + test_udf(cuda.c_str(), op, data_init, 500, false); + test_udf(ptx.c_str(), op, data_init, 500, true); } TEST_F(UnaryOperationIntegrationTest, Transform_INT8_INT8) @@ -145,7 +145,7 @@ TEST_F(UnaryOperationIntegrationTest, Transform_INT8_INT8) // Capitalize all the lower case letters // Assuming ASCII, the PTX code is compiled from the following CUDA code - char const cuda[] = + std::string const cuda = R"***( __device__ inline void f( signed char* output, @@ -159,7 +159,7 @@ __device__ inline void f( } )***"; - char const ptx[] = + std::string const ptx = R"***( .func _Z1fPcc( .param .b64 _Z1fPcc_param_0, @@ -191,15 +191,15 @@ __device__ inline void f( auto op = [](dtype a) { return std::toupper(a); }; auto data_init = [](cudf::size_type row) { return 'a' + (row % 26); }; - test_udf(cuda, op, data_init, 500, false); - test_udf(ptx, op, data_init, 500, true); + test_udf(cuda.c_str(), op, data_init, 500, false); + test_udf(ptx.c_str(), op, data_init, 500, true); } TEST_F(UnaryOperationIntegrationTest, Transform_Datetime) { // Add one day to timestamp in microseconds - char const cuda[] = + std::string const cuda = R"***( __device__ inline void f(cudf::timestamp_us* output, cudf::timestamp_us input) { @@ -217,7 +217,7 @@ __device__ inline void f(cudf::timestamp_us* output, cudf::timestamp_us input) auto random_eng = cudf::test::UniformRandomGenerator(0, 100000000); auto data_init = 
[&random_eng](cudf::size_type row) { return random_eng.generate(); }; - test_udf(cuda, op, data_init, 500, false); + test_udf(cuda.c_str(), op, data_init, 500, false); } } // namespace transformation diff --git a/cpp/tests/transform/nans_to_null_test.cpp b/cpp/tests/transform/nans_to_null_test.cpp index ba16c100e7a..42ca872a936 100644 --- a/cpp/tests/transform/nans_to_null_test.cpp +++ b/cpp/tests/transform/nans_to_null_test.cpp @@ -17,12 +17,10 @@ #include #include #include -#include #include #include #include -#include template struct NaNsToNullTest : public cudf::test::BaseFixture { diff --git a/cpp/tests/transpose/transpose_test.cpp b/cpp/tests/transpose/transpose_test.cpp index 5a88c402b8c..7797b2b2cf8 100644 --- a/cpp/tests/transpose/transpose_test.cpp +++ b/cpp/tests/transpose/transpose_test.cpp @@ -22,7 +22,6 @@ #include #include -#include #include #include diff --git a/cpp/tests/types/traits_test.cpp b/cpp/tests/types/traits_test.cpp index 0d9092c33da..46468af515d 100644 --- a/cpp/tests/types/traits_test.cpp +++ b/cpp/tests/types/traits_test.cpp @@ -14,7 +14,6 @@ * limitations under the License. */ -#include #include #include diff --git a/cpp/tests/unary/cast_tests.cpp b/cpp/tests/unary/cast_tests.cpp index 45b89b76070..ed4c1340dbb 100644 --- a/cpp/tests/unary/cast_tests.cpp +++ b/cpp/tests/unary/cast_tests.cpp @@ -20,18 +20,15 @@ #include #include -#include #include #include #include #include #include -#include #include #include -#include #include static auto const test_timestamps_D = std::vector{ diff --git a/cpp/tests/unary/math_ops_test.cpp b/cpp/tests/unary/math_ops_test.cpp index 5bfbf70d5f9..663a919f3f4 100644 --- a/cpp/tests/unary/math_ops_test.cpp +++ b/cpp/tests/unary/math_ops_test.cpp @@ -22,10 +22,6 @@ #include #include #include -#include -#include - -#include #include diff --git a/cpp/tests/unary/unary_ops_test.cpp b/cpp/tests/unary/unary_ops_test.cpp index e7477c34642..3c616461c74 100644 --- a/cpp/tests/unary/unary_ops_test.cpp +++ b/cpp/tests/unary/unary_ops_test.cpp @@ -23,7 +23,6 @@ #include #include -#include #include template diff --git a/cpp/tests/utilities/random_seed.cpp b/cpp/tests/utilities/random_seed.cpp index ab5a31ce161..555d89b7dc5 100644 --- a/cpp/tests/utilities/random_seed.cpp +++ b/cpp/tests/utilities/random_seed.cpp @@ -13,8 +13,9 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +#include -#include +#include namespace cudf { namespace test { diff --git a/cpp/tests/utilities/table_utilities.cu b/cpp/tests/utilities/table_utilities.cu index 354c0b1b57e..8e4906408de 100644 --- a/cpp/tests/utilities/table_utilities.cu +++ b/cpp/tests/utilities/table_utilities.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,10 +15,9 @@ */ #include +#include #include -#include - namespace cudf::test::detail { void expect_table_properties_equal(cudf::table_view lhs, cudf::table_view rhs) { diff --git a/cpp/tests/utilities_tests/batched_memcpy_tests.cu b/cpp/tests/utilities_tests/batched_memcpy_tests.cu new file mode 100644 index 00000000000..98657f8e224 --- /dev/null +++ b/cpp/tests/utilities_tests/batched_memcpy_tests.cu @@ -0,0 +1,139 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +#include +#include +#include +#include + +template <typename T> +struct BatchedMemcpyTest : public cudf::test::BaseFixture {}; + +TEST(BatchedMemcpyTest, BasicTest) +{ + using T1 = int64_t; + + // Device init + auto stream = cudf::get_default_stream(); + auto mr = cudf::get_current_device_resource_ref(); + + // Buffer lengths (in number of elements) + std::vector<size_t> const h_lens{ + 50000, 4, 1000, 0, 250000, 1, 100, 8000, 0, 1, 100, 1000, 10000, 100000, 0, 1, 100000}; + + // Total number of buffers + auto const num_buffs = h_lens.size(); + + // Exclusive sum of buffer lengths for pointers + std::vector<size_t> h_lens_excl_sum(num_buffs); + std::exclusive_scan(h_lens.begin(), h_lens.end(), h_lens_excl_sum.begin(), 0); + + // Corresponding buffer sizes (in bytes) + std::vector<size_t> h_sizes_bytes; + h_sizes_bytes.reserve(num_buffs); + std::transform( + h_lens.cbegin(), h_lens.cend(), std::back_inserter(h_sizes_bytes), [&](auto& size) { + return size * sizeof(T1); + }); + + // Initialize random engine + auto constexpr seed = 0xcead; + std::mt19937 engine{seed}; + using uniform_distribution = + typename std::conditional_t<std::is_same_v<T1, bool>, + std::bernoulli_distribution, + std::conditional_t<std::is_floating_point_v<T1>, + std::uniform_real_distribution<T1>, + std::uniform_int_distribution<T1>>>; + uniform_distribution dist{}; + + // Generate a src vector of random data vectors + std::vector<std::vector<T1>> h_sources; + h_sources.reserve(num_buffs); + std::transform(h_lens.begin(), h_lens.end(), std::back_inserter(h_sources), [&](auto size) { + std::vector<T1> data(size); + std::generate_n(data.begin(), size, [&]() { return T1{dist(engine)}; }); + return data; + }); + // Copy the vectors to device + std::vector<rmm::device_uvector<T1>> h_device_vecs; + h_device_vecs.reserve(h_sources.size()); + std::transform( + h_sources.begin(), h_sources.end(), std::back_inserter(h_device_vecs), [stream, mr](auto& vec) { + return cudf::detail::make_device_uvector_async(vec, stream, mr); + }); + // Pointers to the source vectors + std::vector<T1*> h_src_ptrs; + h_src_ptrs.reserve(h_sources.size()); + std::transform( + h_device_vecs.begin(), h_device_vecs.end(), std::back_inserter(h_src_ptrs), [](auto& vec) { + return static_cast<T1*>(vec.data()); + }); + // Copy the source data pointers to device + auto d_src_ptrs = cudf::detail::make_device_uvector_async(h_src_ptrs, stream, mr); + + // Total number of elements in all buffers + auto const total_buff_len = std::accumulate(h_lens.cbegin(), h_lens.cend(), 0); + + // Create one giant buffer for destination + auto d_dst_data = cudf::detail::make_zeroed_device_uvector_async<T1>(total_buff_len, stream, mr); + // Pointers to destination buffers within the giant destination buffer + std::vector<T1*> h_dst_ptrs(num_buffs); + std::for_each(thrust::make_counting_iterator(static_cast<size_t>(0)), + thrust::make_counting_iterator(num_buffs), + [&](auto i) { return h_dst_ptrs[i] = d_dst_data.data() + h_lens_excl_sum[i]; }); + // Copy destination data pointers to device + auto d_dst_ptrs = cudf::detail::make_device_uvector_async(h_dst_ptrs, stream, mr); + + // Copy buffer size iterators (in bytes) to device + auto d_sizes_bytes = cudf::detail::make_device_uvector_async(h_sizes_bytes, stream, mr); + + // Run the batched memcpy + cudf::detail::batched_memcpy_async( + d_src_ptrs.begin(), d_dst_ptrs.begin(), d_sizes_bytes.begin(), num_buffs, stream); + + // Expected giant destination buffer after the memcpy + std::vector<T1> expected_buffer; + expected_buffer.reserve(total_buff_len); + std::for_each(h_sources.cbegin(), h_sources.cend(), [&expected_buffer](auto& source) { + expected_buffer.insert(expected_buffer.end(), source.begin(), source.end()); + }); + + // Copy over the result destination buffer to host and synchronize the stream + auto result_dst_buffer = + cudf::detail::make_std_vector_sync(cudf::device_span<T1>(d_dst_data), stream); + + // Check if both vectors are equal + EXPECT_TRUE( + std::equal(expected_buffer.begin(), expected_buffer.end(), result_dst_buffer.begin())); +} diff --git a/cpp/tests/utilities_tests/batched_memset_tests.cu b/cpp/tests/utilities_tests/batched_memset_tests.cu index bed0f40d70e..0eeb7b95318 100644 --- a/cpp/tests/utilities_tests/batched_memset_tests.cu +++ b/cpp/tests/utilities_tests/batched_memset_tests.cu @@ -18,8 +18,8 @@ #include #include +#include #include -#include #include #include #include @@ -78,7 +78,7 @@ TEST(MultiBufferTestIntegral, BasicTest1) }); // Function Call - cudf::io::detail::batched_memset(memset_bufs, uint64_t{0}, stream); + cudf::detail::batched_memset(memset_bufs, uint64_t{0}, stream); // Set all buffer regions to 0 for expected comparison std::for_each( diff --git a/cpp/tests/utilities_tests/column_debug_tests.cpp b/cpp/tests/utilities_tests/column_debug_tests.cpp index 7aa05af4591..2a57d678d07 100644 --- a/cpp/tests/utilities_tests/column_debug_tests.cpp +++ b/cpp/tests/utilities_tests/column_debug_tests.cpp @@ -16,12 +16,9 @@ #include #include -#include #include #include -#include - #include #include diff --git a/cpp/tests/utilities_tests/column_utilities_tests.cpp b/cpp/tests/utilities_tests/column_utilities_tests.cpp index 9d6d5ccb9b5..a13ce825d0b 100644 --- a/cpp/tests/utilities_tests/column_utilities_tests.cpp +++ b/cpp/tests/utilities_tests/column_utilities_tests.cpp @@ -17,20 +17,16 @@ #include #include #include -#include #include #include #include #include #include -#include #include #include -#include - template <typename T> struct ColumnUtilitiesTest : public cudf::test::BaseFixture { cudf::test::UniformRandomGenerator random; diff --git a/cpp/tests/utilities_tests/column_wrapper_tests.cpp b/cpp/tests/utilities_tests/column_wrapper_tests.cpp index 479c6687e75..339678f3be8 100644 --- a/cpp/tests/utilities_tests/column_wrapper_tests.cpp +++ b/cpp/tests/utilities_tests/column_wrapper_tests.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include diff --git a/cpp/tests/utilities_tests/lists_column_wrapper_tests.cpp b/cpp/tests/utilities_tests/lists_column_wrapper_tests.cpp index 5e3fda5e6f7..ff50dc39979 100644 --- a/cpp/tests/utilities_tests/lists_column_wrapper_tests.cpp +++ b/cpp/tests/utilities_tests/lists_column_wrapper_tests.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include diff --git a/cpp/tests/utilities_tests/logger_tests.cpp b/cpp/tests/utilities_tests/logger_tests.cpp index d052e20eedb..cfab570833b 100644 --- a/cpp/tests/utilities_tests/logger_tests.cpp +++ b/cpp/tests/utilities_tests/logger_tests.cpp @@ -28,16 
+28,17 @@ class LoggerTest : public cudf::test::BaseFixture { std::vector<spdlog::sink_ptr> prev_sinks; public: - LoggerTest() : prev_level{cudf::logger().level()}, prev_sinks{cudf::logger().sinks()} + LoggerTest() + : prev_level{cudf::detail::logger().level()}, prev_sinks{cudf::detail::logger().sinks()} { - cudf::logger().sinks() = {std::make_shared<spdlog::sinks::ostream_sink_mt>(oss)}; - cudf::logger().set_formatter( + cudf::detail::logger().sinks() = {std::make_shared<spdlog::sinks::ostream_sink_mt>(oss)}; + cudf::detail::logger().set_formatter( std::unique_ptr<spdlog::formatter>(new spdlog::pattern_formatter("%v"))); } ~LoggerTest() override { - cudf::logger().set_level(prev_level); - cudf::logger().sinks() = prev_sinks; + cudf::detail::logger().set_level(prev_level); + cudf::detail::logger().sinks() = prev_sinks; } void clear_sink() { oss.str(""); } @@ -46,32 +47,32 @@ class LoggerTest : public cudf::test::BaseFixture { TEST_F(LoggerTest, Basic) { - cudf::logger().critical("crit msg"); + cudf::detail::logger().critical("crit msg"); ASSERT_EQ(this->sink_content(), "crit msg\n"); } TEST_F(LoggerTest, DefaultLevel) { - cudf::logger().trace("trace"); - cudf::logger().debug("debug"); - cudf::logger().info("info"); - cudf::logger().warn("warn"); - cudf::logger().error("error"); - cudf::logger().critical("critical"); + cudf::detail::logger().trace("trace"); + cudf::detail::logger().debug("debug"); + cudf::detail::logger().info("info"); + cudf::detail::logger().warn("warn"); + cudf::detail::logger().error("error"); + cudf::detail::logger().critical("critical"); ASSERT_EQ(this->sink_content(), "warn\nerror\ncritical\n"); } TEST_F(LoggerTest, CustomLevel) { - cudf::logger().set_level(spdlog::level::warn); - cudf::logger().info("info"); - cudf::logger().warn("warn"); + cudf::detail::logger().set_level(spdlog::level::warn); + cudf::detail::logger().info("info"); + cudf::detail::logger().warn("warn"); ASSERT_EQ(this->sink_content(), "warn\n"); this->clear_sink(); - cudf::logger().set_level(spdlog::level::debug); - cudf::logger().trace("trace"); - cudf::logger().debug("debug"); + cudf::detail::logger().set_level(spdlog::level::debug); + cudf::detail::logger().trace("trace"); + cudf::detail::logger().debug("debug"); ASSERT_EQ(this->sink_content(), "debug\n"); } diff --git a/cpp/tests/utilities_tests/pinned_memory_tests.cpp b/cpp/tests/utilities_tests/pinned_memory_tests.cpp index ae7c6fa8b8c..7b8ee840da4 100644 --- a/cpp/tests/utilities_tests/pinned_memory_tests.cpp +++ b/cpp/tests/utilities_tests/pinned_memory_tests.cpp @@ -14,6 +14,8 @@ * limitations under the License. 
*/ +#include "io/utilities/hostdevice_vector.hpp" + #include #include #include @@ -22,10 +24,17 @@ #include #include #include +#include #include #include +using cudf::host_span; +using cudf::detail::host_2dspan; +using cudf::detail::hostdevice_2dvector; +using cudf::detail::hostdevice_span; +using cudf::detail::hostdevice_vector; + class PinnedMemoryTest : public cudf::test::BaseFixture { size_t prev_copy_threshold; size_t prev_alloc_threshold; @@ -125,3 +134,63 @@ TEST_F(PinnedMemoryTest, MakeHostVector) EXPECT_FALSE(vec.get_allocator().is_device_accessible()); } } + +TEST_F(PinnedMemoryTest, HostSpan) +{ + auto test_ctors = [](auto&& vec) { + auto const is_vec_device_accessible = vec.get_allocator().is_device_accessible(); + // Test conversion from a vector + auto const span = host_span{vec}; + EXPECT_EQ(span.is_device_accessible(), is_vec_device_accessible); + // Test conversion from host_span with different type + auto const span_converted = host_span{span}; + EXPECT_EQ(span_converted.is_device_accessible(), is_vec_device_accessible); + }; + + cudf::set_allocate_host_as_pinned_threshold(7); + for (int i = 1; i < 10; i++) { + // some iterations will use pinned memory, some will not + test_ctors(cudf::detail::make_host_vector(i, cudf::get_default_stream())); + } + + auto stream{cudf::get_default_stream()}; + + // hostdevice vectors use pinned memory for the host side; test that host_span can be constructed + // from a hostdevice_vector with correct device accessibility + + hostdevice_vector hd_vec(10, stream); + auto const span = host_span{hd_vec}; + EXPECT_TRUE(span.is_device_accessible()); + + // test host_view and operator[] + { + hostdevice_2dvector hd_2dvec(10, 10, stream); + auto const span2d = hd_2dvec.host_view().flat_view(); + EXPECT_TRUE(span2d.is_device_accessible()); + + auto const span2d_from_cast = host_2dspan{hd_2dvec}; + EXPECT_TRUE(span2d_from_cast.flat_view().is_device_accessible()); + + auto const row_span = hd_2dvec[0]; + EXPECT_TRUE(row_span.is_device_accessible()); + } + + // test const versions of host_view and operator[] + { + hostdevice_2dvector const const_hd_2dvec(10, 10, stream); + auto const const_span2d = const_hd_2dvec.host_view().flat_view(); + EXPECT_TRUE(const_span2d.is_device_accessible()); + + auto const const_span2d_from_cast = host_2dspan{const_hd_2dvec}; + EXPECT_TRUE(const_span2d_from_cast.flat_view().is_device_accessible()); + + auto const const_row_span = const_hd_2dvec[0]; + EXPECT_TRUE(const_row_span.is_device_accessible()); + } + + // test hostdevice_span + { + hostdevice_span hd_span(hd_vec); + EXPECT_TRUE(host_span{hd_span}.is_device_accessible()); + } +} diff --git a/cpp/tests/utilities_tests/span_tests.cu b/cpp/tests/utilities_tests/span_tests.cu index 019d6adc007..5389e1c069d 100644 --- a/cpp/tests/utilities_tests/span_tests.cu +++ b/cpp/tests/utilities_tests/span_tests.cu @@ -336,58 +336,50 @@ auto get_test_hostdevice_vector() TEST(HostDeviceSpanTest, CanCreateFullSubspan) { - auto message = get_test_hostdevice_vector(); - auto const message_span = - cudf::detail::hostdevice_span(message.host_ptr(), message.device_ptr(), message.size()); + auto message = get_test_hostdevice_vector(); + auto const message_span = cudf::detail::hostdevice_span{message}; - expect_equivalent(message_span, message.subspan(0, message_span.size())); + expect_equivalent(message_span.subspan(0, message_span.size()), message_span); } TEST(HostDeviceSpanTest, CanCreateHostSpan) { auto message = get_test_hostdevice_vector(); auto const message_span = 
host_span(message.host_ptr(), message.size()); - auto const hd_span = - cudf::detail::hostdevice_span(message.host_ptr(), message.device_ptr(), message.size()); + auto const hd_span = cudf::detail::hostdevice_span{message}; expect_equivalent(message_span, cudf::host_span(hd_span)); } TEST(HostDeviceSpanTest, CanTakeSubspanFull) { - auto message = get_test_hostdevice_vector(); - auto const message_span = - cudf::detail::hostdevice_span(message.host_ptr(), message.device_ptr(), message.size()); + auto message = get_test_hostdevice_vector(); + auto const message_span = cudf::detail::hostdevice_span{message}; - expect_match("hello world", message.subspan(0, 11)); expect_match("hello world", message_span.subspan(0, 11)); } TEST(HostDeviceSpanTest, CanTakeSubspanPartial) { - auto message = get_test_hostdevice_vector(); - auto const message_span = - cudf::detail::hostdevice_span(message.host_ptr(), message.device_ptr(), message.size()); + auto message = get_test_hostdevice_vector(); + auto const message_span = cudf::detail::hostdevice_span{message}; - expect_match("lo w", message.subspan(3, 4)); expect_match("lo w", message_span.subspan(3, 4)); } TEST(HostDeviceSpanTest, CanGetData) { - auto message = get_test_hostdevice_vector(); - auto const message_span = - cudf::detail::hostdevice_span(message.host_ptr(), message.device_ptr(), message.size()); + auto message = get_test_hostdevice_vector(); + auto const message_span = cudf::detail::hostdevice_span{message}; EXPECT_EQ(message.host_ptr(), message_span.host_ptr()); } TEST(HostDeviceSpanTest, CanGetSize) { - auto message = get_test_hostdevice_vector(); - auto const message_span = - cudf::detail::hostdevice_span(message.host_ptr(), message.device_ptr(), message.size()); - auto const empty_span = cudf::detail::hostdevice_span(); + auto message = get_test_hostdevice_vector(); + auto const message_span = cudf::detail::hostdevice_span{message}; + auto const empty_span = cudf::detail::hostdevice_span(); EXPECT_EQ(static_cast(11), message_span.size()); EXPECT_EQ(static_cast(0), empty_span.size()); @@ -413,8 +405,7 @@ TEST(HostDeviceSpanTest, CanCopySpan) cudf::detail::hostdevice_span message_span_copy; { - auto const message_span = - cudf::detail::hostdevice_span(message.host_ptr(), message.device_ptr(), message.size()); + auto const message_span = cudf::detail::hostdevice_span{message}; message_span_copy = message_span; } diff --git a/cpp/tests/utilities_tests/type_check_tests.cpp b/cpp/tests/utilities_tests/type_check_tests.cpp index fecb896f95a..c1c5776be74 100644 --- a/cpp/tests/utilities_tests/type_check_tests.cpp +++ b/cpp/tests/utilities_tests/type_check_tests.cpp @@ -18,7 +18,6 @@ #include #include -#include #include #include #include diff --git a/cpp/tests/utilities_tests/type_list_tests.cpp b/cpp/tests/utilities_tests/type_list_tests.cpp index 849457056e4..6c3a84763a0 100644 --- a/cpp/tests/utilities_tests/type_list_tests.cpp +++ b/cpp/tests/utilities_tests/type_list_tests.cpp @@ -14,7 +14,6 @@ * limitations under the License. 
*/ -#include #include using namespace cudf::test;  // this will make reading code way easier @@ -23,6 +22,7 @@ namespace { // Work around to remove parentheses surrounding a type template <typename T> struct argument_type; + template <typename T, typename U> struct argument_type<T(U)> { using type = U; diff --git a/dependencies.yaml b/dependencies.yaml index e1dda6a92c7..f903e052486 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -6,15 +6,24 @@ files: cuda: ["11.8", "12.5"] arch: [x86_64] includes: + # Note that clang-tidy is not included here because cudf's preferred + # version conflicts with the rest of RAPIDS as well as its own + # clang-format version. Until we update our clang-format version we will + # not be able to install both into the same environment. Moreover, using + # this version will break compatibility with other RAPIDS libraries that + # are still using 16.0.6, and as such would break any unified + # environment like that used in unified devcontainers. - build_base - build_all - build_cpp - build_python_common + - clang_format - cuda - cuda_version - depends_on_cupy - depends_on_libkvikio - depends_on_librmm + - depends_on_nvcomp - depends_on_rmm - develop - docs @@ -32,9 +41,11 @@ files: - test_cpp - test_python_common - test_python_cudf + - test_python_cudf_common - test_python_dask_cudf - test_python_pylibcudf - test_python_cudf_pandas + - test_python_cudf_polars test_static_build: output: none includes: @@ -50,6 +61,7 @@ files: - cuda_version - py_version - test_python_common + - test_python_cudf_common - test_python_cudf - test_python_cudf_pandas test_python_cudf: @@ -58,6 +70,7 @@ files: - cuda_version - py_version - test_python_common + - test_python_cudf_common - test_python_cudf test_python_other: output: none @@ -65,6 +78,7 @@ files: - cuda_version - py_version - test_python_common + - test_python_cudf_common - test_python_dask_cudf test_java: output: none @@ -85,6 +99,16 @@ files: includes: - develop - py_version + clang_tidy: + output: none + includes: + - build_all + - build_base + - clang_tidy + - cuda + - cuda_version + - develop + - py_version docs: output: none includes: @@ -133,6 +157,7 @@ files: key: test includes: - test_python_common + - test_python_cudf_common - test_python_cudf py_build_libcudf: output: pyproject @@ -152,6 +177,14 @@ files: - build_cpp - depends_on_libkvikio - depends_on_librmm + py_run_libcudf: + output: pyproject + pyproject_dir: python/libcudf + extras: + table: project + includes: + - depends_on_libkvikio + - depends_on_nvcomp py_build_pylibcudf: output: pyproject pyproject_dir: python/pylibcudf @@ -189,6 +222,7 @@ files: key: test includes: - test_python_common + - test_python_cudf_common - test_python_pylibcudf py_test_pandas_cudf: output: pyproject @@ -206,7 +240,7 @@ files: key: cudf-pandas-tests includes: - test_python_cudf_pandas - py_rapids_build_cudf_polars: + py_build_cudf_polars: output: pyproject pyproject_dir: python/cudf_polars extras: @@ -221,6 +255,14 @@ files: includes: - run_cudf_polars - depends_on_pylibcudf + py_run_cudf_polars_experimental: + output: pyproject + pyproject_dir: python/cudf_polars + extras: + table: project.optional-dependencies + key: experimental + includes: + - run_cudf_polars_experimental py_test_cudf_polars: output: pyproject pyproject_dir: python/cudf_polars @@ -229,6 +271,7 @@ files: key: test includes: - test_python_common + - test_python_cudf_polars py_build_dask_cudf: output: pyproject pyproject_dir: python/dask_cudf @@ -254,6 +297,7 @@ files: key: test includes: - test_python_common + - 
test_python_cudf_common - test_python_dask_cudf py_build_cudf_kafka: output: pyproject @@ -286,6 +330,7 @@ files: key: test includes: - test_python_common + - test_python_cudf_common py_build_custreamz: output: pyproject pyproject_dir: python/custreamz @@ -310,6 +355,7 @@ files: key: test includes: - test_python_common + - test_python_cudf_common channels: - rapidsai - rapidsai-nightly @@ -367,9 +413,27 @@ dependencies: - fmt>=11.0.2,<12 - flatbuffers==24.3.25 - librdkafka>=2.5.0,<2.6.0a0 - # Align nvcomp version with rapids-cmake - - nvcomp==4.0.1 - spdlog>=1.14.1,<1.15 + depends_on_nvcomp: + common: + - output_types: conda + packages: + # Align nvcomp version with rapids-cmake + - nvcomp==4.1.0.6 + specific: + - output_types: [requirements, pyproject] + matrices: + - matrix: + cuda: "12.*" + packages: + - nvidia-nvcomp-cu12==4.1.0.6 + - matrix: + cuda: "11.*" + packages: + - nvidia-nvcomp-cu11==4.1.0.6 + - matrix: + packages: + - nvidia-nvcomp==4.1.0.6 rapids_build_skbuild: common: - output_types: [conda, requirements, pyproject] @@ -395,9 +459,18 @@ dependencies: - cython>=3.0.3 pyarrow_run: common: - - output_types: [conda, requirements, pyproject] + - output_types: [conda] + packages: + - pyarrow>=14.0.0,<19.0.0a0 + - output_types: [requirements, pyproject] packages: - - pyarrow>=14.0.0,<18.0.0a0 + # pyarrow 17.0.0 wheels have a subtle issue around threading that + # can cause segmentation faults around imports on arm. It appears to + # be highly dependent on the exact build configuration, so we'll just + # avoid 17.0.0 for now unless we observe similar issues in future + # releases as well. + - pyarrow>=14.0.0,<19.0.0a0; platform_machine=='x86_64' + - pyarrow>=14.0.0,<19.0.0a0,!=17.0.0; platform_machine=='aarch64' cuda_version: specific: - output_types: conda @@ -518,17 +591,28 @@ dependencies: # pre-commit requires identify minimum version 1.0, but clang-format requires textproto support and that was # added in 2.5.20, so we need to call out the minimum version needed for our plugins - identify>=2.5.20 + - output_types: conda + packages: + - &doxygen doxygen=1.9.1 # pre-commit hook needs a specific version. + clang_format: + common: - output_types: conda packages: - clang==16.0.6 - clang-tools=16.0.6 - - &doxygen doxygen=1.9.1 # pre-commit hook needs a specific version. 
+ clang_tidy: + common: + - output_types: conda + packages: + - clang==19.1.3 + - clang-tools==19.1.3 + - include-what-you-use==0.23.0 docs: common: - output_types: [conda] packages: - breathe>=4.35.0 - - dask-cuda==24.10.* + - dask-cuda==24.12.*,>=0.0.0a0 - *doxygen - make - myst-nb @@ -576,7 +660,7 @@ dependencies: packages: - fsspec>=0.6.0 - &numpy numpy>=1.23,<3.0a0 - - pandas>=2.0,<2.2.3dev0 + - pandas>=2.0,<2.2.4dev0 run_pylibcudf: common: - output_types: [conda, requirements, pyproject] @@ -595,17 +679,17 @@ dependencies: matrices: - matrix: {cuda: "12.*"} packages: - - cuda-python>=12.0,<13.0a0 + - cuda-python>=12.0,<13.0a0,<=12.6.0 - matrix: {cuda: "11.*"} packages: &run_pylibcudf_packages_all_cu11 - - cuda-python>=11.7.1,<12.0a0 + - cuda-python>=11.7.1,<12.0a0,<=11.8.3 - {matrix: null, packages: *run_pylibcudf_packages_all_cu11} run_cudf: common: - output_types: [conda, requirements, pyproject] packages: - cachetools - - &numba numba>=0.57 + - &numba-cuda-dep numba-cuda>=0.0.13,<0.0.18 - nvtx>=0.2.1 - packaging - rich @@ -621,10 +705,10 @@ dependencies: matrices: - matrix: {cuda: "12.*"} packages: - - cuda-python>=12.0,<13.0a0 + - cuda-python>=12.0,<13.0a0,<=12.6.0 - matrix: {cuda: "11.*"} packages: &run_cudf_packages_all_cu11 - - cuda-python>=11.7.1,<12.0a0 + - cuda-python>=11.7.1,<12.0a0,<=11.8.3 - {matrix: null, packages: *run_cudf_packages_all_cu11} - output_types: conda matrices: @@ -664,12 +748,18 @@ dependencies: common: - output_types: [conda, requirements, pyproject] packages: - - polars>=1.8,<1.9 + - polars>=1.11,<1.15 + run_cudf_polars_experimental: + common: + - output_types: [conda, requirements, pyproject] + packages: + - rapids-dask-dependency==24.12.*,>=0.0.0a0 run_dask_cudf: common: - output_types: [conda, requirements, pyproject] packages: - - rapids-dask-dependency==24.10.* + - pynvml>=11.4.1,<12.0.0a0 + - rapids-dask-dependency==24.12.*,>=0.0.0a0 run_custreamz: common: - output_types: conda @@ -714,14 +804,19 @@ dependencies: - pytest<8 - pytest-cov - pytest-xdist + test_python_cudf_common: specific: # Define additional constraints for testing with oldest dependencies. 
- output_types: [conda, requirements] matrices: - matrix: {dependencies: "oldest"} packages: - - numba==0.57.* + - *numba-cuda-dep - pandas==2.0.* + - matrix: {dependencies: "latest"} + packages: + - numba-cuda==0.0.15 + - pandas==2.2.3 - matrix: packages: - output_types: conda @@ -761,6 +856,7 @@ dependencies: - pytest-benchmark - pytest-cases>=3.8.2 - scipy + - mmh3 - output_types: conda packages: - aiobotocore>=2.2.0 @@ -769,12 +865,14 @@ dependencies: - msgpack-python - moto>=4.0.8 - s3fs>=2022.3.0 - - output_types: pyproject + - python-xxhash + - output_types: [pyproject, requirements] packages: - msgpack - &tokenizers tokenizers==0.15.2 - &transformers transformers==4.39.3 - tzdata + - xxhash specific: - output_types: [conda, requirements] matrices: @@ -801,8 +899,8 @@ dependencies: common: - output_types: [conda, requirements, pyproject] packages: - - dask-cuda==24.10.* - - *numba + - dask-cuda==24.12.*,>=0.0.0a0 + - *numba-cuda-dep specific: - output_types: [conda, requirements] matrices: @@ -812,11 +910,16 @@ dependencies: - pyarrow==14.0.1 - matrix: packages: + test_python_cudf_polars: + common: + - output_types: [conda, requirements, pyproject] + packages: + - *numpy depends_on_libcudf: common: - output_types: conda packages: - - &libcudf_unsuffixed libcudf==24.10.* + - &libcudf_unsuffixed libcudf==24.12.*,>=0.0.0a0 - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -830,18 +933,18 @@ dependencies: cuda: "12.*" cuda_suffixed: "true" packages: - - libcudf-cu12==24.10.* + - libcudf-cu12==24.12.*,>=0.0.0a0 - matrix: cuda: "11.*" cuda_suffixed: "true" packages: - - libcudf-cu11==24.10.* + - libcudf-cu11==24.12.*,>=0.0.0a0 - {matrix: null, packages: [*libcudf_unsuffixed]} depends_on_pylibcudf: common: - output_types: conda packages: - - &pylibcudf_unsuffixed pylibcudf==24.10.* + - &pylibcudf_unsuffixed pylibcudf==24.12.*,>=0.0.0a0 - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -855,18 +958,18 @@ dependencies: cuda: "12.*" cuda_suffixed: "true" packages: - - pylibcudf-cu12==24.10.* + - pylibcudf-cu12==24.12.*,>=0.0.0a0 - matrix: cuda: "11.*" cuda_suffixed: "true" packages: - - pylibcudf-cu11==24.10.* + - pylibcudf-cu11==24.12.*,>=0.0.0a0 - {matrix: null, packages: [*pylibcudf_unsuffixed]} depends_on_cudf: common: - output_types: conda packages: - - &cudf_unsuffixed cudf==24.10.* + - &cudf_unsuffixed cudf==24.12.*,>=0.0.0a0 - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -880,18 +983,18 @@ dependencies: cuda: "12.*" cuda_suffixed: "true" packages: - - cudf-cu12==24.10.* + - cudf-cu12==24.12.*,>=0.0.0a0 - matrix: cuda: "11.*" cuda_suffixed: "true" packages: - - cudf-cu11==24.10.* + - cudf-cu11==24.12.*,>=0.0.0a0 - {matrix: null, packages: [*cudf_unsuffixed]} depends_on_cudf_kafka: common: - output_types: conda packages: - - &cudf_kafka_unsuffixed cudf_kafka==24.10.* + - &cudf_kafka_unsuffixed cudf_kafka==24.12.*,>=0.0.0a0 - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -905,12 +1008,12 @@ dependencies: cuda: "12.*" cuda_suffixed: "true" packages: - - cudf_kafka-cu12==24.10.* + - cudf_kafka-cu12==24.12.*,>=0.0.0a0 - matrix: cuda: "11.*" cuda_suffixed: "true" packages: - - cudf_kafka-cu11==24.10.* + - cudf_kafka-cu11==24.12.*,>=0.0.0a0 - {matrix: null, packages: [*cudf_kafka_unsuffixed]} depends_on_cupy: common: @@ 
-931,7 +1034,7 @@ dependencies: common: - output_types: conda packages: - - &libkvikio_unsuffixed libkvikio==24.10.* + - &libkvikio_unsuffixed libkvikio==24.12.*,>=0.0.0a0 - output_types: requirements packages: - --extra-index-url=https://pypi.nvidia.com @@ -943,12 +1046,12 @@ dependencies: cuda: "12.*" cuda_suffixed: "true" packages: - - libkvikio-cu12==24.10.* + - libkvikio-cu12==24.12.*,>=0.0.0a0 - matrix: cuda: "11.*" cuda_suffixed: "true" packages: - - libkvikio-cu11==24.10.* + - libkvikio-cu11==24.12.*,>=0.0.0a0 - matrix: packages: - *libkvikio_unsuffixed @@ -956,7 +1059,7 @@ dependencies: common: - output_types: conda packages: - - &librmm_unsuffixed librmm==24.10.* + - &librmm_unsuffixed librmm==24.12.*,>=0.0.0a0 - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -970,12 +1073,12 @@ dependencies: cuda: "12.*" cuda_suffixed: "true" packages: - - librmm-cu12==24.10.* + - librmm-cu12==24.12.*,>=0.0.0a0 - matrix: cuda: "11.*" cuda_suffixed: "true" packages: - - librmm-cu11==24.10.* + - librmm-cu11==24.12.*,>=0.0.0a0 - matrix: packages: - *librmm_unsuffixed @@ -983,7 +1086,7 @@ dependencies: common: - output_types: conda packages: - - &rmm_unsuffixed rmm==24.10.* + - &rmm_unsuffixed rmm==24.12.*,>=0.0.0a0 - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -997,12 +1100,12 @@ dependencies: cuda: "12.*" cuda_suffixed: "true" packages: - - rmm-cu12==24.10.* + - rmm-cu12==24.12.*,>=0.0.0a0 - matrix: cuda: "11.*" cuda_suffixed: "true" packages: - - rmm-cu11==24.10.* + - rmm-cu11==24.12.*,>=0.0.0a0 - matrix: packages: - *rmm_unsuffixed diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py index 95813907bf4..fbb9ca4b128 100644 --- a/docs/cudf/source/conf.py +++ b/docs/cudf/source/conf.py @@ -26,16 +26,18 @@ import tempfile import warnings import xml.etree.ElementTree as ET +from enum import IntEnum +from typing import Any +import cudf from docutils.nodes import Text from packaging.version import Version -from sphinx.addnodes import pending_xref -from sphinx.highlighting import lexers -from sphinx.ext import intersphinx from pygments.lexer import RegexLexer from pygments.token import Text as PText - -import cudf +from sphinx.addnodes import pending_xref +from sphinx.ext import intersphinx +from sphinx.ext.autodoc import ClassDocumenter, bool_option +from sphinx.highlighting import lexers class PseudoLexer(RegexLexer): @@ -342,7 +344,10 @@ def clean_all_xml_files(path): "cudf.Series": ("cudf.core.series.Series", "cudf.Series"), "cudf.Index": ("cudf.core.index.Index", "cudf.Index"), "cupy.core.core.ndarray": ("cupy.ndarray", "cupy.ndarray"), - "DeviceBuffer": ("rmm._lib.device_buffer.DeviceBuffer", "rmm.DeviceBuffer"), + "DeviceBuffer": ( + "rmm.pylibrmm.device_buffer.DeviceBuffer", + "rmm.DeviceBuffer", + ), } @@ -373,7 +378,14 @@ def _generate_namespaces(namespaces): _all_namespaces = _generate_namespaces( { # Note that io::datasource is actually a nested class - "cudf": {"io", "io::datasource", "strings", "ast", "ast::expression", "io::text"}, + "cudf": { + "io", + "io::datasource", + "strings", + "ast", + "ast::expression", + "io::text", + }, "numeric": {}, "nvtext": {}, } @@ -554,6 +566,8 @@ def on_missing_reference(app, env, node, contnode): nitpick_ignore = [ + # Erroneously warned in ParquetColumnSchema.name + ("py:class", "unicode"), ("py:class", "SeriesOrIndex"), ("py:class", "Dtype"), # The following are erroneously warned due to @@ -640,9 
+654,54 @@ def linkcode_resolve(domain, info) -> str | None: f"branch-{version}/python/cudf/cudf/{fn}{linespec}" ) + # Needed to avoid a build warning for the PandasCompat extension suppress_warnings = ["myst.domains"] + +class PLCIntEnumDocumenter(ClassDocumenter): + objtype = "enum" + directivetype = "attribute" + priority = 10 + ClassDocumenter.priority + + option_spec = dict(ClassDocumenter.option_spec) + + @classmethod + def can_document_member( + cls, member: Any, membername: str, isattr: bool, parent: Any + ) -> bool: + try: + return issubclass( + member, IntEnum + ) and member.__module__.startswith("pylibcudf") + except TypeError: + return False + + def add_directive_header(self, sig: str) -> None: + self.directivetype = "attribute" + super().add_directive_header(sig) + + def add_content(self, more_content) -> None: + doc_as_attr = self.doc_as_attr + self.doc_as_attr = False + super().add_content(more_content) + self.doc_as_attr = doc_as_attr + source_name = self.get_sourcename() + enum_object: IntEnum = self.object + + if self.object.__name__ != "Kind": + self.add_line(f"See also :cpp:enum:`cudf::{self.object.__name__}`.", source_name) + self.add_line("", source_name) + self.add_line("Enum members", source_name) + self.add_line("", source_name) + + for the_member_name in enum_object.__members__: # type: ignore[attr-defined] + self.add_line( + f"* ``{the_member_name}``", source_name + ) + self.add_line("", source_name) + + def setup(app): app.add_css_file("https://docs.rapids.ai/assets/css/custom.css") app.add_js_file( @@ -650,3 +709,5 @@ def setup(app): ) app.connect("doctree-read", resolve_aliases) app.connect("missing-reference", on_missing_reference) + app.setup_extension("sphinx.ext.autodoc") + app.add_autodocumenter(PLCIntEnumDocumenter) diff --git a/docs/cudf/source/cudf_pandas/faq.md b/docs/cudf/source/cudf_pandas/faq.md index 34b657488c1..5024747227e 100644 --- a/docs/cudf/source/cudf_pandas/faq.md +++ b/docs/cudf/source/cudf_pandas/faq.md @@ -181,6 +181,32 @@ There are a few known limitations that you should be aware of: ``` - `cudf.pandas` (and cuDF in general) is only compatible with pandas 2. Version 24.02 of cudf was the last to support pandas 1.5.x. +- In order for `cudf.pandas` to produce a proxy array that ducktypes as a NumPy + array, we create a proxy type that actually subclasses `numpy.ndarray`. We can + verify this with an isinstance check. + + ```python + %load_ext cudf.pandas + import pandas as pd + import numpy as np + + arr = pd.Series([1, 1, 2]).unique() # returns a proxy array + isinstance(arr, np.ndarray) # returns True, where arr is a proxy array + ``` + Because the proxy type ducktypes as a NumPy array, NumPy functions may attempt to + access internal members, such as the [data buffer](https://numpy.org/doc/stable/dev/internals.html#internal-organization-of-numpy-arrays), via the NumPy C API. + However, our proxy mechanism is designed to proxy function calls at the Python + level, which is incompatible with these types of accesses. To handle these + situations, we perform an eager device-to-host (DtoH) copy, which sets the data + buffer correctly but incurs extra time when creating the proxy array. + In the previous example, creating `arr` performed this kind of implicit DtoH transfer. + + With this approach, we also get compatibility with third-party libraries like `torch`. + + ```python + import torch + x = torch.from_numpy(arr) + ```
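+
+  As a further minimal sketch (assuming the session above; `np.mean` stands in for
+  any NumPy function that reads the data buffer through the NumPy C API), the eager
+  DtoH copy is what makes such calls succeed:
+
+  ```python
+  %load_ext cudf.pandas
+  import pandas as pd
+  import numpy as np
+
+  arr = pd.Series([1, 1, 2]).unique()  # proxy array; eager DtoH copy happens here
+  np.mean(arr)  # returns 1.5, computed from the host data buffer
+  ```

## Can I force running on the CPU?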
diff --git a/docs/cudf/source/cudf_polars/engine_options.md b/docs/cudf/source/cudf_polars/engine_options.md new file mode 100644 index 00000000000..afb2bb6e8b9 --- /dev/null +++ b/docs/cudf/source/cudf_polars/engine_options.md @@ -0,0 +1,32 @@ +# GPUEngine Configuration Options + +The `polars.GPUEngine` object may be configured in several different ways. + +## Parquet Reader Options +Reading large parquet files can use a large amount of memory, especially when the files are compressed. This may lead to out-of-memory errors for some workflows. To mitigate this, the "chunked" parquet reader may be selected. When enabled, parquet files are read in chunks, limiting the peak memory usage at the cost of a small drop in performance. + + +To configure the parquet reader, pass a dictionary of options to the `parquet_options` keyword of the `GPUEngine` object. Valid keys and values are: +- `chunked` indicates whether chunked parquet reading is used; it is on by default. +- [`chunk_read_limit`](https://docs.rapids.ai/api/libcudf/legacy/classcudf_1_1io_1_1chunked__parquet__reader#aad118178b7536b7966e3325ae1143a1a) controls the maximum size per chunk. By default, the maximum chunk size is unlimited. +- [`pass_read_limit`](https://docs.rapids.ai/api/libcudf/legacy/classcudf_1_1io_1_1chunked__parquet__reader#aad118178b7536b7966e3325ae1143a1a) controls the maximum memory used for decompression. The default pass read limit is 16GiB. + +For example, to select the chunked reader with custom values for `pass_read_limit` and `chunk_read_limit`: +```python +engine = GPUEngine( + parquet_options={ + 'chunked': True, + 'chunk_read_limit': int(1e9), + 'pass_read_limit': int(4e9) + } +) +result = query.collect(engine=engine) +``` +Note that passing `chunked: False` disables chunked reading entirely, and thus `chunk_read_limit` and `pass_read_limit` will have no effect. + +## Disabling CUDA Managed Memory + +By default, `cudf_polars` uses [CUDA managed memory](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#unified-memory-introduction) with RMM's pool allocator. On systems that don't support managed memory, a non-managed asynchronous pool +allocator is used. +Managed memory can be turned off by setting `POLARS_GPU_ENABLE_CUDA_MANAGED_MEMORY` to `0`. System requirements for managed memory can be found [here]( +https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#system-requirements-for-unified-memory). diff --git a/docs/cudf/source/cudf_polars/index.rst b/docs/cudf/source/cudf_polars/index.rst index 0a3a0d86b2c..a9b4bb2dff2 100644 --- a/docs/cudf/source/cudf_polars/index.rst +++ b/docs/cudf/source/cudf_polars/index.rst @@ -9,6 +9,12 @@ and run on the CPU. Benchmark --------- + +.. note:: + The following benchmarks were performed with the `POLARS_GPU_ENABLE_CUDA_MANAGED_MEMORY` environment variable set to `"0"`. + Using managed memory (the default) imposes a performance cost in order to avoid out of memory errors. + Peak performance can still be attained by setting the environment variable to `"0"`, disabling managed memory. + We reproduced the `Polars Decision Support (PDS) `__ benchmark to compare the Polars GPU engine with the default CPU settings across several dataset sizes. Here are the results: ..
figure:: ../_static/pds_benchmark_polars.png @@ -39,3 +45,9 @@ Launch on Google Colab :target: https://colab.research.google.com/github/rapidsai-community/showcase/blob/main/accelerated_data_processing_examples/polars_gpu_engine_demo.ipynb Try out the GPU engine for Polars in a free GPU notebook environment. Sign in with your Google account and `launch the demo on Colab `__. + +.. toctree:: + :maxdepth: 1 + :caption: Engine Config Options: + + engine_options diff --git a/docs/cudf/source/developer_guide/contributing_guide.md b/docs/cudf/source/developer_guide/contributing_guide.md index 6fce268f309..f4d2c7319b3 100644 --- a/docs/cudf/source/developer_guide/contributing_guide.md +++ b/docs/cudf/source/developer_guide/contributing_guide.md @@ -15,8 +15,7 @@ Developers are strongly recommended to set up `pre-commit` prior to any developm The `.pre-commit-config.yaml` file at the root of the repo is the primary source of truth for linting. Specifically, cuDF uses the following tools: -- [`ruff`](https://beta.ruff.rs/) checks for general code formatting compliance. -- [`isort`](https://pycqa.github.io/isort/) ensures imports are sorted consistently. +- [`ruff`](https://docs.astral.sh/ruff/) checks for general code formatting compliance. - [`mypy`](http://mypy-lang.org/) performs static type checking. In conjunction with [type hints](https://docs.python.org/3/library/typing.html), `mypy` can help catch various bugs that are otherwise difficult to find. diff --git a/docs/cudf/source/developer_guide/cudf_pandas.md b/docs/cudf/source/developer_guide/cudf_pandas.md index a8a6d81d6fb..b653b786129 100644 --- a/docs/cudf/source/developer_guide/cudf_pandas.md +++ b/docs/cudf/source/developer_guide/cudf_pandas.md @@ -11,7 +11,8 @@ In the rest of this document, to maintain a concrete pair of libraries in mind, For example, future support could include pairs such as CuPy (as the "fast" library) and NumPy (as the "slow" library). ```{note} -We currently do not wrap the entire NumPy library because it exposes a C API. But we do wrap NumPy's `numpy.ndarray` and CuPy's `cupy.ndarray` in a proxy type. +1. We currently do not wrap the entire NumPy library because it exposes a C API. But we do wrap NumPy's `numpy.ndarray` and CuPy's `cupy.ndarray` in a proxy type. +2. A `custom_iter` method is defined to always use the slow object's `iter` method. This avoids moving the object to the GPU, where iteration would raise an error, only to move it back to the CPU to complete the iteration. ``` ### Types: @@ -136,3 +137,21 @@ Arrays are not almost equal to 7 decimals ACTUAL: 1.0 DESIRED: 2.0. ``` + +Setting the environment variable `CUDF_PANDAS_FAIL_ON_FALLBACK` causes `cudf.pandas` to fail when falling back from cuDF to Pandas. +For example: +```python +import cudf.pandas +cudf.pandas.install() +import pandas as pd + +df = pd.DataFrame({ + 'complex_col': [1 + 2j, 3 + 4j, 5 + 6j] +}) + +print(df) +``` +``` +ProxyFallbackError: The operation failed with cuDF, the reason was : Series with Complex128DType is not supported. +``` diff --git a/docs/cudf/source/developer_guide/pylibcudf.md b/docs/cudf/source/developer_guide/pylibcudf.md index 39840e72e21..1ee828e7c4e 100644 --- a/docs/cudf/source/developer_guide/pylibcudf.md +++ b/docs/cudf/source/developer_guide/pylibcudf.md @@ -15,7 +15,8 @@ To satisfy the goals of pylibcudf, we impose the following set of design princip - All typing in code should be written using Cython syntax, not PEP 484 Python typing syntax.
Not only does this ensure compatibility with Cython < 3, but even with Cython 3 PEP 484 support remains incomplete as of this writing. - All cudf code should interact only with pylibcudf, never with libcudf directly. This is not currently the case, but is the direction that the library is moving towards. - Ideally, pylibcudf should depend on no RAPIDS component other than rmm, and should in general have minimal runtime dependencies. - +- Type stubs are provided and maintained manually. When adding new + functionality, ensure that the matching type stub is appropriately updated. ## Relationship to libcudf @@ -249,3 +250,73 @@ In the event that libcudf provides multiple overloads for the same function with and set arguments not shared between overloads to `None`. If a user tries to pass in an unsupported argument for a specific overload type, you should raise `ValueError`. Finally, consider making a libcudf issue if you think this inconsistency can be addressed on the libcudf side. + +### Type stubs + +Since static type checkers like `mypy` and `pyright` cannot parse +Cython code, we provide type stubs for the pylibcudf package. These +are currently maintained manually, alongside the matching pylibcudf +files. + +Every `pyx` file should have a matching `pyi` file that provides the +type stubs. Most functions can be exposed straightforwardly. Some +guiding principles: + +- For typed integer arguments in libcudf, use `int` as a type + annotation. +- For functions which are annotated as a `list` in Cython, but the + function body does more detailed checking, try to encode the + detailed information in the type. +- For Cython fused types there are two options: + 1. If the fused type appears only once in the function signature, + use a `Union` type; + 2. If the fused type appears more than once (or as both an input + and output type), use a `TypeVar` with + the variants in the fused type provided as constraints. + + +As an example, `pylibcudf.copying.split` is typed in Cython as: + +```cython +ctypedef fused ColumnOrTable: + Table + Column + +cpdef list split(ColumnOrTable input, list splits): ... +``` + +Here we only have a single use of the fused type, and the `list` arguments do not specify their values. If we provide a `Column` as input, we receive a `list[Column]` as output, and if we provide a `Table` we receive `list[Table]` as output. + +In the type stub, we can encode this with a `TypeVar`; we can also provide typing for the `splits` argument to indicate that the split values must be integers: + +```python +ColumnOrTable = TypeVar("ColumnOrTable", Column, Table) + +def split(input: ColumnOrTable, splits: list[int]) -> list[ColumnOrTable]: ... +``` + +Conversely, `pylibcudf.copying.scatter` uses a fused type only once in its input: + +```cython +ctypedef fused TableOrListOfScalars: + Table + list + +cpdef Table scatter( + TableOrListOfScalars source, Column scatter_map, Table target +) +``` + +In this case, the type stub can use a normal union: + +```python +def scatter( + source: Table | list[Scalar], scatter_map: Column, target: Table +) -> Table: ...
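+
+# The fused type appears only once in this signature, so a plain union
+# suffices; a TypeVar (as in `split` above) is only needed when several
+# annotations must vary together.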
+``` diff --git a/docs/cudf/source/developer_guide/testing.md b/docs/cudf/source/developer_guide/testing.md index f12f809d5db..22cc1b5b8de 100644 --- a/docs/cudf/source/developer_guide/testing.md +++ b/docs/cudf/source/developer_guide/testing.md @@ -7,6 +7,23 @@ specifically the [`pytest-cov`](https://github.com/pytest-dev/pytest-cov) plugin Code coverage reports are uploaded to [Codecov](https://app.codecov.io/gh/rapidsai/cudf). Each PR also indicates whether it increases or decreases test coverage. +### Configuring pytest + +Pytest will accept configuration in [multiple different +files](https://docs.pytest.org/en/stable/reference/customize.html), +with a specified discovery and precedence order. Note in particular +that there is no automatic "include" mechanism: as soon as a matching +configuration file is found, discovery stops. + +So that all tool configuration lives in the same +place, we prefer `pyproject.toml`-based configuration. Test configuration +for a given package should live in that package's `pyproject.toml` +file. + +Where tests do not naturally belong to a project, for example the +`cudf.pandas` integration tests and the cuDF benchmarks, use a +`pytest.ini` file as close to the tests as possible. + ## Test organization How tests are organized depends on which of the following two groups they fall into: diff --git a/docs/cudf/source/user_guide/10min.ipynb b/docs/cudf/source/user_guide/10min.ipynb index 95f5f9734dd..46221b6015b 100644 --- a/docs/cudf/source/user_guide/10min.ipynb +++ b/docs/cudf/source/user_guide/10min.ipynb @@ -38,10 +38,10 @@ "import os\n", "\n", "import cupy as cp\n", + "import dask_cudf\n", "import pandas as pd\n", "\n", "import cudf\n", - "import dask_cudf\n", "\n", "cp.random.seed(12)\n", "\n", diff --git a/docs/cudf/source/user_guide/api_docs/index.rst b/docs/cudf/source/user_guide/api_docs/index.rst index d05501f4a4a..f711327f9ed 100644 --- a/docs/cudf/source/user_guide/api_docs/index.rst +++ b/docs/cudf/source/user_guide/api_docs/index.rst @@ -19,7 +19,7 @@ This page provides a list of all publicly accessible modules, methods and classe general_utilities window io - subword_tokenize + tokenize_vocabulary string_handling list_handling struct_handling diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/hashing.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/hashing.rst new file mode 100644 index 00000000000..6bd1fbd821b --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/hashing.rst @@ -0,0 +1,6 @@ +======= +hashing +======= + +.. automodule:: pylibcudf.hashing + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst index e21536e2e97..997ece6d29c 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst @@ -19,8 +19,10 @@ This page provides API documentation for pylibcudf. filling gpumemoryview groupby + hashing interop join + json labeling lists merge @@ -49,3 +51,4 @@ This page provides API documentation for pylibcudf.
io/index.rst strings/index.rst + nvtext/index.rst diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst index 53638f071cc..1c1c8040972 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst @@ -19,4 +19,6 @@ I/O Functions csv json parquet + parquet_metadata + text timezone diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/parquet_metadata.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/parquet_metadata.rst new file mode 100644 index 00000000000..fce964f9714 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/parquet_metadata.rst @@ -0,0 +1,6 @@ +================ +Parquet Metadata +================ + +.. automodule:: pylibcudf.io.parquet_metadata + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/text.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/text.rst new file mode 100644 index 00000000000..327ca043f36 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/text.rst @@ -0,0 +1,6 @@ +==== +text +==== + +.. automodule:: pylibcudf.io.text + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/json.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/json.rst new file mode 100644 index 00000000000..bb38d179a57 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/json.rst @@ -0,0 +1,6 @@ +==== +json +==== + +.. automodule:: pylibcudf.json + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/byte_pair_encode.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/byte_pair_encode.rst new file mode 100644 index 00000000000..908fcc4fde6 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/byte_pair_encode.rst @@ -0,0 +1,6 @@ +================ +byte_pair_encode +================ + +.. automodule:: pylibcudf.nvtext.byte_pair_encode + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/edit_distance.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/edit_distance.rst new file mode 100644 index 00000000000..abb45e426a8 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/edit_distance.rst @@ -0,0 +1,6 @@ +============= +edit_distance +============= + +.. automodule:: pylibcudf.nvtext.edit_distance + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/generate_ngrams.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/generate_ngrams.rst new file mode 100644 index 00000000000..d68199271bd --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/generate_ngrams.rst @@ -0,0 +1,6 @@ +=============== +generate_ngrams +=============== + +.. automodule:: pylibcudf.nvtext.generate_ngrams + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst new file mode 100644 index 00000000000..9ba47fd8d70 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst @@ -0,0 +1,17 @@ +nvtext +====== + +.. 
toctree:: + :maxdepth: 1 + + edit_distance + generate_ngrams + jaccard + minhash + byte_pair_encode + ngrams_tokenize + normalize + replace + stemmer + subword_tokenize + tokenize diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/jaccard.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/jaccard.rst new file mode 100644 index 00000000000..ea59657c25e --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/jaccard.rst @@ -0,0 +1,6 @@ +======= +jaccard +======= + +.. automodule:: pylibcudf.nvtext.jaccard + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/minhash.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/minhash.rst new file mode 100644 index 00000000000..b8ec02fca35 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/minhash.rst @@ -0,0 +1,6 @@ +======= +minhash +======= + +.. automodule:: pylibcudf.nvtext.minhash + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/ngrams_tokenize.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/ngrams_tokenize.rst new file mode 100644 index 00000000000..ce6db76f889 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/ngrams_tokenize.rst @@ -0,0 +1,6 @@ +=============== +ngrams_tokenize +=============== + +.. automodule:: pylibcudf.nvtext.ngrams_tokenize + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/normalize.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/normalize.rst new file mode 100644 index 00000000000..e496f6a45da --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/normalize.rst @@ -0,0 +1,6 @@ +========= +normalize +========= + +.. automodule:: pylibcudf.nvtext.normalize + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/replace.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/replace.rst new file mode 100644 index 00000000000..04cee972dc1 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/replace.rst @@ -0,0 +1,6 @@ +======= +replace +======= + +.. automodule:: pylibcudf.nvtext.replace + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/stemmer.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/stemmer.rst new file mode 100644 index 00000000000..b407ff8451a --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/stemmer.rst @@ -0,0 +1,6 @@ +======= +stemmer +======= + +.. automodule:: pylibcudf.nvtext.stemmer + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/subword_tokenize.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/subword_tokenize.rst new file mode 100644 index 00000000000..818714bec6a --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/subword_tokenize.rst @@ -0,0 +1,6 @@ +================ +subword_tokenize +================ + +.. automodule:: pylibcudf.nvtext.subword_tokenize + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/tokenize.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/tokenize.rst new file mode 100644 index 00000000000..85c5a27b09d --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/tokenize.rst @@ -0,0 +1,6 @@ +======== +tokenize +======== + +.. 
automodule:: pylibcudf.nvtext.tokenize + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/combine.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/combine.rst new file mode 100644 index 00000000000..38a46641200 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/combine.rst @@ -0,0 +1,6 @@ +======= +combine +======= + +.. automodule:: pylibcudf.strings.combine + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_booleans.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_booleans.rst new file mode 100644 index 00000000000..de62221456f --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_booleans.rst @@ -0,0 +1,6 @@ +================ +convert_booleans +================ + +.. automodule:: pylibcudf.strings.convert.convert_booleans + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_datetime.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_datetime.rst new file mode 100644 index 00000000000..fc5d5204ab3 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_datetime.rst @@ -0,0 +1,6 @@ +================ +convert_datetime +================ + +.. automodule:: pylibcudf.strings.convert.convert_datetime + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_durations.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_durations.rst new file mode 100644 index 00000000000..e80b0c15a61 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_durations.rst @@ -0,0 +1,6 @@ +================= +convert_durations +================= + +.. automodule:: pylibcudf.strings.convert.convert_durations + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_fixed_point.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_fixed_point.rst new file mode 100644 index 00000000000..16d971a6849 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_fixed_point.rst @@ -0,0 +1,6 @@ +=================== +convert_fixed_point +=================== + +.. automodule:: pylibcudf.strings.convert.convert_fixed_point + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_floats.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_floats.rst new file mode 100644 index 00000000000..9ae4004cea9 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_floats.rst @@ -0,0 +1,6 @@ +============== +convert_floats +============== + +.. automodule:: pylibcudf.strings.convert.convert_floats + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_integers.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_integers.rst new file mode 100644 index 00000000000..71d146c0379 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_integers.rst @@ -0,0 +1,6 @@ +================ +convert_integers +================ + +.. 
automodule:: pylibcudf.strings.convert.convert_integers + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_ipv4.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_ipv4.rst new file mode 100644 index 00000000000..4ead8677a69 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_ipv4.rst @@ -0,0 +1,6 @@ +============ +convert_ipv4 +============ + +.. automodule:: pylibcudf.strings.convert.convert_ipv4 + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_lists.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_lists.rst new file mode 100644 index 00000000000..33a719a42e1 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_lists.rst @@ -0,0 +1,6 @@ +============= +convert_lists +============= + +.. automodule:: pylibcudf.strings.convert.convert_lists + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_urls.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_urls.rst new file mode 100644 index 00000000000..f20d95e0cdd --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_urls.rst @@ -0,0 +1,6 @@ +============ +convert_urls +============ + +.. automodule:: pylibcudf.strings.convert.convert_urls + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/index.rst new file mode 100644 index 00000000000..3d07c1271b4 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/index.rst @@ -0,0 +1,15 @@ +convert +======= + +.. toctree:: + :maxdepth: 1 + + convert_booleans + convert_datetime + convert_durations + convert_fixed_point + convert_floats + convert_integers + convert_ipv4 + convert_lists + convert_urls diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/find_multiple.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/find_multiple.rst new file mode 100644 index 00000000000..8e86b33b1a0 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/find_multiple.rst @@ -0,0 +1,6 @@ +============= +find_multiple +============= + +.. automodule:: pylibcudf.strings.find_multiple + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/findall.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/findall.rst index 9850ee10098..699e38ebbe5 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/findall.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/findall.rst @@ -1,6 +1,6 @@ -==== -find -==== +======= +findall +======= .. automodule:: pylibcudf.strings.findall :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst index 9b1a6b72a88..ae670b5bd8a 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst @@ -6,13 +6,26 @@ strings capitalize char_types + combine contains extract find + find_multiple findall + padding regex_flags regex_program repeat + replace_re replace + side_type slice + split strip + wrap + +.. 
toctree:: + :maxdepth: 2 + :caption: Subpackages + + convert/index.rst diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/padding.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/padding.rst new file mode 100644 index 00000000000..5b417024fd5 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/padding.rst @@ -0,0 +1,6 @@ +======= +padding +======= + +.. automodule:: pylibcudf.strings.padding + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/replace_re.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/replace_re.rst new file mode 100644 index 00000000000..5bf715ef657 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/replace_re.rst @@ -0,0 +1,6 @@ +========== +replace_re +========== + +.. automodule:: pylibcudf.strings.replace_re + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/side_type.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/side_type.rst new file mode 100644 index 00000000000..d5aef9c4f75 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/side_type.rst @@ -0,0 +1,6 @@ +========= +side_type +========= + +.. automodule:: pylibcudf.strings.side_type + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/split.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/split.rst new file mode 100644 index 00000000000..cba96e86f45 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/split.rst @@ -0,0 +1,6 @@ +===== +split +===== + +.. automodule:: pylibcudf.strings.split + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/wrap.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/wrap.rst new file mode 100644 index 00000000000..bd825f78568 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/wrap.rst @@ -0,0 +1,6 @@ +==== +wrap +==== + +.. automodule:: pylibcudf.strings.wrap + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/table.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/table.rst index e39ca18a12b..4de9bced86f 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/table.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/table.rst @@ -1,5 +1,5 @@ ===== -Table +table ===== .. automodule:: pylibcudf.table diff --git a/docs/cudf/source/user_guide/api_docs/string_handling.rst b/docs/cudf/source/user_guide/api_docs/string_handling.rst index ab0f085e1a6..91d3e33960b 100644 --- a/docs/cudf/source/user_guide/api_docs/string_handling.rst +++ b/docs/cudf/source/user_guide/api_docs/string_handling.rst @@ -60,6 +60,7 @@ strings and apply several methods to it. These can be accessed like isupper istimestamp istitle + jaccard_index join len like @@ -67,6 +68,7 @@ strings and apply several methods to it. These can be accessed like lower lstrip match + minhash ngrams ngrams_tokenize normalize_characters @@ -90,7 +92,6 @@ strings and apply several methods to it. These can be accessed like slice_from slice_replace split - rsplit startswith strip swapcase diff --git a/docs/cudf/source/user_guide/api_docs/subword_tokenize.rst b/docs/cudf/source/user_guide/api_docs/subword_tokenize.rst deleted file mode 100644 index cd240fe4db4..00000000000 --- a/docs/cudf/source/user_guide/api_docs/subword_tokenize.rst +++ /dev/null @@ -1,12 +0,0 @@ -================ -SubwordTokenizer -================ -.. currentmodule:: cudf.core.subword_tokenizer - -Constructor -~~~~~~~~~~~ -.. 
autosummary:: - :toctree: api/ - - SubwordTokenizer - SubwordTokenizer.__call__ diff --git a/docs/cudf/source/user_guide/api_docs/tokenize_vocabulary.rst b/docs/cudf/source/user_guide/api_docs/tokenize_vocabulary.rst new file mode 100644 index 00000000000..1b5c965f3c9 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/tokenize_vocabulary.rst @@ -0,0 +1,12 @@ +================== +TokenizeVocabulary +================== +.. currentmodule:: cudf.core.tokenize_vocabulary + +Constructor +~~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + TokenizeVocabulary + TokenizeVocabulary.tokenize diff --git a/docs/cudf/source/user_guide/guide-to-udfs.ipynb b/docs/cudf/source/user_guide/guide-to-udfs.ipynb index 75eafcc5387..abfe5a1b178 100644 --- a/docs/cudf/source/user_guide/guide-to-udfs.ipynb +++ b/docs/cudf/source/user_guide/guide-to-udfs.ipynb @@ -101,6 +101,8 @@ "outputs": [], "source": [ "# define a scalar function\n", + "\n", + "\n", "def f(x):\n", " return x + 1" ] @@ -247,6 +249,8 @@ "outputs": [], "source": [ "# redefine the same function from above\n", + "\n", + "\n", "def f(x):\n", " return x + 1" ] @@ -1622,6 +1626,8 @@ "outputs": [], "source": [ "# a user-defined aggregation function.\n", + "\n", + "\n", "def udaf(df):\n", " return df[\"b\"].max() - df[\"b\"].min() / 2" ] diff --git a/docs/cudf/source/user_guide/io/io.md b/docs/cudf/source/user_guide/io/io.md index 97b961b455b..7d863d890e2 100644 --- a/docs/cudf/source/user_guide/io/io.md +++ b/docs/cudf/source/user_guide/io/io.md @@ -91,16 +91,28 @@ SDK is available for download [here](https://developer.nvidia.com/gpudirect-storage). GDS is also included in CUDA Toolkit 11.4 and higher. -Use of GPUDirect Storage in cuDF is enabled by default, but can be -disabled through the environment variable `LIBCUDF_CUFILE_POLICY`. +Use of GPUDirect Storage in cuDF is disabled by default, but can be +enabled through the environment variable `LIBCUDF_CUFILE_POLICY`. This variable also controls the GDS compatibility mode. There are four valid values for the environment variable: -- "GDS": Enable GDS use; GDS compatibility mode is *off*. -- "ALWAYS": Enable GDS use; GDS compatibility mode is *on*. -- "KVIKIO": Enable GDS through [KvikIO](https://github.com/rapidsai/kvikio). -- "OFF": Completely disable GDS use. +- "GDS": Enable GDS use. If the cuFile library cannot be properly loaded, +fall back to the GDS compatibility mode. +- "ALWAYS": Enable GDS use. If the cuFile library cannot be properly loaded, +throw an exception. +- "KVIKIO": Enable GDS compatibility mode through [KvikIO](https://github.com/rapidsai/kvikio). +Note that KvikIO also provides the environment variable `KVIKIO_COMPAT_MODE` for GDS +control that may alter the effect of the "KVIKIO" option in cuDF: + - By default, `KVIKIO_COMPAT_MODE` is unset. In this case, cuDF enforces + the GDS compatibility mode, and the system configuration check for GDS I/O + is never performed. + - If `KVIKIO_COMPAT_MODE=ON`, behavior is the same as in the case above. + - If `KVIKIO_COMPAT_MODE=OFF`, KvikIO enforces GDS I/O without the system + configuration check, and errors out if GDS requirements are not met. The + one exception: if the system does not support opening files with the + `O_DIRECT` flag, the GDS compatibility mode is used. +- "OFF": Completely disable GDS and KvikIO use. If no value is set, behavior will be the same as the "KVIKIO" option. @@ -182,3 +194,13 @@ If no value is set, behavior will be the same as the "STABLE" option.
+-----------------------+--------+--------+--------------+--------------+---------+--------+--------------+--------------+--------+ ``` + +## Low Memory Considerations + +By default, cuDF's parquet and json readers will try to read the entire file in one pass. This can cause problems when dealing with large datasets or when running workloads on GPUs with limited memory. + +To better support low memory systems, cuDF provides a "low-memory" reader for parquet and json files. This low memory reader processes data in chunks, leading to lower peak memory usage due to the smaller size of intermediate allocations. + +To read a parquet or json file in low memory mode, there are [cuDF options](https://docs.rapids.ai/api/cudf/nightly/user_guide/api_docs/options/#api-options) that must be set globally prior to calling the reader. To set those options, call: +- `cudf.set_option("io.parquet.low_memory", True)` for parquet files, or +- `cudf.set_option("io.json.low_memory", True)` for json files. diff --git a/java/ci/Dockerfile.rocky b/java/ci/Dockerfile.rocky index 152af22f7e4..9f3305278cb 100644 --- a/java/ci/Dockerfile.rocky +++ b/java/ci/Dockerfile.rocky @@ -33,7 +33,7 @@ RUN dnf --enablerepo=powertools install -y scl-utils gcc-toolset-${TOOLSET_VERS RUN mkdir /usr/local/rapids /rapids && chmod 777 /usr/local/rapids /rapids # 3.22.3+: CUDA architecture 'native' support + flexible CMAKE__*_LAUNCHER for ccache -ARG CMAKE_VERSION=3.26.4 +ARG CMAKE_VERSION=3.28.6 # default x86_64 from x86 build, aarch64 cmake for arm build ARG CMAKE_ARCH=x86_64 RUN cd /usr/local && wget --quiet https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-${CMAKE_ARCH}.tar.gz && \ diff --git a/java/ci/README.md b/java/ci/README.md index 5aa2da9a1e5..95b93698cae 100644 --- a/java/ci/README.md +++ b/java/ci/README.md @@ -34,7 +34,7 @@ nvidia-docker run -it cudf-build:11.8.0-devel-rocky8 bash You can download the cuDF repo in the docker container or you can mount it into the container. Here I choose to download again in the container. ```bash -git clone --recursive https://github.com/rapidsai/cudf.git -b branch-24.10 +git clone --recursive https://github.com/rapidsai/cudf.git -b branch-24.12 ``` ### Build cuDF jar with devtoolset @@ -47,4 +47,4 @@ scl enable gcc-toolset-11 "java/ci/build-in-docker.sh" ### The output -You can find the cuDF jar in java/target/ like cudf-24.10.1-SNAPSHOT-cuda11.jar. +You can find the cuDF jar in java/target/ like cudf-24.12.0-SNAPSHOT-cuda11.jar. diff --git a/java/ci/build-in-docker.sh b/java/ci/build-in-docker.sh index 5a429bdc739..b85c215d7d1 100755 --- a/java/ci/build-in-docker.sh +++ b/java/ci/build-in-docker.sh @@ -64,7 +64,8 @@ cmake .. -G"${CMAKE_GENERATOR}" \ -DBUILD_TESTS=$BUILD_CPP_TESTS \ -DCUDF_USE_PER_THREAD_DEFAULT_STREAM=$ENABLE_PTDS \ -DRMM_LOGGING_LEVEL=$RMM_LOGGING_LEVEL \ - -DBUILD_SHARED_LIBS=OFF + -DBUILD_SHARED_LIBS=OFF \ + -DCUDF_KVIKIO_REMOTE_IO=OFF if [[ -z "${PARALLEL_LEVEL}" ]]; then cmake --build . 
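A minimal sketch of the low-memory reader options described in the "Low Memory Considerations" section above; "data.parquet" and "data.jsonl" are hypothetical paths:

```python
import cudf

# Opt in to the chunked, lower-peak-memory readers before calling them.
cudf.set_option("io.parquet.low_memory", True)
cudf.set_option("io.json.low_memory", True)

df = cudf.read_parquet("data.parquet")          # reads in smaller internal chunks
jdf = cudf.read_json("data.jsonl", lines=True)  # likewise for JSON Lines input
```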
diff --git a/java/pom.xml b/java/pom.xml index bcbf0af8564..450cfbdbc84 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -21,7 +21,7 @@ ai.rapids cudf - 24.10.1-SNAPSHOT + 24.12.0-SNAPSHOT cudfjni diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java index 6bd4e06c47e..b9239af0c1b 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnView.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java @@ -917,6 +917,16 @@ public final ColumnVector mergeAndSetValidity(BinaryOp mergeOp, ColumnView... co // DATE/TIME ///////////////////////////////////////////////////////////////////////////// + /** + * Extract a particular date time component from a timestamp. + * @param component what should be extracted + * @return a column with the extracted information in it. + */ + public final ColumnVector extractDateTimeComponent(DateTimeComponent component) { + assert type.isTimestampType(); + return new ColumnVector(extractDateTimeComponent(getNativeView(), component.getNativeId())); + } + /** * Get year from a timestamp. *

@@ -925,8 +935,7 @@ public final ColumnVector mergeAndSetValidity(BinaryOp mergeOp, ColumnView... co * @return - A new INT16 vector allocated on the GPU. */ public final ColumnVector year() { - assert type.isTimestampType(); - return new ColumnVector(year(getNativeView())); + return extractDateTimeComponent(DateTimeComponent.YEAR); } /** @@ -937,8 +946,7 @@ public final ColumnVector year() { * @return - A new INT16 vector allocated on the GPU. */ public final ColumnVector month() { - assert type.isTimestampType(); - return new ColumnVector(month(getNativeView())); + return extractDateTimeComponent(DateTimeComponent.MONTH); } /** @@ -949,8 +957,7 @@ public final ColumnVector month() { * @return - A new INT16 vector allocated on the GPU. */ public final ColumnVector day() { - assert type.isTimestampType(); - return new ColumnVector(day(getNativeView())); + return extractDateTimeComponent(DateTimeComponent.DAY); } /** @@ -961,8 +968,7 @@ public final ColumnVector day() { * @return - A new INT16 vector allocated on the GPU. */ public final ColumnVector hour() { - assert type.hasTimeResolution(); - return new ColumnVector(hour(getNativeView())); + return extractDateTimeComponent(DateTimeComponent.HOUR); } /** @@ -973,8 +979,7 @@ public final ColumnVector hour() { * @return - A new INT16 vector allocated on the GPU. */ public final ColumnVector minute() { - assert type.hasTimeResolution(); - return new ColumnVector(minute(getNativeView())); + return extractDateTimeComponent(DateTimeComponent.MINUTE); } /** @@ -985,8 +990,7 @@ public final ColumnVector minute() { * @return A new INT16 vector allocated on the GPU. */ public final ColumnVector second() { - assert type.hasTimeResolution(); - return new ColumnVector(second(getNativeView())); + return extractDateTimeComponent(DateTimeComponent.SECOND); } /** @@ -997,8 +1001,7 @@ public final ColumnVector second() { * @return A new INT16 vector allocated on the GPU. Monday=1, ..., Sunday=7 */ public final ColumnVector weekDay() { - assert type.isTimestampType(); - return new ColumnVector(weekDay(getNativeView())); + return extractDateTimeComponent(DateTimeComponent.WEEKDAY); } /** @@ -1045,6 +1048,16 @@ public final ColumnVector addCalendricalMonths(ColumnView months) { return new ColumnVector(addCalendricalMonths(getNativeView(), months.getNativeView())); } + /** + * Add the specified number of months to the timestamp. + * @param months must be an INT16 scalar indicating the number of months to add. A negative number + * of months works too. + * @return the updated timestamp + */ + public final ColumnVector addCalendricalMonths(Scalar months) { + return new ColumnVector(addScalarCalendricalMonths(getNativeView(), months.getScalarHandle())); + } + /** * Check to see if the year for this timestamp is a leap year or not. * @return BOOL8 vector of results @@ -1053,6 +1066,45 @@ public final ColumnVector isLeapYear() { return new ColumnVector(isLeapYear(getNativeView())); } + + /** + * Extract the number of days in the month. + * @return INT16 column of the number of days in the corresponding month + */ + public final ColumnVector daysInMonth() { + assert type.isTimestampType(); + return new ColumnVector(daysInMonth(getNativeView())); + } + + /** + * Round the timestamp up to the given frequency keeping the type the same. + * @param freq what part of the timestamp to round. + * @return a timestamp with the same type, but rounded up.
+ */ + public final ColumnVector dateTimeCeil(DateTimeRoundingFrequency freq) { + assert type.isTimestampType(); + return new ColumnVector(dateTimeCeil(getNativeView(), freq.getNativeId())); + } + + /** + * Round the timestamp down to the given frequency keeping the type the same. + * @param freq what part of the timestamp to round. + * @return a timestamp with the same type, but rounded down. + */ + public final ColumnVector dateTimeFloor(DateTimeRoundingFrequency freq) { + assert type.isTimestampType(); + return new ColumnVector(dateTimeFloor(getNativeView(), freq.getNativeId())); + } + + /** + * Round the timestamp (half up) to the given frequency keeping the type the same. + * @param freq what part of the timestamp to round. + * @return a timestamp with the same type, but rounded (half up). + */ + public final ColumnVector dateTimeRound(DateTimeRoundingFrequency freq) { + assert type.isTimestampType(); + return new ColumnVector(dateTimeRound(getNativeView(), freq.getNativeId())); + } + /** * Rounds all the values in a column to the specified number of decimal places. * @@ -3332,6 +3384,36 @@ public final ColumnVector stringContains(Scalar compString) { return new ColumnVector(stringContains(getNativeView(), compString.getScalarHandle())); } + /** + * Searches for the given target strings within each string in the provided column. + * + * Each column in the result table corresponds to the result for the target string at the same + * ordinal, i.e. the 0th column is the BOOL8 result for the 0th target string, the 1st for the 1st, + * etc. + * + * If the target is not found for a string, false is returned for that entry in the output column. + * If the target is an empty string, true is returned for all non-null entries in the output column. + * + * Any null input strings return corresponding null entries in the output columns. + * + * input = ["a", "b", "c"] + * targets = ["a", "c"] + * output is a table with two boolean columns: + * column 0: [true, false, false] + * column 1: [false, false, true] + * + * @param targets UTF-8 encoded strings to search for in each string in `input` + * @return BOOL8 columns + */ + public final ColumnVector[] stringContains(ColumnView targets) { + assert type.equals(DType.STRING) : "column type must be a String"; + assert targets.getType().equals(DType.STRING) : "targets type must be a string"; + assert targets.getNullCount() == 0 : "targets must not contain nulls"; + assert targets.getRowCount() > 0 : "targets must not be empty"; + long[] resultPointers = stringContainsMulti(getNativeView(), targets.getNativeView()); + return Arrays.stream(resultPointers).mapToObj(ColumnVector::new).toArray(ColumnVector[]::new); + } + /** * Replaces values less than `lo` in `input` with `lo`, * and values greater than `hi` with `hi`. @@ -4437,6 +4519,13 @@ private static native long stringReplaceWithBackrefs(long columnView, String pat */ private static native long stringContains(long cudfViewHandle, long compString) throws CudfException; + /** + * Native method for searching for the given target strings within each string in the provided column. + * @param cudfViewHandle native handle of the cudf::column_view being operated on. + * @param targetViewHandle handle of the column view containing the strings being searched for. + */ + private static native long[] stringContainsMulti(long cudfViewHandle, long targetViewHandle) throws CudfException; + /** * Native method for extracting results from a regex program pattern. Returns a table handle.
* @@ -4684,19 +4773,7 @@ private static native long segmentedGather(long sourceColumnHandle, long gatherM private static native long unaryOperation(long viewHandle, int op); - private static native long year(long viewHandle) throws CudfException; - - private static native long month(long viewHandle) throws CudfException; - - private static native long day(long viewHandle) throws CudfException; - - private static native long hour(long viewHandle) throws CudfException; - - private static native long minute(long viewHandle) throws CudfException; - - private static native long second(long viewHandle) throws CudfException; - - private static native long weekDay(long viewHandle) throws CudfException; + private static native long extractDateTimeComponent(long viewHandle, int component); private static native long lastDayOfMonth(long viewHandle) throws CudfException; @@ -4706,8 +4783,18 @@ private static native long segmentedGather(long sourceColumnHandle, long gatherM private static native long addCalendricalMonths(long tsViewHandle, long monthsViewHandle); + private static native long addScalarCalendricalMonths(long tsViewHandle, long scalarHandle); + private static native long isLeapYear(long viewHandle) throws CudfException; + private static native long daysInMonth(long viewHandle) throws CudfException; + + private static native long dateTimeCeil(long viewHandle, int freq); + + private static native long dateTimeFloor(long viewHandle, int freq); + + private static native long dateTimeRound(long viewHandle, int freq); + private static native boolean containsScalar(long columnViewHaystack, long scalarHandle) throws CudfException; private static native long containsVector(long valuesHandle, long searchSpaceHandle) throws CudfException; diff --git a/java/src/main/java/ai/rapids/cudf/DateTimeComponent.java b/java/src/main/java/ai/rapids/cudf/DateTimeComponent.java new file mode 100644 index 00000000000..0f1618e29fb --- /dev/null +++ b/java/src/main/java/ai/rapids/cudf/DateTimeComponent.java @@ -0,0 +1,74 @@ +/* + * + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package ai.rapids.cudf; + +/** + * Types of datetime components that may be extracted. 
+ */ +public enum DateTimeComponent { + /** + * year as an INT16 + */ + YEAR(0), + /** + * month of the year, 1 = January, as an INT16 + */ + MONTH(1), + /** + * Day of the month as an INT16 + */ + DAY(2), + /** + * day of the week, Monday=1, ..., Sunday=7 as an INT16 + */ + WEEKDAY(3), + /** + * hour of the day (24-hour clock) as an INT16 + */ + HOUR(4), + /** + * minutes past the hour as an INT16 + */ + MINUTE(5), + /** + * seconds past the minute as an INT16 + */ + SECOND(6), + /** + * milliseconds past the second as an INT16 + */ + MILLISECOND(7), + /** + * microseconds past the millisecond as an INT16 + */ + MICROSECOND(8), + /** + * nanoseconds past the microsecond as an INT16 + */ + NANOSECOND(9); + + final int id; + DateTimeComponent(int id) { + this.id = id; + } + + public int getNativeId() { + return id; + } +} diff --git a/java/src/main/java/ai/rapids/cudf/DateTimeRoundingFrequency.java b/java/src/main/java/ai/rapids/cudf/DateTimeRoundingFrequency.java new file mode 100644 index 00000000000..44a7a2f279d --- /dev/null +++ b/java/src/main/java/ai/rapids/cudf/DateTimeRoundingFrequency.java @@ -0,0 +1,38 @@ +/* + * + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package ai.rapids.cudf; + +public enum DateTimeRoundingFrequency { + DAY(0), + HOUR(1), + MINUTE(2), + SECOND(3), + MILLISECOND(4), + MICROSECOND(5), + NANOSECOND(6); + + final int id; + DateTimeRoundingFrequency(int id) { + this.id = id; + } + + public int getNativeId() { + return id; + } +} diff --git a/java/src/main/java/ai/rapids/cudf/DefaultHostMemoryAllocator.java b/java/src/main/java/ai/rapids/cudf/DefaultHostMemoryAllocator.java index 469c8d16fe4..3178a3ea309 100644 --- a/java/src/main/java/ai/rapids/cudf/DefaultHostMemoryAllocator.java +++ b/java/src/main/java/ai/rapids/cudf/DefaultHostMemoryAllocator.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -32,7 +32,7 @@ public static HostMemoryAllocator get() { /** * Sets a new default host memory allocator implementation. - * @param hostMemoryAllocator + * @param hostMemoryAllocator the new allocator to use.
*/ public static void set(HostMemoryAllocator hostMemoryAllocator) { instance = hostMemoryAllocator; @@ -40,11 +40,17 @@ public static void set(HostMemoryAllocator hostMemoryAllocator) { @Override public HostMemoryBuffer allocate(long bytes, boolean preferPinned) { - return HostMemoryBuffer.allocate(bytes, preferPinned); + if (preferPinned) { + HostMemoryBuffer pinnedBuffer = PinnedMemoryPool.tryAllocate(bytes); + if (pinnedBuffer != null) { + return pinnedBuffer; + } + } + return new HostMemoryBuffer(UnsafeMemoryAccessor.allocate(bytes), bytes); } @Override public HostMemoryBuffer allocate(long bytes) { - return HostMemoryBuffer.allocate(bytes); + return allocate(bytes, HostMemoryBuffer.defaultPreferPinned); } } diff --git a/java/src/main/java/ai/rapids/cudf/DeviceMemoryBufferView.java b/java/src/main/java/ai/rapids/cudf/DeviceMemoryBufferView.java index e48b1cf59e4..86b6b98f2ae 100644 --- a/java/src/main/java/ai/rapids/cudf/DeviceMemoryBufferView.java +++ b/java/src/main/java/ai/rapids/cudf/DeviceMemoryBufferView.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -24,7 +24,7 @@ * that is backing it. */ public class DeviceMemoryBufferView extends BaseDeviceMemoryBuffer { - DeviceMemoryBufferView(long address, long lengthInBytes) { + public DeviceMemoryBufferView(long address, long lengthInBytes) { // Set the cleaner to null so we don't end up releasing anything super(address, lengthInBytes, (MemoryBufferCleaner) null); } diff --git a/java/src/main/java/ai/rapids/cudf/HostMemoryBuffer.java b/java/src/main/java/ai/rapids/cudf/HostMemoryBuffer.java index e4106574a19..7251ce643d4 100644 --- a/java/src/main/java/ai/rapids/cudf/HostMemoryBuffer.java +++ b/java/src/main/java/ai/rapids/cudf/HostMemoryBuffer.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -41,7 +41,7 @@ * Be aware that the off heap memory limits set by Java do not apply to these buffers. */ public class HostMemoryBuffer extends MemoryBuffer { - private static final boolean defaultPreferPinned; + static final boolean defaultPreferPinned; private static final Logger log = LoggerFactory.getLogger(HostMemoryBuffer.class); static { @@ -135,13 +135,7 @@ public boolean isClean() { * @return the newly created buffer */ public static HostMemoryBuffer allocate(long bytes, boolean preferPinned) { - if (preferPinned) { - HostMemoryBuffer pinnedBuffer = PinnedMemoryPool.tryAllocate(bytes); - if (pinnedBuffer != null) { - return pinnedBuffer; - } - } - return new HostMemoryBuffer(UnsafeMemoryAccessor.allocate(bytes), bytes); + return DefaultHostMemoryAllocator.get().allocate(bytes, preferPinned); } /** @@ -155,6 +149,16 @@ public static HostMemoryBuffer allocate(long bytes) { return allocate(bytes, defaultPreferPinned); } + /** + * Allocate host memory bypassing the default allocator. This is intended to only be used by other allocators. + * Pinned memory will not be used for these allocations. 
+ * @param bytes size in bytes to allocate + * @return the newly created buffer + */ + public static HostMemoryBuffer allocateRaw(long bytes) { + return new HostMemoryBuffer(UnsafeMemoryAccessor.allocate(bytes), bytes); + } + /** * Create a host buffer that is memory-mapped to a file. * @param path path to the file to map into host memory @@ -245,8 +249,10 @@ public final void copyFromHostBuffer(long destOffset, HostMemoryBuffer srcData, * @param destOffset offset in bytes in this buffer to start copying to * @param in input stream to copy bytes from * @param byteLength number of bytes to copy + * @throws EOFException If there are not enough bytes in the stream to copy. + * @throws IOException If there is an error reading from the stream. */ - final void copyFromStream(long destOffset, InputStream in, long byteLength) throws IOException { + public final void copyFromStream(long destOffset, InputStream in, long byteLength) throws IOException { addressOutOfBoundsCheck(address + destOffset, byteLength, "copy from stream"); byte[] arrayBuffer = new byte[(int) Math.min(1024 * 128, byteLength)]; long left = byteLength; @@ -254,7 +260,7 @@ final void copyFromStream(long destOffset, InputStream in, long byteLength) thro int amountToCopy = (int) Math.min(arrayBuffer.length, left); int amountRead = in.read(arrayBuffer, 0, amountToCopy); if (amountRead < 0) { - throw new EOFException(); + throw new EOFException("Unexpected end of stream, expected " + left + " more bytes"); } setBytes(destOffset, arrayBuffer, 0, amountRead); destOffset += amountRead; diff --git a/java/src/main/java/ai/rapids/cudf/HostMemoryReservation.java b/java/src/main/java/ai/rapids/cudf/HostMemoryReservation.java deleted file mode 100644 index 72c2e659372..00000000000 --- a/java/src/main/java/ai/rapids/cudf/HostMemoryReservation.java +++ /dev/null @@ -1,32 +0,0 @@ -/* - * - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - */ - -package ai.rapids.cudf; - -/** - * Represents some amount of host memory that has been reserved. A reservation guarantees that one - * or more allocations up to the reserved amount, minus padding for alignment will succeed. A - * reservation typically guarantees the amount can be allocated one, meaning when a buffer - * allocated from a reservation is freed it is not returned to the reservation, but to the pool of - * memory the reservation originally came from. If more memory is allocated from the reservation - * an OutOfMemoryError may be thrown, but it is not guaranteed to happen. - * - * When the reservation is closed any unused reservation will be returned to the pool of memory - * the reservation came from. 
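
Editor's note: with the pinned-pool fallback moved out of `HostMemoryBuffer.allocate` and into `DefaultHostMemoryAllocator`, and the `HostMemoryReservation` interface deleted just below, the allocator seam becomes a single pluggable object. A sketch of a custom allocator under that model follows; `CountingAllocator` is a hypothetical name for illustration, but `DefaultHostMemoryAllocator.set` and `HostMemoryBuffer.allocateRaw` are the hooks this diff provides.

```java
import ai.rapids.cudf.DefaultHostMemoryAllocator;
import ai.rapids.cudf.HostMemoryAllocator;
import ai.rapids.cudf.HostMemoryBuffer;

// Hypothetical allocator: tracks how many bytes were requested and always
// hands out unpinned memory.
class CountingAllocator implements HostMemoryAllocator {
  private long totalRequested = 0;

  @Override
  public synchronized HostMemoryBuffer allocate(long bytes, boolean preferPinned) {
    totalRequested += bytes;
    // allocateRaw bypasses DefaultHostMemoryAllocator.get(), so a custom
    // allocator does not recurse back into itself, and never uses pinned memory.
    return HostMemoryBuffer.allocateRaw(bytes);
  }

  @Override
  public HostMemoryBuffer allocate(long bytes) {
    return allocate(bytes, false);
  }

  public synchronized long totalRequested() {
    return totalRequested;
  }
}
```

Installing it with `DefaultHostMemoryAllocator.set(new CountingAllocator())` reroutes every `HostMemoryBuffer.allocate(...)` call, since that method now delegates to whichever allocator is installed.
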
- */ -public interface HostMemoryReservation extends HostMemoryAllocator, AutoCloseable {} diff --git a/java/src/main/java/ai/rapids/cudf/RegexFlag.java b/java/src/main/java/ai/rapids/cudf/RegexFlag.java index 7ed8e0354c9..68a3856f37d 100644 --- a/java/src/main/java/ai/rapids/cudf/RegexFlag.java +++ b/java/src/main/java/ai/rapids/cudf/RegexFlag.java @@ -28,7 +28,16 @@ public enum RegexFlag { DEFAULT(0), // default MULTILINE(8), // the '^' and '$' honor new-line characters DOTALL(16), // the '.' matching includes new-line characters - ASCII(256); // use only ASCII when matching built-in character classes + ASCII(256), // use only ASCII when matching built-in character classes + /** + * EXT_NEWLINE(512): Extends line delimiters to include the following Unicode characters + * - NEXT_LINE ('\u0085') + * - LINE_SEPARATOR ('\u2028') + * - PARAGRAPH_SEPARATOR ('\u2029') + * - CARRIAGE_RETURN ('\r') + * - NEW_LINE ('\n') + */ + EXT_NEWLINE(512); final int nativeId; // Native id, for use with libcudf. private RegexFlag(int nativeId) { // Only constant values should be used diff --git a/java/src/main/java/ai/rapids/cudf/Schema.java b/java/src/main/java/ai/rapids/cudf/Schema.java index 76b2799aad6..ae8a0e17f9d 100644 --- a/java/src/main/java/ai/rapids/cudf/Schema.java +++ b/java/src/main/java/ai/rapids/cudf/Schema.java @@ -29,26 +29,52 @@ public class Schema { public static final Schema INFERRED = new Schema(); private final DType topLevelType; + + /** + * Default value for precision value, when it is not specified or the column type is not decimal. + */ + private static final int UNKNOWN_PRECISION = -1; + + /** + * Store precision for the top level column, only applicable if the column is a decimal type. + *

+ * This variable is not designed to be used by any libcudf's APIs since libcudf does not support + * precisions for fixed point numbers. + * Instead, it is used only to pass down the precision values from Spark's DecimalType to the + * JNI level, where some JNI functions require these values to perform their operations. + */ + private final int topLevelPrecision; + private final List childNames; private final List childSchemas; private boolean flattened = false; private String[] flattenedNames; private DType[] flattenedTypes; + private int[] flattenedPrecisions; private int[] flattenedCounts; private Schema(DType topLevelType, + int topLevelPrecision, List childNames, List childSchemas) { this.topLevelType = topLevelType; + this.topLevelPrecision = topLevelPrecision; this.childNames = childNames; this.childSchemas = childSchemas; } + private Schema(DType topLevelType, + List childNames, + List childSchemas) { + this(topLevelType, UNKNOWN_PRECISION, childNames, childSchemas); + } + /** * Inferred schema. */ private Schema() { topLevelType = null; + topLevelPrecision = UNKNOWN_PRECISION; childNames = null; childSchemas = null; } @@ -104,14 +130,17 @@ private void flattenIfNeeded() { if (flatLen == 0) { flattenedNames = null; flattenedTypes = null; + flattenedPrecisions = null; flattenedCounts = null; } else { String[] names = new String[flatLen]; DType[] types = new DType[flatLen]; + int[] precisions = new int[flatLen]; int[] counts = new int[flatLen]; - collectFlattened(names, types, counts, 0); + collectFlattened(names, types, precisions, counts, 0); flattenedNames = names; flattenedTypes = types; + flattenedPrecisions = precisions; flattenedCounts = counts; } flattened = true; @@ -128,19 +157,20 @@ private int flattenedLength(int startingLength) { return startingLength; } - private int collectFlattened(String[] names, DType[] types, int[] counts, int offset) { + private int collectFlattened(String[] names, DType[] types, int[] precisions, int[] counts, int offset) { if (childSchemas != null) { for (int i = 0; i < childSchemas.size(); i++) { Schema child = childSchemas.get(i); names[offset] = childNames.get(i); types[offset] = child.topLevelType; + precisions[offset] = child.topLevelPrecision; if (child.childNames != null) { counts[offset] = child.childNames.size(); } else { counts[offset] = 0; } offset++; - offset = this.childSchemas.get(i).collectFlattened(names, types, counts, offset); + offset = this.childSchemas.get(i).collectFlattened(names, types, precisions, counts, offset); } } return offset; @@ -226,6 +256,22 @@ public int[] getFlattenedTypeScales() { return ret; } + /** + * Get decimal precisions of the columns' types flattened from all levels in schema by + * depth-first traversal. + *

+ * This is used to pass down the decimal precisions from Spark to only the JNI layer, where + * some JNI functions require precision values to perform their operations. + * Decimal precisions should not be consumed by any libcudf's APIs since libcudf does not + * support precisions for fixed point numbers. + * + * @return An array containing decimal precision of all columns in schema. + */ + public int[] getFlattenedDecimalPrecisions() { + flattenIfNeeded(); + return flattenedPrecisions; + } + /** * Get the types of the columns in schema flattened from all levels by depth-first traversal. * @return An array containing types of all columns in schema. @@ -307,11 +353,13 @@ public HostColumnVector.DataType asHostDataType() { public static class Builder { private final DType topLevelType; + private final int topLevelPrecision; private final List names; private final List types; - private Builder(DType topLevelType) { + private Builder(DType topLevelType, int topLevelPrecision) { this.topLevelType = topLevelType; + this.topLevelPrecision = topLevelPrecision; if (topLevelType == DType.STRUCT || topLevelType == DType.LIST) { // There can be children names = new ArrayList<>(); @@ -322,14 +370,19 @@ private Builder(DType topLevelType) { } } + private Builder(DType topLevelType) { + this(topLevelType, UNKNOWN_PRECISION); + } + /** * Add a new column * @param type the type of column to add * @param name the name of the column to add (Ignored for list types) + * @param precision the decimal precision, only applicable for decimal types * @return the builder for the new column. This should really only be used when the type * passed in is a LIST or a STRUCT. */ - public Builder addColumn(DType type, String name) { + public Builder addColumn(DType type, String name, int precision) { if (names == null) { throw new IllegalStateException("A column of type " + topLevelType + " cannot have children"); @@ -340,21 +393,31 @@ public Builder addColumn(DType type, String name) { if (names.contains(name)) { throw new IllegalStateException("Cannot add duplicate names to a schema"); } - Builder ret = new Builder(type); + Builder ret = new Builder(type, precision); types.add(ret); names.add(name); return ret; } + public Builder addColumn(DType type, String name) { + return addColumn(type, name, UNKNOWN_PRECISION); + } + /** * Adds a single column to the current schema. addColumn is preferred as it can be used * to support nested types. * @param type the type of the column. * @param name the name of the column. + * @param precision the decimal precision, only applicable for decimal types. * @return this for chaining. 
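
Editor's note: the precision-aware `Schema.Builder` overloads above thread decimal precision from Spark down to the JNI layer only; libcudf never consumes it. A short sketch of building such a schema, assuming the usual `Schema.builder()` entry point (not shown in this diff):

```java
import ai.rapids.cudf.DType;
import ai.rapids.cudf.Schema;

public class SchemaPrecisionSketch {
  public static void main(String[] args) {
    // DECIMAL64 with scale -2 and precision 10. The precision rides along as
    // JNI-level metadata only; libcudf has no notion of decimal precision.
    Schema schema = Schema.builder()
        .column(DType.create(DType.DTypeEnum.DECIMAL64, -2), "price", 10)
        .column(DType.INT32, "quantity")
        .build();
    // Precisions line up with the flattened column order; non-decimal columns
    // report the UNKNOWN_PRECISION sentinel (-1).
    for (int precision : schema.getFlattenedDecimalPrecisions()) {
      System.out.println(precision); // 10, then -1
    }
  }
}
```
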
*/ + public Builder column(DType type, String name, int precision) { + addColumn(type, name, precision); + return this; + } + public Builder column(DType type, String name) { - addColumn(type, name); + addColumn(type, name, UNKNOWN_PRECISION); return this; } @@ -366,7 +429,7 @@ public Schema build() { children.add(b.build()); } } - return new Schema(topLevelType, names, children); + return new Schema(topLevelType, topLevelPrecision, names, children); } } } diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index dbee53640aa..b01ce31b1f3 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -259,7 +259,6 @@ private static native long readJSON(int[] numChildren, String[] columnNames, boolean allowLeadingZeros, boolean allowNonNumericNumbers, boolean allowUnquotedControl, - boolean pruneColumns, boolean experimental, byte lineDelimiter) throws CudfException; @@ -275,7 +274,6 @@ private static native long readJSONFromDataSource(int[] numChildren, String[] co boolean allowLeadingZeros, boolean allowNonNumericNumbers, boolean allowUnquotedControl, - boolean pruneColumns, boolean experimental, byte lineDelimiter, long dsHandle) throws CudfException; @@ -1092,224 +1090,6 @@ public static Table readJSON(Schema schema, JSONOptions opts, byte[] buffer) { return readJSON(schema, opts, buffer, 0, buffer.length); } - private static class DidViewChange { - ColumnVector changeWasNeeded = null; - boolean noChangeNeeded = false; - - public static DidViewChange yes(ColumnVector cv) { - DidViewChange ret = new DidViewChange(); - ret.changeWasNeeded = cv; - return ret; - } - - public static DidViewChange no() { - DidViewChange ret = new DidViewChange(); - ret.noChangeNeeded = true; - return ret; - } - } - - private static DidViewChange gatherJSONColumns(Schema schema, TableWithMeta.NestedChildren children, - ColumnView cv) { - // We need to do this recursively to be sure it all matches as expected. - // If we run into problems where the data types don't match, we are not - // going to fix up the data types. We are only going to reorder the columns. - if (schema.getType() == DType.STRUCT) { - if (cv.getType() != DType.STRUCT) { - // The types don't match so just return the input unchanged... - return DidViewChange.no(); - } else { - String[] foundNames; - if (children == null) { - foundNames = new String[0]; - } else { - foundNames = children.getNames(); - } - HashMap indices = new HashMap<>(); - for (int i = 0; i < foundNames.length; i++) { - indices.put(foundNames[i], i); - } - // We might need to rearrange the columns to match what we want. 
- DType[] types = schema.getChildTypes(); - String[] neededNames = schema.getColumnNames(); - ColumnView[] columns = new ColumnView[neededNames.length]; - try { - boolean somethingChanged = false; - if (columns.length != foundNames.length) { - somethingChanged = true; - } - for (int i = 0; i < columns.length; i++) { - String neededColumnName = neededNames[i]; - Integer index = indices.get(neededColumnName); - Schema childSchema = schema.getChild(i); - if (index != null) { - if (childSchema.isStructOrHasStructDescendant()) { - ColumnView child = cv.getChildColumnView(index); - boolean shouldCloseChild = true; - try { - if (index != i) { - somethingChanged = true; - } - DidViewChange childResult = gatherJSONColumns(schema.getChild(i), - children.getChild(index), child); - if (childResult.noChangeNeeded) { - shouldCloseChild = false; - columns[i] = child; - } else { - somethingChanged = true; - columns[i] = childResult.changeWasNeeded; - } - } finally { - if (shouldCloseChild) { - child.close(); - } - } - } else { - if (index != i) { - somethingChanged = true; - } - columns[i] = cv.getChildColumnView(index); - } - } else { - somethingChanged = true; - if (types[i] == DType.LIST) { - try (Scalar s = Scalar.listFromNull(childSchema.getChild(0).asHostDataType())) { - columns[i] = ColumnVector.fromScalar(s, (int) cv.getRowCount()); - } - } else if (types[i] == DType.STRUCT) { - int numStructChildren = childSchema.getNumChildren(); - HostColumnVector.DataType[] structChildren = new HostColumnVector.DataType[numStructChildren]; - for (int structChildIndex = 0; structChildIndex < numStructChildren; structChildIndex++) { - structChildren[structChildIndex] = childSchema.getChild(structChildIndex).asHostDataType(); - } - try (Scalar s = Scalar.structFromNull(structChildren)) { - columns[i] = ColumnVector.fromScalar(s, (int) cv.getRowCount()); - } - } else { - try (Scalar s = Scalar.fromNull(types[i])) { - columns[i] = ColumnVector.fromScalar(s, (int) cv.getRowCount()); - } - } - } - } - if (somethingChanged) { - try (ColumnView ret = new ColumnView(cv.type, cv.rows, Optional.of(cv.nullCount), - cv.getValid(), null, columns)) { - return DidViewChange.yes(ret.copyToColumnVector()); - } - } else { - return DidViewChange.no(); - } - } finally { - for (ColumnView c: columns) { - if (c != null) { - c.close(); - } - } - } - } - } else if (schema.getType() == DType.LIST && cv.getType() == DType.LIST) { - if (schema.isStructOrHasStructDescendant()) { - String [] childNames = children.getNames(); - if (childNames.length == 2 && - "offsets".equals(childNames[0]) && - "element".equals(childNames[1])) { - try (ColumnView child = cv.getChildColumnView(0)){ - DidViewChange listResult = gatherJSONColumns(schema.getChild(0), - children.getChild(1), child); - if (listResult.noChangeNeeded) { - return DidViewChange.no(); - } else { - try (ColumnView listView = new ColumnView(cv.type, cv.rows, - Optional.of(cv.nullCount), cv.getValid(), cv.getOffsets(), - new ColumnView[]{listResult.changeWasNeeded})) { - return DidViewChange.yes(listView.copyToColumnVector()); - } finally { - listResult.changeWasNeeded.close(); - } - } - } - } - } - // Nothing to change so just return the input, but we need to inc a ref count to really - // make it work, so for now we are going to turn it into a ColumnVector. - return DidViewChange.no(); - } else { - // Nothing to change so just return the input, but we need to inc a ref count to really - // make it work, so for now we are going to turn it into a ColumnVector. 
- return DidViewChange.no(); - } - } - - private static Table gatherJSONColumns(Schema schema, TableWithMeta twm, int emptyRowCount) { - String[] neededColumns = schema.getColumnNames(); - if (neededColumns == null || neededColumns.length == 0) { - return twm.releaseTable(); - } else { - String[] foundNames = twm.getColumnNames(); - HashMap indices = new HashMap<>(); - for (int i = 0; i < foundNames.length; i++) { - indices.put(foundNames[i], i); - } - // We might need to rearrange the columns to match what we want. - DType[] types = schema.getChildTypes(); - ColumnVector[] columns = new ColumnVector[neededColumns.length]; - try (Table tbl = twm.releaseTable()) { - int rowCount = tbl == null ? emptyRowCount : (int)tbl.getRowCount(); - if (rowCount < 0) { - throw new IllegalStateException( - "No empty row count provided and the table read has no row count or columns"); - } - for (int i = 0; i < columns.length; i++) { - String neededColumnName = neededColumns[i]; - Integer index = indices.get(neededColumnName); - if (index != null) { - if (schema.getChild(i).isStructOrHasStructDescendant()) { - DidViewChange gathered = gatherJSONColumns(schema.getChild(i), twm.getChild(index), - tbl.getColumn(index)); - if (gathered.noChangeNeeded) { - columns[i] = tbl.getColumn(index).incRefCount(); - } else { - columns[i] = gathered.changeWasNeeded; - } - } else { - columns[i] = tbl.getColumn(index).incRefCount(); - } - } else { - if (types[i] == DType.LIST) { - Schema listSchema = schema.getChild(i); - Schema elementSchema = listSchema.getChild(0); - try (Scalar s = Scalar.listFromNull(elementSchema.asHostDataType())) { - columns[i] = ColumnVector.fromScalar(s, rowCount); - } - } else if (types[i] == DType.STRUCT) { - Schema structSchema = schema.getChild(i); - int numStructChildren = structSchema.getNumChildren(); - DataType[] structChildrenTypes = new DataType[numStructChildren]; - for (int j = 0; j < numStructChildren; j++) { - structChildrenTypes[j] = structSchema.getChild(j).asHostDataType(); - } - try (Scalar s = Scalar.structFromNull(structChildrenTypes)) { - columns[i] = ColumnVector.fromScalar(s, rowCount); - } - } else { - try (Scalar s = Scalar.fromNull(types[i])) { - columns[i] = ColumnVector.fromScalar(s, rowCount); - } - } - } - } - return new Table(columns); - } finally { - for (ColumnVector c: columns) { - if (c != null) { - c.close(); - } - } - } - } - } - /** * Read a JSON file. * @param schema the schema of the file. You may use Schema.INFERRED to infer the schema. @@ -1318,10 +1098,6 @@ private static Table gatherJSONColumns(Schema schema, TableWithMeta twm, int emp * @return the file parsed as a table on the GPU. 
*/ public static Table readJSON(Schema schema, JSONOptions opts, File path) { - // only prune the schema if one is provided - boolean cudfPruneSchema = schema.getColumnNames() != null && - schema.getColumnNames().length != 0 && - opts.shouldCudfPruneSchema(); try (TableWithMeta twm = new TableWithMeta( readJSON(schema.getFlattenedNumChildren(), schema.getFlattenedColumnNames(), schema.getFlattenedTypeIds(), schema.getFlattenedTypeScales(), @@ -1336,11 +1112,10 @@ public static Table readJSON(Schema schema, JSONOptions opts, File path) { opts.leadingZerosAllowed(), opts.nonNumericNumbersAllowed(), opts.unquotedControlChars(), - cudfPruneSchema, opts.experimental(), opts.getLineDelimiter()))) { - return gatherJSONColumns(schema, twm, -1); + return twm.releaseTable(); } } @@ -1361,6 +1136,10 @@ public static Table readJSON(Schema schema, JSONOptions opts, byte[] buffer, lon /** * Read JSON formatted data. + * + * @deprecated This method is deprecated since emptyRowCount is not used. Use the method without + * emptyRowCount instead. + * * @param schema the schema of the data. You may use Schema.INFERRED to infer the schema. * @param opts various JSON parsing options. * @param buffer raw UTF8 formatted bytes. @@ -1370,6 +1149,7 @@ public static Table readJSON(Schema schema, JSONOptions opts, byte[] buffer, lon * @param emptyRowCount the number of rows to return if no columns were read. * @return the data parsed as a table on the GPU. */ + @SuppressWarnings("unused") public static Table readJSON(Schema schema, JSONOptions opts, byte[] buffer, long offset, long len, HostMemoryAllocator hostMemoryAllocator, int emptyRowCount) { @@ -1381,14 +1161,14 @@ public static Table readJSON(Schema schema, JSONOptions opts, byte[] buffer, lon assert offset >= 0 && offset < buffer.length; try (HostMemoryBuffer newBuf = hostMemoryAllocator.allocate(len)) { newBuf.setBytes(0, buffer, offset, len); - return readJSON(schema, opts, newBuf, 0, len, emptyRowCount); + return readJSON(schema, opts, newBuf, 0, len); } } + @SuppressWarnings("unused") public static Table readJSON(Schema schema, JSONOptions opts, byte[] buffer, long offset, long len, int emptyRowCount) { - return readJSON(schema, opts, buffer, offset, len, DefaultHostMemoryAllocator.get(), - emptyRowCount); + return readJSON(schema, opts, buffer, offset, len, DefaultHostMemoryAllocator.get()); } public static Table readJSON(Schema schema, JSONOptions opts, byte[] buffer, long offset, @@ -1470,6 +1250,10 @@ public static Table readJSON(Schema schema, JSONOptions opts, HostMemoryBuffer b /** * Read JSON formatted data. + * + * @deprecated This method is deprecated since emptyRowCount is not used. Use the method without + * emptyRowCount instead. + * * @param schema the schema of the data. You may use Schema.INFERRED to infer the schema. * @param opts various JSON parsing options. * @param buffer raw UTF8 formatted bytes. @@ -1478,6 +1262,7 @@ public static Table readJSON(Schema schema, JSONOptions opts, HostMemoryBuffer b * @param emptyRowCount the number of rows to use if no columns were found. * @return the data parsed as a table on the GPU. 
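
Editor's note: the `File` overload above now hands the schema straight to libcudf and returns `twm.releaseTable()` directly. Column pruning and reordering happen natively (see the `schema_element` changes in TableJni.cpp further down), so the Java-side `gatherJSONColumns` machinery removed above is no longer needed, and the `emptyRowCount` overloads survive only as deprecated shims. A sketch of the surviving call path, assuming `JSONOptions.builder().withLines(...)` (not shown in this diff) and a placeholder `records.jsonl` file:

```java
import java.io.File;

import ai.rapids.cudf.DType;
import ai.rapids.cudf.JSONOptions;
import ai.rapids.cudf.Schema;
import ai.rapids.cudf.Table;

public class ReadJsonSketch {
  public static void main(String[] args) {
    // Only the columns named here are materialized, in this order; libcudf
    // prunes everything else whenever a non-empty schema is supplied.
    Schema schema = Schema.builder()
        .column(DType.INT64, "id")
        .column(DType.STRING, "name")
        .build();
    JSONOptions opts = JSONOptions.builder()
        .withLines(true) // one JSON record per line
        .build();
    try (Table table = Table.readJSON(schema, opts, new File("records.jsonl"))) {
      System.out.println(table.getNumberOfColumns() + " columns, "
          + table.getRowCount() + " rows");
    }
  }
}
```
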
*/ + @SuppressWarnings("unused") public static Table readJSON(Schema schema, JSONOptions opts, HostMemoryBuffer buffer, long offset, long len, int emptyRowCount) { if (len <= 0) { @@ -1486,10 +1271,6 @@ public static Table readJSON(Schema schema, JSONOptions opts, HostMemoryBuffer b assert len > 0; assert len <= buffer.length - offset; assert offset >= 0 && offset < buffer.length; - // only prune the schema if one is provided - boolean cudfPruneSchema = schema.getColumnNames() != null && - schema.getColumnNames().length != 0 && - opts.shouldCudfPruneSchema(); try (TableWithMeta twm = new TableWithMeta(readJSON( schema.getFlattenedNumChildren(), schema.getFlattenedColumnNames(), schema.getFlattenedTypeIds(), schema.getFlattenedTypeScales(), null, @@ -1505,10 +1286,9 @@ public static Table readJSON(Schema schema, JSONOptions opts, HostMemoryBuffer b opts.leadingZerosAllowed(), opts.nonNumericNumbersAllowed(), opts.unquotedControlChars(), - cudfPruneSchema, opts.experimental(), opts.getLineDelimiter()))) { - return gatherJSONColumns(schema, twm, emptyRowCount); + return twm.releaseTable(); } } @@ -1525,18 +1305,19 @@ public static Table readJSON(Schema schema, JSONOptions opts, DataSource ds) { /** * Read JSON formatted data. + * + * @deprecated This method is deprecated since emptyRowCount is not used. Use the method without + * emptyRowCount instead. + * * @param schema the schema of the data. You may use Schema.INFERRED to infer the schema. * @param opts various JSON parsing options. * @param ds the DataSource to read from. * @param emptyRowCount the number of rows to return if no columns were read. * @return the data parsed as a table on the GPU. */ + @SuppressWarnings("unused") public static Table readJSON(Schema schema, JSONOptions opts, DataSource ds, int emptyRowCount) { long dsHandle = DataSourceHelper.createWrapperDataSource(ds); - // only prune the schema if one is provided - boolean cudfPruneSchema = schema.getColumnNames() != null && - schema.getColumnNames().length != 0 && - opts.shouldCudfPruneSchema(); try (TableWithMeta twm = new TableWithMeta(readJSONFromDataSource(schema.getFlattenedNumChildren(), schema.getFlattenedColumnNames(), schema.getFlattenedTypeIds(), schema.getFlattenedTypeScales(), opts.isDayFirst(), @@ -1550,11 +1331,10 @@ public static Table readJSON(Schema schema, JSONOptions opts, DataSource ds, int opts.leadingZerosAllowed(), opts.nonNumericNumbersAllowed(), opts.unquotedControlChars(), - cudfPruneSchema, opts.experimental(), opts.getLineDelimiter(), dsHandle))) { - return gatherJSONColumns(schema, twm, emptyRowCount); + return twm.releaseTable(); } finally { DataSourceHelper.destroyWrapperDataSource(dsHandle); } diff --git a/java/src/main/native/CMakeLists.txt b/java/src/main/native/CMakeLists.txt index 32045f3c50e..9ff43feeac6 100644 --- a/java/src/main/native/CMakeLists.txt +++ b/java/src/main/native/CMakeLists.txt @@ -43,6 +43,7 @@ option(CUDA_STATIC_RUNTIME "Statically link the CUDA runtime" OFF) option(USE_GDS "Build with GPUDirect Storage (GDS)/cuFile support" OFF) option(CUDF_JNI_LIBCUDF_STATIC "Link with libcudf.a" OFF) option(CUDF_JNI_ENABLE_PROFILING "Build with profiling support" ON) +option(CUDA_STATIC_CUFILE "Statically link cuFile" OFF) message(VERBOSE "CUDF_JNI: Build with NVTX support: ${USE_NVTX}") message(VERBOSE "CUDF_JNI: Build cuDF JNI shared libraries: ${BUILD_SHARED_LIBS}") @@ -120,7 +121,12 @@ endif() if(USE_GDS) message(STATUS "Building with GPUDirect Storage (GDS)/cuFile support") - find_package(cuFile REQUIRED) + 
if(CUDA_STATIC_CUFILE) + set(_cufile_suffix _static) + endif() + if(NOT TARGET CUDA::cuFile${_cufile_suffix}) + message(FATAL_ERROR "cuFile support not found") + endif() endif() # ################################################################################################## @@ -228,8 +234,7 @@ if(USE_GDS) POSITION_INDEPENDENT_CODE ON INTERFACE_POSITION_INDEPENDENT_CODE ON ) - target_include_directories(cufilejni PRIVATE "${cuFile_INCLUDE_DIRS}") - target_link_libraries(cufilejni PRIVATE cudfjni "${cuFile_LIBRARIES}") + target_link_libraries(cufilejni PRIVATE cudfjni CUDA::cuFile${_cufile_suffix}) endif() # ################################################################################################## diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp index 72f0ad19912..6a59ae3ddd5 100644 --- a/java/src/main/native/src/ColumnViewJni.cpp +++ b/java/src/main/native/src/ColumnViewJni.cpp @@ -64,6 +64,7 @@ #include #include #include +#include #include #include #include @@ -127,6 +128,20 @@ std::size_t calc_device_memory_size(cudf::column_view const& view, bool const pa }); } +cudf::datetime::rounding_frequency as_rounding_freq(jint freq) +{ + switch (freq) { + case 0: return cudf::datetime::rounding_frequency::DAY; + case 1: return cudf::datetime::rounding_frequency::HOUR; + case 2: return cudf::datetime::rounding_frequency::MINUTE; + case 3: return cudf::datetime::rounding_frequency::SECOND; + case 4: return cudf::datetime::rounding_frequency::MILLISECOND; + case 5: return cudf::datetime::rounding_frequency::MICROSECOND; + case 6: return cudf::datetime::rounding_frequency::NANOSECOND; + default: throw std::invalid_argument("Invalid rounding_frequency"); + } +} + } // anonymous namespace extern "C" { @@ -1099,147 +1114,172 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_round( CATCH_STD(env, 0); } -JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_year(JNIEnv* env, jclass, jlong input_ptr) +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_extractDateTimeComponent(JNIEnv* env, + jclass, + jlong input_ptr, + jint component) { JNI_NULL_CHECK(env, input_ptr, "input is null", 0); try { cudf::jni::auto_set_device(env); cudf::column_view const* input = reinterpret_cast(input_ptr); - return release_as_jlong(cudf::datetime::extract_year(*input)); + cudf::datetime::datetime_component comp; + switch (component) { + case 0: comp = cudf::datetime::datetime_component::YEAR; break; + case 1: comp = cudf::datetime::datetime_component::MONTH; break; + case 2: comp = cudf::datetime::datetime_component::DAY; break; + case 3: comp = cudf::datetime::datetime_component::WEEKDAY; break; + case 4: comp = cudf::datetime::datetime_component::HOUR; break; + case 5: comp = cudf::datetime::datetime_component::MINUTE; break; + case 6: comp = cudf::datetime::datetime_component::SECOND; break; + case 7: comp = cudf::datetime::datetime_component::MILLISECOND; break; + case 8: comp = cudf::datetime::datetime_component::MICROSECOND; break; + case 9: comp = cudf::datetime::datetime_component::NANOSECOND; break; + default: JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "Invalid component", 0); + } + return release_as_jlong(cudf::datetime::extract_datetime_component(*input, comp)); } CATCH_STD(env, 0); } -JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_month(JNIEnv* env, jclass, jlong input_ptr) +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_lastDayOfMonth(JNIEnv* env, + jclass, + jlong input_ptr) { JNI_NULL_CHECK(env, input_ptr, 
"input is null", 0); try { cudf::jni::auto_set_device(env); cudf::column_view const* input = reinterpret_cast(input_ptr); - return release_as_jlong(cudf::datetime::extract_month(*input)); + return release_as_jlong(cudf::datetime::last_day_of_month(*input)); } CATCH_STD(env, 0); } -JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_day(JNIEnv* env, jclass, jlong input_ptr) +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_dayOfYear(JNIEnv* env, + jclass, + jlong input_ptr) { JNI_NULL_CHECK(env, input_ptr, "input is null", 0); try { cudf::jni::auto_set_device(env); cudf::column_view const* input = reinterpret_cast(input_ptr); - return release_as_jlong(cudf::datetime::extract_day(*input)); + return release_as_jlong(cudf::datetime::day_of_year(*input)); } CATCH_STD(env, 0); } -JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_hour(JNIEnv* env, jclass, jlong input_ptr) +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_quarterOfYear(JNIEnv* env, + jclass, + jlong input_ptr) { JNI_NULL_CHECK(env, input_ptr, "input is null", 0); try { cudf::jni::auto_set_device(env); cudf::column_view const* input = reinterpret_cast(input_ptr); - return release_as_jlong(cudf::datetime::extract_hour(*input)); + return release_as_jlong(cudf::datetime::extract_quarter(*input)); } CATCH_STD(env, 0); } -JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_minute(JNIEnv* env, jclass, jlong input_ptr) +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_addCalendricalMonths(JNIEnv* env, + jclass, + jlong ts_ptr, + jlong months_ptr) { - JNI_NULL_CHECK(env, input_ptr, "input is null", 0); + JNI_NULL_CHECK(env, ts_ptr, "ts is null", 0); + JNI_NULL_CHECK(env, months_ptr, "months is null", 0); try { cudf::jni::auto_set_device(env); - cudf::column_view const* input = reinterpret_cast(input_ptr); - return release_as_jlong(cudf::datetime::extract_minute(*input)); + cudf::column_view const* ts = reinterpret_cast(ts_ptr); + cudf::column_view const* months = reinterpret_cast(months_ptr); + return release_as_jlong(cudf::datetime::add_calendrical_months(*ts, *months)); } CATCH_STD(env, 0); } -JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_second(JNIEnv* env, jclass, jlong input_ptr) +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_addScalarCalendricalMonths(JNIEnv* env, + jclass, + jlong ts_ptr, + jlong months_ptr) { - JNI_NULL_CHECK(env, input_ptr, "input is null", 0); + JNI_NULL_CHECK(env, ts_ptr, "ts is null", 0); + JNI_NULL_CHECK(env, months_ptr, "months is null", 0); try { cudf::jni::auto_set_device(env); - cudf::column_view const* input = reinterpret_cast(input_ptr); - return release_as_jlong(cudf::datetime::extract_second(*input)); + cudf::column_view const* ts = reinterpret_cast(ts_ptr); + cudf::scalar const* months = reinterpret_cast(months_ptr); + return release_as_jlong(cudf::datetime::add_calendrical_months(*ts, *months)); } CATCH_STD(env, 0); } -JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_weekDay(JNIEnv* env, jclass, jlong input_ptr) +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_isLeapYear(JNIEnv* env, + jclass, + jlong input_ptr) { JNI_NULL_CHECK(env, input_ptr, "input is null", 0); try { cudf::jni::auto_set_device(env); cudf::column_view const* input = reinterpret_cast(input_ptr); - return release_as_jlong(cudf::datetime::extract_weekday(*input)); + return release_as_jlong(cudf::datetime::is_leap_year(*input)); } CATCH_STD(env, 0); } -JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_lastDayOfMonth(JNIEnv* env, - jclass, - jlong input_ptr) +JNIEXPORT 
jlong JNICALL Java_ai_rapids_cudf_ColumnView_daysInMonth(JNIEnv* env, + jclass, + jlong input_ptr) { JNI_NULL_CHECK(env, input_ptr, "input is null", 0); try { cudf::jni::auto_set_device(env); cudf::column_view const* input = reinterpret_cast(input_ptr); - return release_as_jlong(cudf::datetime::last_day_of_month(*input)); + return release_as_jlong(cudf::datetime::days_in_month(*input)); } CATCH_STD(env, 0); } -JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_dayOfYear(JNIEnv* env, - jclass, - jlong input_ptr) +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_dateTimeCeil(JNIEnv* env, + jclass, + jlong input_ptr, + jint freq) { JNI_NULL_CHECK(env, input_ptr, "input is null", 0); try { cudf::jni::auto_set_device(env); - cudf::column_view const* input = reinterpret_cast(input_ptr); - return release_as_jlong(cudf::datetime::day_of_year(*input)); + cudf::column_view const* input = reinterpret_cast(input_ptr); + cudf::datetime::rounding_frequency n_freq = as_rounding_freq(freq); + return release_as_jlong(cudf::datetime::ceil_datetimes(*input, n_freq)); } CATCH_STD(env, 0); } -JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_quarterOfYear(JNIEnv* env, +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_dateTimeFloor(JNIEnv* env, jclass, - jlong input_ptr) + jlong input_ptr, + jint freq) { JNI_NULL_CHECK(env, input_ptr, "input is null", 0); try { cudf::jni::auto_set_device(env); - cudf::column_view const* input = reinterpret_cast(input_ptr); - return release_as_jlong(cudf::datetime::extract_quarter(*input)); - } - CATCH_STD(env, 0); -} - -JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_addCalendricalMonths(JNIEnv* env, - jclass, - jlong ts_ptr, - jlong months_ptr) -{ - JNI_NULL_CHECK(env, ts_ptr, "ts is null", 0); - JNI_NULL_CHECK(env, months_ptr, "months is null", 0); - try { - cudf::jni::auto_set_device(env); - cudf::column_view const* ts = reinterpret_cast(ts_ptr); - cudf::column_view const* months = reinterpret_cast(months_ptr); - return release_as_jlong(cudf::datetime::add_calendrical_months(*ts, *months)); + cudf::column_view const* input = reinterpret_cast(input_ptr); + cudf::datetime::rounding_frequency n_freq = as_rounding_freq(freq); + return release_as_jlong(cudf::datetime::floor_datetimes(*input, n_freq)); } CATCH_STD(env, 0); } -JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_isLeapYear(JNIEnv* env, - jclass, - jlong input_ptr) +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_dateTimeRound(JNIEnv* env, + jclass, + jlong input_ptr, + jint freq) { JNI_NULL_CHECK(env, input_ptr, "input is null", 0); try { cudf::jni::auto_set_device(env); - cudf::column_view const* input = reinterpret_cast(input_ptr); - return release_as_jlong(cudf::datetime::is_leap_year(*input)); + cudf::column_view const* input = reinterpret_cast(input_ptr); + cudf::datetime::rounding_frequency n_freq = as_rounding_freq(freq); + return release_as_jlong(cudf::datetime::round_datetimes(*input, n_freq)); } CATCH_STD(env, 0); } @@ -2827,4 +2867,23 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_toHex(JNIEnv* env, jclass } CATCH_STD(env, 0); } + +JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_ColumnView_stringContainsMulti( + JNIEnv* env, jobject j_object, jlong j_view_handle, jlong j_target_view_handle) +{ + JNI_NULL_CHECK(env, j_view_handle, "column is null", 0); + JNI_NULL_CHECK(env, j_target_view_handle, "targets is null", 0); + + try { + cudf::jni::auto_set_device(env); + auto* column_view = reinterpret_cast(j_view_handle); + auto* targets_view = 
reinterpret_cast(j_target_view_handle); + auto const strings_column = cudf::strings_column_view(*column_view); + auto const targets_column = cudf::strings_column_view(*targets_view); + auto contains_results = cudf::strings::contains_multiple(strings_column, targets_column); + return cudf::jni::convert_table_for_return(env, std::move(contains_results)); + } + CATCH_STD(env, 0); +} + } // extern "C" diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index 0a667978ca3..1f8b1ea207d 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -1037,21 +1037,23 @@ cudf::io::schema_element read_schema_element(int& index, if (d_type.id() == cudf::type_id::STRUCT || d_type.id() == cudf::type_id::LIST) { std::map child_elems; int num_children = children[index]; + std::vector child_names(num_children); // go to the next entry, so recursion can parse it. index++; for (int i = 0; i < num_children; i++) { - auto const name = std::string{names.get(index).get()}; + auto name = std::string{names.get(index).get()}; child_elems.insert( std::pair{name, cudf::jni::read_schema_element(index, children, names, types, scales)}); + child_names[i] = std::move(name); } - return cudf::io::schema_element{d_type, std::move(child_elems)}; + return cudf::io::schema_element{d_type, std::move(child_elems), {std::move(child_names)}}; } else { if (children[index] != 0) { throw std::invalid_argument("found children for a type that should have none"); } // go to the next entry before returning... index++; - return cudf::io::schema_element{d_type, {}}; + return cudf::io::schema_element{d_type, {}, std::nullopt}; } } @@ -1824,7 +1826,6 @@ Java_ai_rapids_cudf_Table_readJSONFromDataSource(JNIEnv* env, jboolean allow_leading_zeros, jboolean allow_nonnumeric_numbers, jboolean allow_unquoted_control, - jboolean prune_columns, jboolean experimental, jbyte line_delimiter, jlong ds_handle) @@ -1853,6 +1854,7 @@ Java_ai_rapids_cudf_Table_readJSONFromDataSource(JNIEnv* env, cudf::io::json_recovery_mode_t recovery_mode = recover_with_null ? 
cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL : cudf::io::json_recovery_mode_t::FAIL; + cudf::io::json_reader_options_builder opts = cudf::io::json_reader_options::builder(source) .dayfirst(static_cast(day_first)) @@ -1864,7 +1866,6 @@ Java_ai_rapids_cudf_Table_readJSONFromDataSource(JNIEnv* env, .delimiter(static_cast(line_delimiter)) .strict_validation(strict_validation) .keep_quotes(keep_quotes) - .prune_columns(prune_columns) .experimental(experimental); if (strict_validation) { opts.numeric_leading_zeros(allow_leading_zeros) @@ -1886,13 +1887,19 @@ Java_ai_rapids_cudf_Table_readJSONFromDataSource(JNIEnv* env, } std::map data_types; + std::vector name_order; int at = 0; while (at < n_types.size()) { auto const name = std::string{n_col_names.get(at).get()}; data_types.insert(std::pair{ name, cudf::jni::read_schema_element(at, n_children, n_col_names, n_types, n_scales)}); + name_order.push_back(name); } - opts.dtypes(data_types); + auto const prune_columns = data_types.size() != 0; + cudf::io::schema_element structs{ + cudf::data_type{cudf::type_id::STRUCT}, std::move(data_types), {std::move(name_order)}}; + opts.prune_columns(prune_columns).dtypes(structs); + } else { // should infer the types } @@ -1925,7 +1932,6 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON(JNIEnv* env, jboolean allow_leading_zeros, jboolean allow_nonnumeric_numbers, jboolean allow_unquoted_control, - jboolean prune_columns, jboolean experimental, jbyte line_delimiter) { @@ -1968,6 +1974,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON(JNIEnv* env, cudf::io::json_recovery_mode_t recovery_mode = recover_with_null ? cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL : cudf::io::json_recovery_mode_t::FAIL; + cudf::io::json_reader_options_builder opts = cudf::io::json_reader_options::builder(source) .dayfirst(static_cast(day_first)) @@ -1979,7 +1986,6 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON(JNIEnv* env, .delimiter(static_cast(line_delimiter)) .strict_validation(strict_validation) .keep_quotes(keep_quotes) - .prune_columns(prune_columns) .experimental(experimental); if (strict_validation) { opts.numeric_leading_zeros(allow_leading_zeros) @@ -2001,13 +2007,19 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON(JNIEnv* env, } std::map data_types; + std::vector name_order; + name_order.reserve(n_types.size()); int at = 0; while (at < n_types.size()) { - auto const name = std::string{n_col_names.get(at).get()}; + auto name = std::string{n_col_names.get(at).get()}; data_types.insert(std::pair{ name, cudf::jni::read_schema_element(at, n_children, n_col_names, n_types, n_scales)}); + name_order.emplace_back(std::move(name)); } - opts.dtypes(data_types); + auto const prune_columns = data_types.size() != 0; + cudf::io::schema_element structs{ + cudf::data_type{cudf::type_id::STRUCT}, std::move(data_types), {std::move(name_order)}}; + opts.prune_columns(prune_columns).dtypes(structs); } else { // should infer the types } diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index 708744569df..d1a1ff2c95c 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -31,6 +31,7 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; +import java.util.EnumSet; import java.util.List; import java.util.Optional; import java.util.concurrent.atomic.AtomicInteger; @@ -3827,6 +3828,30 @@ void 
testStringOpsEmpty() { } } + @Test + void testStringContainsMulti() { + ColumnVector[] results = null; + try (ColumnVector haystack = ColumnVector.fromStrings("tést strings", + "Héllo cd", + "1 43 42 7", + "scala spark 42 other", + null, + ""); + ColumnVector targets = ColumnVector.fromStrings("é", "42"); + ColumnVector expected0 = ColumnVector.fromBoxedBooleans(true, true, false, false, null, false); + ColumnVector expected1 = ColumnVector.fromBoxedBooleans(false, false, true, true, null, false)) { + results = haystack.stringContains(targets); + assertColumnsAreEqual(results[0], expected0); + assertColumnsAreEqual(results[1], expected1); + } finally { + if (results != null) { + for (ColumnVector c : results) { + c.close(); + } + } + } + } + @Test void testStringFindOperations() { try (ColumnVector testStrings = ColumnVector.fromStrings("", null, "abCD", "1a\"\u0100B1", "a\"\u0100B1", "1a\"\u0100B", @@ -3877,6 +3902,43 @@ void testExtractRe() { } } + @Test +void testExtractReWithMultiLineDelimiters() { + String NEXT_LINE = "\u0085"; + String LINE_SEPARATOR = "\u2028"; + String PARAGRAPH_SEPARATOR = "\u2029"; + String CARRIAGE_RETURN = "\r"; + String NEW_LINE = "\n"; + + try (ColumnVector input = ColumnVector.fromStrings( + "boo:" + NEXT_LINE + "boo::" + LINE_SEPARATOR + "boo:::", + "boo:::" + LINE_SEPARATOR + "zzé" + CARRIAGE_RETURN + "lll", + "boo::", + "", + "boo::" + NEW_LINE, + "boo::" + CARRIAGE_RETURN, + "boo:" + NEXT_LINE + "boo::" + PARAGRAPH_SEPARATOR, + "boo:" + NEW_LINE + "boo::" + LINE_SEPARATOR, + "boo:" + NEXT_LINE + "boo::" + NEXT_LINE); + Table expected_ext_newline = new Table.TestBuilder() + .column("boo:::", null, "boo::", null, "boo::", "boo::", "boo::", "boo::", "boo::") + .build(); + Table expected_default = new Table.TestBuilder() + .column("boo:::", null, "boo::", null, "boo::", null, null, null, null) + .build()) { + + // Regex pattern to match 'boo:' followed by one or more colons at the end of the string + try (Table found = input.extractRe(new RegexProgram("(boo:+)$", EnumSet.of(RegexFlag.EXT_NEWLINE)))) { + assertColumnsAreEqual(expected_ext_newline.getColumns()[0], found.getColumns()[0]); + } + + try (Table found = input.extractRe(new RegexProgram("(boo:+)$", EnumSet.of(RegexFlag.DEFAULT)))) { + assertColumnsAreEqual(expected_default.getColumns()[0], found.getColumns()[0]); + } + } + } + + @Test void testExtractAllRecord() { String pattern = "([ab])(\\d)"; diff --git a/java/src/test/java/ai/rapids/cudf/ReductionTest.java b/java/src/test/java/ai/rapids/cudf/ReductionTest.java index 8cc7df1ce7f..6bd6603d71b 100644 --- a/java/src/test/java/ai/rapids/cudf/ReductionTest.java +++ b/java/src/test/java/ai/rapids/cudf/ReductionTest.java @@ -612,13 +612,13 @@ void testWithSetOutputType() { assertEquals(expected, result); } - try (Scalar expected = Scalar.fromFloat(1.666667f); + try (Scalar expected = Scalar.fromFloat(1.6666666f); ColumnVector cv = ColumnVector.fromBytes(new byte[]{1, 2, 3, 4}); Scalar result = cv.variance(DType.FLOAT32)) { assertEquals(expected, result); } - try (Scalar expected = Scalar.fromFloat(1.2909945f); + try (Scalar expected = Scalar.fromFloat(1.2909944f); ColumnVector cv = ColumnVector.fromBytes(new byte[]{1, 2, 3, 4}); Scalar result = cv.standardDeviation(DType.FLOAT32)) { assertEquals(expected, result); diff --git a/java/src/test/java/ai/rapids/cudf/TimestampColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/TimestampColumnVectorTest.java index c22acac747e..bac83310c99 100644 --- 
a/java/src/test/java/ai/rapids/cudf/TimestampColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/TimestampColumnVectorTest.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -237,6 +237,69 @@ public void getSecond() { } } + @Test + public void getExtractMillis() { + try (ColumnVector timestampColumnVector = ColumnVector.timestampMilliSecondsFromLongs(TIMES_MS)) { + assert timestampColumnVector.getType().equals(DType.TIMESTAMP_MILLISECONDS); + try (ColumnVector tmp = timestampColumnVector.extractDateTimeComponent(DateTimeComponent.MILLISECOND); + HostColumnVector result = tmp.copyToHost()) { + assertEquals(238, result.getShort(0)); + assertEquals(115, result.getShort(1)); + assertEquals(929, result.getShort(2)); + } + } + + try (ColumnVector timestampColumnVector = ColumnVector.timestampSecondsFromLongs(TIMES_S); + ColumnVector tmp = timestampColumnVector.extractDateTimeComponent(DateTimeComponent.MILLISECOND); + HostColumnVector result = tmp.copyToHost()) { + assertEquals(0, result.getShort(0)); + assertEquals(0, result.getShort(1)); + assertEquals(0, result.getShort(2)); + } + } + + @Test + public void getExtractMicro() { + try (ColumnVector timestampColumnVector = ColumnVector.timestampMicroSecondsFromLongs(TIMES_US)) { + assert timestampColumnVector.getType().equals(DType.TIMESTAMP_MICROSECONDS); + try (ColumnVector tmp = timestampColumnVector.extractDateTimeComponent(DateTimeComponent.MICROSECOND); + HostColumnVector result = tmp.copyToHost()) { + assertEquals(297, result.getShort(0)); + assertEquals(254, result.getShort(1)); + assertEquals(861, result.getShort(2)); + } + } + + try (ColumnVector timestampColumnVector = ColumnVector.timestampSecondsFromLongs(TIMES_S); + ColumnVector tmp = timestampColumnVector.extractDateTimeComponent(DateTimeComponent.MICROSECOND); + HostColumnVector result = tmp.copyToHost()) { + assertEquals(0, result.getShort(0)); + assertEquals(0, result.getShort(1)); + assertEquals(0, result.getShort(2)); + } + } + + @Test + public void getExtractNano() { + try (ColumnVector timestampColumnVector = ColumnVector.timestampNanoSecondsFromLongs(TIMES_NS)) { + assert timestampColumnVector.getType().equals(DType.TIMESTAMP_NANOSECONDS); + try (ColumnVector tmp = timestampColumnVector.extractDateTimeComponent(DateTimeComponent.NANOSECOND); + HostColumnVector result = tmp.copyToHost()) { + assertEquals(531, result.getShort(0)); + assertEquals(330, result.getShort(1)); + assertEquals(604, result.getShort(2)); + } + } + + try (ColumnVector timestampColumnVector = ColumnVector.timestampSecondsFromLongs(TIMES_S); + ColumnVector tmp = timestampColumnVector.extractDateTimeComponent(DateTimeComponent.NANOSECOND); + HostColumnVector result = tmp.copyToHost()) { + assertEquals(0, result.getShort(0)); + assertEquals(0, result.getShort(1)); + assertEquals(0, result.getShort(2)); + } + } + @Test public void testWeekDay() { try (ColumnVector timestampColumnVector = ColumnVector.timestampMilliSecondsFromLongs(TIMES_MS); @@ -350,6 +413,59 @@ public void testAddMonths() { } } + @Test + public void testAddMonthsScalar() { + long[] EXPECTED = new long[]{ + -129290327762L, //'1965-11-26 14:01:12.238' Friday + 1533384000115L, //'2018-08-04 12:00:00.115' Saturday + 1677310332929L, //'2023-03-25 07:32:12.929' Saturday + -129290327762L, //'1965-12-26 14:01:12.238' Sunday + 
1533384000115L}; //'2018-09-04 12:00:00.115' Tuesday + try (ColumnVector timestampColumnVector = ColumnVector.timestampMilliSecondsFromLongs(TIMES_MS); + Scalar months = Scalar.fromShort((short)1); + ColumnVector result = timestampColumnVector.addCalendricalMonths(months); + ColumnVector expected = ColumnVector.timestampMilliSecondsFromLongs(EXPECTED)) { + assertColumnsAreEqual(expected, result); + } + } + + @Test + public void testDaysInMonth() { + Integer[] DAYS = new Integer[] { + 0, // Jan 1, 1970 + 31, // Feb 1, 1970 + 59, // Mar 1, 1970 + 90, // Apr 1, 1970 + 120, // May 1, 1970 + 151, // June 1, 1970 + 181, // July 1, 1970 + 212, // Aug 1, 1970 + 243, // Sep 1, 1970 + 273, // OCt 1, 1970 + 304, // Nov 1, 1970 + 334 // Dec 1 1970 + }; + short[] EXPECTED = new short[]{ + 31, // Jan 1970 + 28, // Feb 1970 + 31, // Mar 1970 + 30, // Apr 1970 + 31, // May 1970 + 30, // June 1970 + 31, // July 1970 + 31, // Aug 1970 + 30, // Sep 1970 + 31, // Oct 1970 + 30, // Nov 1970 + 31 // Dec 1970 + }; + try (ColumnVector timestampColumnVector = ColumnVector.timestampDaysFromBoxedInts(DAYS); + ColumnVector result = timestampColumnVector.daysInMonth(); + ColumnVector expected = ColumnVector.fromShorts(EXPECTED)) { + assertColumnsAreEqual(expected, result); + } + } + @Test public void testIsLeapYear() { Boolean[] EXPECTED = new Boolean[]{false, false, false, false, false}; @@ -383,6 +499,86 @@ public void testIsLeapYear() { } } + @Test + public void testCeilDays() { + long[] EXPECTED_NS = new long[]{ + -131932800000000000L, //'1965-10-27 00:00:00.000000000' + 1530748800000000000L, //'2018-07-05 00:00:00.000000000' + 1674691200000000000L, //'2023-01-26 00:00:00.000000000' + -131932800000000000L, //'1965-10-27 00:00:00.000000000' + 1530748800000000000L}; //'2018-07-05 00:00:00.000000000' + try (ColumnVector timestampColumnVector = ColumnVector.timestampNanoSecondsFromLongs(TIMES_NS); + ColumnVector result = timestampColumnVector.dateTimeCeil(DateTimeRoundingFrequency.DAY); + ColumnVector expected = ColumnVector.timestampNanoSecondsFromLongs(EXPECTED_NS)) { + assertColumnsAreEqual(expected, result); + } + long[] EXPECTED_US = new long[]{ + -131932800000000L, //'1965-10-27 00:00:00.000000' + 1530748800000000L, //'2018-07-05 00:00:00.000000' + 1674691200000000L, //'2023-01-26 00:00:00.000000' + -131932800000000L, //'1965-10-27 00:00:00.000000' + 1530748800000000L}; //'2018-07-05 00:00:00.000000' + try (ColumnVector timestampColumnVector = ColumnVector.timestampMicroSecondsFromLongs(TIMES_US); + ColumnVector result = timestampColumnVector.dateTimeCeil(DateTimeRoundingFrequency.DAY); + ColumnVector expected = ColumnVector.timestampMicroSecondsFromLongs(EXPECTED_US)) { + assertColumnsAreEqual(expected, result); + } + } + + @Test + public void testFloorDays() { + long[] EXPECTED_NS = new long[]{ + -132019200000000000L, //'1965-10-26 00:00:00.000000000' + 1530662400000000000L, //'2018-07-04 00:00:00.000000000' + 1674604800000000000L, //'2023-01-25 00:00:00.000000000' + -132019200000000000L, //'1965-10-26 00:00:00.000000000' + 1530662400000000000L}; //'2018-07-04 00:00:00.000000000' + try (ColumnVector timestampColumnVector = ColumnVector.timestampNanoSecondsFromLongs(TIMES_NS); + ColumnVector result = timestampColumnVector.dateTimeFloor(DateTimeRoundingFrequency.DAY); + ColumnVector expected = ColumnVector.timestampNanoSecondsFromLongs(EXPECTED_NS)) { + assertColumnsAreEqual(expected, result); + } + + long[] EXPECTED_US = new long[]{ + -132019200000000L, //'1965-10-26 00:00:00.000000' + 1530662400000000L, 
//'2018-07-04 00:00:00.000000' + 1674604800000000L, //'2023-01-25 00:00:00.000000' + -132019200000000L, //'1965-10-26 00:00:00.000000' + 1530662400000000L}; //'2018-07-04 00:00:00.000000' + try (ColumnVector timestampColumnVector = ColumnVector.timestampMicroSecondsFromLongs(TIMES_US); + ColumnVector result = timestampColumnVector.dateTimeFloor(DateTimeRoundingFrequency.DAY); + ColumnVector expected = ColumnVector.timestampMicroSecondsFromLongs(EXPECTED_US)) { + assertColumnsAreEqual(expected, result); + } + } + + @Test + public void testRoundDays() { + long[] EXPECTED_NS = new long[]{ + -131932800000000000L, //'1965-10-27 00:00:00.000000000' + 1530748800000000000L, //'2018-07-05 00:00:00.000000000' + 1674604800000000000L, //'2023-01-25 00:00:00.000000000' + -131932800000000000L, //'1965-10-27 00:00:00.000000000' + 1530748800000000000L}; //'2018-07-05 00:00:00.000000000' + try (ColumnVector timestampColumnVector = ColumnVector.timestampNanoSecondsFromLongs(TIMES_NS); + ColumnVector result = timestampColumnVector.dateTimeRound(DateTimeRoundingFrequency.DAY); + ColumnVector expected = ColumnVector.timestampNanoSecondsFromLongs(EXPECTED_NS)) { + assertColumnsAreEqual(expected, result); + } + + long[] EXPECTED_US = new long[]{ + -131932800000000L, //'1965-10-27 00:00:00.000000' + 1530748800000000L, //'2018-07-05 00:00:00.000000' + 1674604800000000L, //'2023-01-25 00:00:00.000000' + -131932800000000L, //'1965-10-27 00:00:00.000000' + 1530748800000000L}; //'2018-07-05 00:00:00.000000' + try (ColumnVector timestampColumnVector = ColumnVector.timestampMicroSecondsFromLongs(TIMES_US); + ColumnVector result = timestampColumnVector.dateTimeRound(DateTimeRoundingFrequency.DAY); + ColumnVector expected = ColumnVector.timestampMicroSecondsFromLongs(EXPECTED_US)) { + assertColumnsAreEqual(expected, result); + } + } + @Test public void testCastToTimestamp() { try (ColumnVector timestampMillis = ColumnVector.timestampMilliSecondsFromLongs(TIMES_MS); diff --git a/pyproject.toml b/pyproject.toml index 8f9aa165e5a..6933484f4e7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,6 +34,8 @@ select = [ "F", # pycodestyle Warning "W", + # isort + "I", # no-blank-line-before-function "D201", # one-blank-line-after-class @@ -90,6 +92,8 @@ select = [ "UP007", # Import from `collections.abc` instead: `Callable` "UP035", + # usage of legacy `np.random` function calls + "NPY002", ] ignore = [ # whitespace before : diff --git a/python/cudf/benchmarks/API/bench_functions.py b/python/cudf/benchmarks/API/bench_functions.py index 93109838900..f902111b0db 100644 --- a/python/cudf/benchmarks/API/bench_functions.py +++ b/python/cudf/benchmarks/API/bench_functions.py @@ -72,12 +72,13 @@ def bench_pivot_table_simple(benchmark, dataframe): @pytest_cases.parametrize("nr", NUM_ROWS) def bench_crosstab_simple(benchmark, nr): + rng = np.random.default_rng(seed=0) series_a = np.array(["foo", "bar"] * nr) series_b = np.array(["one", "two"] * nr) series_c = np.array(["dull", "shiny"] * nr) - np.random.shuffle(series_a) - np.random.shuffle(series_b) - np.random.shuffle(series_c) + rng.shuffle(series_a) + rng.shuffle(series_b) + rng.shuffle(series_c) series_a = cudf.Series(series_a) series_b = cudf.Series(series_b) series_c = cudf.Series(series_c) diff --git a/python/cudf/benchmarks/API/bench_multiindex.py b/python/cudf/benchmarks/API/bench_multiindex.py index 6268bcc4267..77004c3313e 100644 --- a/python/cudf/benchmarks/API/bench_multiindex.py +++ b/python/cudf/benchmarks/API/bench_multiindex.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, 
NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. """Benchmarks of MultiIndex methods.""" @@ -11,16 +11,18 @@ @pytest.fixture def pidx(): num_elements = int(1e3) - a = np.random.randint(0, num_elements // 10, num_elements) - b = np.random.randint(0, num_elements // 10, num_elements) + rng = np.random.default_rng(seed=0) + a = rng.integers(0, num_elements // 10, num_elements) + b = rng.integers(0, num_elements // 10, num_elements) return pd.MultiIndex.from_arrays([a, b], names=("a", "b")) @pytest.fixture def midx(pidx): num_elements = int(1e3) - a = np.random.randint(0, num_elements // 10, num_elements) - b = np.random.randint(0, num_elements // 10, num_elements) + rng = np.random.default_rng(seed=0) + a = rng.integers(0, num_elements // 10, num_elements) + b = rng.integers(0, num_elements // 10, num_elements) df = cudf.DataFrame({"a": a, "b": b}) return cudf.MultiIndex.from_frame(df) diff --git a/python/cudf/benchmarks/conftest.py b/python/cudf/benchmarks/conftest.py index 7b2b71cf216..0e4afadccf5 100644 --- a/python/cudf/benchmarks/conftest.py +++ b/python/cudf/benchmarks/conftest.py @@ -56,27 +56,23 @@ # into the main repo. sys.path.insert(0, os.path.join(os.path.dirname(__file__), "common")) -from config import cudf # noqa: W0611, E402, F401 -from utils import ( # noqa: E402 - OrderedSet, - collapse_fixtures, - column_generators, - make_fixture, -) - # Turn off isort until we upgrade to 5.8.0 # https://github.com/pycqa/isort/issues/1594 -# isort: off from config import ( # noqa: W0611, E402, F401 NUM_COLS, NUM_ROWS, collect_ignore, + cudf, # noqa: W0611, E402, F401 pytest_collection_modifyitems, pytest_sessionfinish, pytest_sessionstart, ) - -# isort: on +from utils import ( # noqa: E402 + OrderedSet, + collapse_fixtures, + column_generators, + make_fixture, +) @pytest_cases.fixture(params=[0, 1], ids=["AxisIndex", "AxisColumn"]) diff --git a/python/cudf/cudf/_fuzz_testing/avro.py b/python/cudf/cudf/_fuzz_testing/avro.py index d9974037daa..172193aa672 100644 --- a/python/cudf/cudf/_fuzz_testing/avro.py +++ b/python/cudf/cudf/_fuzz_testing/avro.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
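[Editor's note] The hunks above and below all make the same substitution: ruff's NPY002 rule flags the legacy np.random.* free functions, so each call site now constructs an explicit numpy.random.Generator. A minimal sketch of the correspondence, assuming only NumPy (the seed and data values are illustrative):

import numpy as np

rng = np.random.default_rng(seed=0)    # replaces np.random.seed(0)

vals = np.array(["foo", "bar"] * 4)
rng.shuffle(vals)                      # replaces np.random.shuffle(vals)
ints = rng.integers(0, 10, size=8)     # replaces np.random.randint(0, 10, 8)
pick = rng.choice([None, 5])           # replaces np.random.choice([None, 5])

Note that Generator.integers excludes the upper bound by default, matching np.random.randint, whereas the removed np.random.random_integers (rewritten in fuzz_test_parquet.py further below) was inclusive of it, so that one call is not bit-for-bit identical after the migration.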
import copy import io @@ -68,12 +68,12 @@ def generate_input(self): # https://github.com/rapidsai/cudf/issues/6604 - cudf.utils.dtypes.TIMEDELTA_TYPES ) - + seed = random.randint(0, 2**32 - 1) dtypes_meta, num_rows, num_cols = _generate_rand_meta( - self, dtypes_list + self, dtypes_list, seed ) self._current_params["dtypes_meta"] = dtypes_meta - seed = random.randint(0, 2**32 - 1) + self._current_params["seed"] = seed self._current_params["num_rows"] = num_rows self._current_params["num_cols"] = num_cols @@ -100,17 +100,18 @@ def write_data(self, file_name): def set_rand_params(self, params): params_dict = {} + rng = np.random.default_rng(seed=None) for param, values in params.items(): if values == ALL_POSSIBLE_VALUES: if param == "columns": col_size = self._rand(len(self._df.columns)) params_dict[param] = list( - np.unique(np.random.choice(self._df.columns, col_size)) + np.unique(rng.choice(self._df.columns, col_size)) ) elif param in ("skiprows", "num_rows"): - params_dict[param] = np.random.choice( + params_dict[param] = rng.choice( [None, self._rand(len(self._df))] ) else: - params_dict[param] = np.random.choice(values) + params_dict[param] = rng.choice(values) self._current_params["test_kwargs"] = self.process_kwargs(params_dict) diff --git a/python/cudf/cudf/_fuzz_testing/csv.py b/python/cudf/cudf/_fuzz_testing/csv.py index 67211a1c4bf..fa3ed40ce91 100644 --- a/python/cudf/cudf/_fuzz_testing/csv.py +++ b/python/cudf/cudf/_fuzz_testing/csv.py @@ -54,7 +54,7 @@ def generate_input(self): random.seed(seed) dtypes_list = list(cudf.utils.dtypes.ALL_TYPES) dtypes_meta, num_rows, num_cols = _generate_rand_meta( - self, dtypes_list + self, dtypes_list, seed ) self._current_params["dtypes_meta"] = dtypes_meta self._current_params["seed"] = seed @@ -77,25 +77,22 @@ def write_data(self, file_name): def set_rand_params(self, params): params_dict = {} + rng = np.random.default_rng(seed=None) for param, values in params.items(): if values == ALL_POSSIBLE_VALUES: if param == "usecols": col_size = self._rand(len(self._df.columns)) - col_val = np.random.choice( + col_val = rng.choice( [ None, - np.unique( - np.random.choice(self._df.columns, col_size) - ), + np.unique(rng.choice(self._df.columns, col_size)), ] ) params_dict[param] = ( col_val if col_val is None else list(col_val) ) elif param == "dtype": - dtype_val = np.random.choice( - [None, self._df.dtypes.to_dict()] - ) + dtype_val = rng.choice([None, self._df.dtypes.to_dict()]) if dtype_val is not None: dtype_val = { col_name: "category" @@ -105,25 +102,25 @@ def set_rand_params(self, params): } params_dict[param] = dtype_val elif param == "header": - header_val = np.random.choice( - ["infer", np.random.randint(low=0, high=len(self._df))] + header_val = rng.choice( + ["infer", rng.integers(low=0, high=len(self._df))] ) params_dict[param] = header_val elif param == "skiprows": - params_dict[param] = np.random.randint( + params_dict[param] = rng.integers( low=0, high=len(self._df) ) elif param == "skipfooter": - params_dict[param] = np.random.randint( + params_dict[param] = rng.integers( low=0, high=len(self._df) ) elif param == "nrows": - nrows_val = np.random.choice( - [None, np.random.randint(low=0, high=len(self._df))] + nrows_val = rng.choice( + [None, rng.integers(low=0, high=len(self._df))] ) params_dict[param] = nrows_val else: - params_dict[param] = np.random.choice(values) + params_dict[param] = rng.choice(values) self._current_params["test_kwargs"] = self.process_kwargs(params_dict) @@ -159,7 +156,7 @@ def generate_input(self): 
random.seed(seed) dtypes_list = list(cudf.utils.dtypes.ALL_TYPES) dtypes_meta, num_rows, num_cols = _generate_rand_meta( - self, dtypes_list + self, dtypes_list, seed ) self._current_params["dtypes_meta"] = dtypes_meta self._current_params["seed"] = seed @@ -182,26 +179,25 @@ def write_data(self, file_name): def set_rand_params(self, params): params_dict = {} + rng = np.random.default_rng(seed=None) for param, values in params.items(): if values == ALL_POSSIBLE_VALUES: if param == "columns": col_size = self._rand(len(self._current_buffer.columns)) params_dict[param] = list( np.unique( - np.random.choice( - self._current_buffer.columns, col_size - ) + rng.choice(self._current_buffer.columns, col_size) ) ) elif param == "chunksize": - params_dict[param] = np.random.choice( + params_dict[param] = rng.choice( [ None, - np.random.randint( + rng.integers( low=1, high=max(1, len(self._current_buffer)) ), ] ) else: - params_dict[param] = np.random.choice(values) + params_dict[param] = rng.choice(values) self._current_params["test_kwargs"] = self.process_kwargs(params_dict) diff --git a/python/cudf/cudf/_fuzz_testing/io.py b/python/cudf/cudf/_fuzz_testing/io.py index ffb7171a855..a4b8e18d8b4 100644 --- a/python/cudf/cudf/_fuzz_testing/io.py +++ b/python/cudf/cudf/_fuzz_testing/io.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. import copy import json @@ -91,8 +91,9 @@ def get_next_regression_params(self): return dtypes_meta, num_rows, num_cols, seed def set_rand_params(self, params): + rng = np.random.default_rng(seed=None) params_dict = { - param: np.random.choice(values) for param, values in params.items() + param: rng.choice(values) for param, values in params.items() } self._current_params["test_kwargs"] = self.process_kwargs( params_dict=params_dict diff --git a/python/cudf/cudf/_fuzz_testing/json.py b/python/cudf/cudf/_fuzz_testing/json.py index e987529c8ba..45d2c8d8cf0 100644 --- a/python/cudf/cudf/_fuzz_testing/json.py +++ b/python/cudf/cudf/_fuzz_testing/json.py @@ -80,7 +80,7 @@ def generate_input(self): # https://github.com/rapidsai/cudf/issues/7086 # dtypes_list.extend(["list"]) dtypes_meta, num_rows, num_cols = _generate_rand_meta( - self, dtypes_list + self, dtypes_list, seed ) self._current_params["dtypes_meta"] = dtypes_meta self._current_params["seed"] = seed @@ -105,14 +105,15 @@ def write_data(self, file_name): def set_rand_params(self, params): params_dict = {} + rng = np.random.default_rng(seed=None) for param, values in params.items(): if param == "dtype" and values == ALL_POSSIBLE_VALUES: - dtype_val = np.random.choice( + dtype_val = rng.choice( [True, self._current_buffer.dtypes.to_dict()] ) params_dict[param] = _get_dtype_param_value(dtype_val) else: - params_dict[param] = np.random.choice(values) + params_dict[param] = rng.choice(values) self._current_params["test_kwargs"] = self.process_kwargs(params_dict) @@ -155,7 +156,7 @@ def generate_input(self): # https://github.com/rapidsai/cudf/issues/7086 # dtypes_list.extend(["list"]) dtypes_meta, num_rows, num_cols = _generate_rand_meta( - self, dtypes_list + self, dtypes_list, seed ) self._current_params["dtypes_meta"] = dtypes_meta self._current_params["seed"] = seed @@ -180,12 +181,13 @@ def write_data(self, file_name): def set_rand_params(self, params): params_dict = {} + rng = np.random.default_rng(seed=None) for param, values in params.items(): if param == "dtype" and values == ALL_POSSIBLE_VALUES: - dtype_val = np.random.choice( + dtype_val = rng.choice( 
[True, self._current_buffer.dtypes.to_dict()] ) params_dict[param] = _get_dtype_param_value(dtype_val) else: - params_dict[param] = np.random.choice(values) + params_dict[param] = rng.choice(values) self._current_params["test_kwargs"] = self.process_kwargs(params_dict) diff --git a/python/cudf/cudf/_fuzz_testing/orc.py b/python/cudf/cudf/_fuzz_testing/orc.py index ecddc72fa85..4d9e4abb09e 100644 --- a/python/cudf/cudf/_fuzz_testing/orc.py +++ b/python/cudf/cudf/_fuzz_testing/orc.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. import copy import io @@ -62,13 +62,11 @@ def generate_input(self): - cudf.utils.dtypes.UNSIGNED_TYPES - {"datetime64[ns]"} ) - + seed = random.randint(0, 2**32 - 1) dtypes_meta, num_rows, num_cols = _generate_rand_meta( - self, dtypes_list + self, dtypes_list, seed ) - self._current_params["dtypes_meta"] = dtypes_meta - seed = random.randint(0, 2**32 - 1) self._current_params["seed"] = seed self._current_params["num_rows"] = num_rows self._current_params["num_cols"] = num_cols @@ -94,42 +92,41 @@ def write_data(self, file_name): def set_rand_params(self, params): params_dict = {} + rng = np.random.default_rng(seed=None) for param, values in params.items(): if values == ALL_POSSIBLE_VALUES: if param == "columns": col_size = self._rand(len(self._df.columns)) params_dict[param] = list( - np.unique(np.random.choice(self._df.columns, col_size)) + np.unique(rng.choice(self._df.columns, col_size)) ) elif param == "stripes": f = io.BytesIO(self._current_buffer) orcFile = pa.orc.ORCFile(f) stripes = list(range(orcFile.nstripes)) - params_dict[param] = np.random.choice( + params_dict[param] = rng.choice( [ None, list( map( int, np.unique( - np.random.choice( - stripes, orcFile.nstripes - ) + rng.choice(stripes, orcFile.nstripes) ), ) ), ] ) elif param == "use_index": - params_dict[param] = np.random.choice([True, False]) + params_dict[param] = rng.choice([True, False]) elif param in ("skiprows", "num_rows"): - params_dict[param] = np.random.choice( + params_dict[param] = rng.choice( [None, self._rand(len(self._df))] ) else: if not isinstance(values, list): raise TypeError("values must be of type list") - params_dict[param] = np.random.choice(values) + params_dict[param] = rng.choice(values) self._current_params["test_kwargs"] = self.process_kwargs(params_dict) @@ -177,12 +174,11 @@ def generate_input(self): # https://github.com/rapidsai/cudf/issues/7355 - cudf.utils.dtypes.DATETIME_TYPES ) - + seed = random.randint(0, 2**32 - 1) dtypes_meta, num_rows, num_cols = _generate_rand_meta( - self, dtypes_list + self, dtypes_list, seed ) self._current_params["dtypes_meta"] = dtypes_meta - seed = random.randint(0, 2**32 - 1) self._current_params["seed"] = seed self._current_params["num_rows"] = num_rows self._current_params["num_cols"] = num_cols diff --git a/python/cudf/cudf/_fuzz_testing/parquet.py b/python/cudf/cudf/_fuzz_testing/parquet.py index 2d934e4816d..bd3df1b0847 100644 --- a/python/cudf/cudf/_fuzz_testing/parquet.py +++ b/python/cudf/cudf/_fuzz_testing/parquet.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 import logging
 import random

@@ -59,12 +59,11 @@ def generate_input(self):
             - {"uint32"}
             | {"list", "decimal64"}
         )
-
+        seed = random.randint(0, 2**32 - 1)
         dtypes_meta, num_rows, num_cols = _generate_rand_meta(
-            self, dtypes_list
+            self, dtypes_list, seed
         )
         self._current_params["dtypes_meta"] = dtypes_meta
-        seed = random.randint(0, 2**32 - 1)
         self._current_params["seed"] = seed
         self._current_params["num_rows"] = num_rows
         self._current_params["num_cols"] = num_cols
@@ -96,14 +95,15 @@ def write_data(self, file_name):

     def set_rand_params(self, params):
         params_dict = {}
+        rng = np.random.default_rng(seed=None)
         for param, values in params.items():
             if param == "columns" and values == ALL_POSSIBLE_VALUES:
                 col_size = self._rand(len(self._df.columns))
                 params_dict[param] = list(
-                    np.unique(np.random.choice(self._df.columns, col_size))
+                    np.unique(rng.choice(self._df.columns, col_size))
                 )
             else:
-                params_dict[param] = np.random.choice(values)
+                params_dict[param] = rng.choice(values)

         self._current_params["test_kwargs"] = self.process_kwargs(params_dict)
@@ -146,7 +146,7 @@ def generate_input(self):
             | {"list", "decimal64"}
         )
         dtypes_meta, num_rows, num_cols = _generate_rand_meta(
-            self, dtypes_list
+            self, dtypes_list, seed
         )
         self._current_params["dtypes_meta"] = dtypes_meta
         self._current_params["seed"] = seed
diff --git a/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_parquet.py b/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_parquet.py
index 3d070576a12..bbc19dce1a4 100644
--- a/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_parquet.py
+++ b/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_parquet.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2022, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.

 import sys

@@ -68,7 +68,9 @@ def parquet_writer_test(pdf):
 @pythonfuzz(
     data_handle=ParquetWriter,
     params={
-        "row_group_size": np.random.random_integers(1, 10000, 100),
+        "row_group_size": np.random.default_rng(seed=0).integers(
+            1, 10000, 100
+        ),
         "compression": ["snappy", None],
     },
 )
diff --git a/python/cudf/cudf/_fuzz_testing/utils.py b/python/cudf/cudf/_fuzz_testing/utils.py
index 8ce92e1c0f6..4cadb3a109c 100644
--- a/python/cudf/cudf/_fuzz_testing/utils.py
+++ b/python/cudf/cudf/_fuzz_testing/utils.py
@@ -40,8 +40,11 @@
 }


-def _generate_rand_meta(obj, dtypes_list, null_frequency_override=None):
+def _generate_rand_meta(
+    obj, dtypes_list, seed=0, null_frequency_override=None
+):
     obj._current_params = {}
+    rng = np.random.default_rng(seed=seed)
     num_rows = obj._rand(obj._max_rows)
     num_cols = obj._rand(obj._max_columns)
@@ -69,12 +72,12 @@ def _generate_rand_meta(obj, dtypes_list, null_frequency_override=None):
             meta["max_string_length"] = obj._max_string_length
         elif dtype == "list":
             if obj._max_lists_length is None:
-                meta["lists_max_length"] = np.random.randint(0, 2000000000)
+                meta["lists_max_length"] = rng.integers(0, 2000000000)
             else:
                 meta["lists_max_length"] = obj._max_lists_length

             if obj._max_lists_nesting_depth is None:
-                meta["nesting_max_depth"] = np.random.randint(
+                meta["nesting_max_depth"] = rng.integers(
                     1, np.iinfo("int64").max
                 )
             else:
@@ -85,7 +88,7 @@ def _generate_rand_meta(obj, dtypes_list, null_frequency_override=None):
             )
         elif dtype == "struct":
             if obj._max_lists_nesting_depth is None:
-                meta["nesting_max_depth"] = np.random.randint(2, 10)
+                meta["nesting_max_depth"] = rng.integers(2, 10)
             else:
                 meta["nesting_max_depth"] = obj._max_lists_nesting_depth
@@ -95,9 +98,7 @@ def _generate_rand_meta(obj, dtypes_list, null_frequency_override=None):
                 meta["max_null_frequency"] = 
obj._max_struct_null_frequency if obj._max_struct_types_at_each_level is None: - meta["max_types_at_each_level"] = np.random.randint( - low=1, high=10 - ) + meta["max_types_at_each_level"] = rng.integers(low=1, high=10) else: meta["max_types_at_each_level"] = ( obj._max_struct_types_at_each_level diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt index 5d4b5421f16..2958c286d20 100644 --- a/python/cudf/cudf/_lib/CMakeLists.txt +++ b/python/cudf/cudf/_lib/CMakeLists.txt @@ -14,27 +14,22 @@ set(cython_sources aggregation.pyx - avro.pyx binaryop.pyx column.pyx - concat.pyx copying.pyx csv.pyx datetime.pyx filling.pyx groupby.pyx - hash.pyx interop.pyx join.pyx json.pyx - labeling.pyx lists.pyx merge.pyx null_mask.pyx orc.pyx parquet.pyx partitioning.pyx - quantiles.pyx reduce.pyx replace.pyx reshape.pyx @@ -51,7 +46,6 @@ set(cython_sources transform.pyx transpose.pyx types.pyx - unary.pyx utils.pyx ) set(linked_libraries cudf::cudf) diff --git a/python/cudf/cudf/_lib/__init__.py b/python/cudf/cudf/_lib/__init__.py index 918edb6d3f1..19dc4488560 100644 --- a/python/cudf/cudf/_lib/__init__.py +++ b/python/cudf/cudf/_lib/__init__.py @@ -2,26 +2,21 @@ import numpy as np from . import ( - avro, binaryop, - concat, copying, csv, datetime, filling, groupby, - hash, interop, join, json, - labeling, merge, null_mask, nvtext, orc, parquet, partitioning, - quantiles, reduce, replace, reshape, @@ -36,7 +31,6 @@ text, timezone, transpose, - unary, ) MAX_COLUMN_SIZE = np.iinfo(np.int32).max diff --git a/python/cudf/cudf/_lib/aggregation.pyx b/python/cudf/cudf/_lib/aggregation.pyx index 7c91533cf93..3c96b90f0a1 100644 --- a/python/cudf/cudf/_lib/aggregation.pyx +++ b/python/cudf/cudf/_lib/aggregation.pyx @@ -78,8 +78,11 @@ class Aggregation: ) @classmethod - def nunique(cls): - return cls(pylibcudf.aggregation.nunique(pylibcudf.types.NullPolicy.EXCLUDE)) + def nunique(cls, dropna=True): + return cls(pylibcudf.aggregation.nunique( + pylibcudf.types.NullPolicy.EXCLUDE + if dropna else pylibcudf.types.NullPolicy.INCLUDE + )) @classmethod def nth(cls, size): diff --git a/python/cudf/cudf/_lib/avro.pyx b/python/cudf/cudf/_lib/avro.pyx deleted file mode 100644 index b1759635a36..00000000000 --- a/python/cudf/cudf/_lib/avro.pyx +++ /dev/null @@ -1,33 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from cudf._lib.utils cimport data_from_pylibcudf_io - -import pylibcudf as plc -from pylibcudf.io.types import SourceInfo - - -cpdef read_avro(datasource, columns=None, skip_rows=0, num_rows=-1): - """ - Cython function to call libcudf read_avro, see `read_avro`. 
- - See Also - -------- - cudf.io.avro.read_avro - """ - - num_rows = -1 if num_rows is None else num_rows - skip_rows = 0 if skip_rows is None else skip_rows - - if not isinstance(num_rows, int) or num_rows < -1: - raise TypeError("num_rows must be an int >= -1") - if not isinstance(skip_rows, int) or skip_rows < 0: - raise TypeError("skip_rows must be an int >= 0") - - return data_from_pylibcudf_io( - plc.io.avro.read_avro( - SourceInfo([datasource]), - columns, - skip_rows, - num_rows - ) - ) diff --git a/python/cudf/cudf/_lib/column.pxd b/python/cudf/cudf/_lib/column.pxd index 8ceea4920e2..8b1d16f0d85 100644 --- a/python/cudf/cudf/_lib/column.pxd +++ b/python/cudf/cudf/_lib/column.pxd @@ -11,7 +11,7 @@ from pylibcudf.libcudf.column.column_view cimport ( mutable_column_view, ) from pylibcudf.libcudf.types cimport size_type -from rmm._lib.device_buffer cimport device_buffer +from rmm.librmm.device_buffer cimport device_buffer cdef class Column: diff --git a/python/cudf/cudf/_lib/column.pyi b/python/cudf/cudf/_lib/column.pyi index bb38488eefb..bdd90be45b8 100644 --- a/python/cudf/cudf/_lib/column.pyi +++ b/python/cudf/cudf/_lib/column.pyi @@ -2,8 +2,12 @@ from __future__ import annotations +from typing import Literal + from typing_extensions import Self +import pylibcudf as plc + from cudf._typing import Dtype, DtypeObj, ScalarLike from cudf.core.buffer import Buffer from cudf.core.column import ColumnBase @@ -71,3 +75,8 @@ class Column: # TODO: The val parameter should be Scalar, not ScalarLike @staticmethod def from_scalar(val: ScalarLike, size: int) -> ColumnBase: ... + @staticmethod + def from_pylibcudf( + col: plc.Column, data_ptr_exposed: bool = False + ) -> ColumnBase: ... + def to_pylibcudf(self, mode: Literal["read", "write"]) -> plc.Column: ... diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index 99e4c21df8a..94dbdf5534d 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -28,7 +28,7 @@ from libcpp.memory cimport make_unique, unique_ptr from libcpp.utility cimport move from libcpp.vector cimport vector -from rmm._lib.device_buffer cimport DeviceBuffer +from rmm.pylibrmm.device_buffer cimport DeviceBuffer from cudf._lib.types cimport ( dtype_from_column_view, @@ -688,15 +688,18 @@ cdef class Column: # special case for string column is_string_column = (cv.type().id() == libcudf_types.type_id.STRING) if is_string_column: - # get the size from offset child column (device to host copy) - offsets_column_index = 0 - offset_child_column = cv.child(offsets_column_index) - if offset_child_column.size() == 0: + if cv.num_children() == 0: base_nbytes = 0 else: - chars_size = get_element( - offset_child_column, offset_child_column.size()-1).value - base_nbytes = chars_size + # get the size from offset child column (device to host copy) + offsets_column_index = 0 + offset_child_column = cv.child(offsets_column_index) + if offset_child_column.size() == 0: + base_nbytes = 0 + else: + chars_size = get_element( + offset_child_column, offset_child_column.size()-1).value + base_nbytes = chars_size if data_ptr: if data_owner is None: diff --git a/python/cudf/cudf/_lib/concat.pyx b/python/cudf/cudf/_lib/concat.pyx deleted file mode 100644 index e6c2d136f0d..00000000000 --- a/python/cudf/cudf/_lib/concat.pyx +++ /dev/null @@ -1,35 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
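[Editor's note] With the avro.pyx removal above, the thin Cython wrapper is gone and callers are expected to invoke pylibcudf directly. A hedged sketch of the equivalent call, mirroring the argument order the deleted wrapper used; "example.avro" is a hypothetical input, and the result is a pylibcudf table-with-metadata object holding the columns and their names:

import pylibcudf as plc
from pylibcudf.io.types import SourceInfo

# Mirrors the deleted cudf._lib.avro.read_avro body: select all columns,
# skip no rows, and read to the end of the file (num_rows=-1).
result = plc.io.avro.read_avro(
    SourceInfo(["example.avro"]),  # one or more paths/buffers
    None,   # columns: None selects every column
    0,      # skip_rows
    -1,     # num_rows
)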
- -from libcpp cimport bool - -from cudf._lib.column cimport Column -from cudf._lib.utils cimport data_from_pylibcudf_table - -import pylibcudf - -from cudf.core.buffer import acquire_spill_lock - - -@acquire_spill_lock() -def concat_columns(object columns): - return Column.from_pylibcudf( - pylibcudf.concatenate.concatenate( - [col.to_pylibcudf(mode="read") for col in columns] - ) - ) - - -@acquire_spill_lock() -def concat_tables(object tables, bool ignore_index=False): - plc_tables = [] - for table in tables: - cols = table._columns - if not ignore_index: - cols = table._index._columns + cols - plc_tables.append(pylibcudf.Table([c.to_pylibcudf(mode="read") for c in cols])) - - return data_from_pylibcudf_table( - pylibcudf.concatenate.concatenate(plc_tables), - column_names=tables[0]._column_names, - index_names=None if ignore_index else tables[0]._index_names - ) diff --git a/python/cudf/cudf/_lib/copying.pxd b/python/cudf/cudf/_lib/copying.pxd deleted file mode 100644 index 14c7d2066d8..00000000000 --- a/python/cudf/cudf/_lib/copying.pxd +++ /dev/null @@ -1,10 +0,0 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. - -from pylibcudf.libcudf.contiguous_split cimport packed_columns - - -cdef class _CPackedColumns: - cdef packed_columns c_obj - cdef object column_names - cdef object column_dtypes - cdef object index_names diff --git a/python/cudf/cudf/_lib/copying.pyx b/python/cudf/cudf/_lib/copying.pyx index 49714091f46..c478cd1a990 100644 --- a/python/cudf/cudf/_lib/copying.pyx +++ b/python/cudf/cudf/_lib/copying.pyx @@ -1,46 +1,30 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. -import pickle - -from libc.stdint cimport uint8_t, uintptr_t from libcpp cimport bool -from libcpp.memory cimport make_shared, shared_ptr, unique_ptr +from libcpp.memory cimport unique_ptr from libcpp.utility cimport move -from libcpp.vector cimport vector - -from rmm._lib.device_buffer cimport DeviceBuffer - import pylibcudf import cudf -from cudf.core.buffer import Buffer, acquire_spill_lock, as_buffer - +from cudf.core.buffer import acquire_spill_lock, as_buffer +from cudf.core.abc import Serializable from cudf._lib.column cimport Column from cudf._lib.scalar import as_device_scalar from cudf._lib.scalar cimport DeviceScalar -from cudf._lib.utils cimport table_view_from_table from cudf._lib.reduce import minmax -from cudf.core.abc import Serializable from libcpp.memory cimport make_unique -cimport pylibcudf.libcudf.contiguous_split as cpp_contiguous_split from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.lists.gather cimport ( - segmented_gather as cpp_segmented_gather, -) -from pylibcudf.libcudf.lists.lists_column_view cimport lists_column_view -from pylibcudf.libcudf.scalar.scalar cimport scalar from pylibcudf.libcudf.types cimport size_type -from cudf._lib.utils cimport columns_from_pylibcudf_table, data_from_table_view - -# workaround for https://github.com/cython/cython/issues/3885 -ctypedef const scalar constscalar +from cudf._lib.utils cimport columns_from_pylibcudf_table, data_from_pylibcudf_table +import pylibcudf as plc +from pylibcudf.contiguous_split cimport PackedColumns as PlcPackedColumns def _gather_map_is_valid( @@ -339,74 +323,37 @@ def get_element(Column input_column, size_type index): ) -@acquire_spill_lock() -def segmented_gather(Column source_column, Column gather_map): - cdef shared_ptr[lists_column_view] source_LCV = ( - make_shared[lists_column_view](source_column.view()) - ) - cdef 
shared_ptr[lists_column_view] gather_map_LCV = (
-        make_shared[lists_column_view](gather_map.view())
-    )
-    cdef unique_ptr[column] c_result
-
-    with nogil:
-        c_result = move(
-            cpp_segmented_gather(
-                source_LCV.get()[0], gather_map_LCV.get()[0])
-        )
-
-    result = Column.from_unique_ptr(move(c_result))
-    return result
-
-
-cdef class _CPackedColumns:
-
-    @staticmethod
-    def from_py_table(input_table, keep_index=True):
-        """
-        Construct a ``PackedColumns`` object from a ``cudf.DataFrame``.
-        """
-        import cudf.core.dtypes
-
-        cdef _CPackedColumns p = _CPackedColumns.__new__(_CPackedColumns)
-
-        if keep_index and (
-            not isinstance(input_table.index, cudf.RangeIndex)
-            or input_table.index.start != 0
-            or input_table.index.stop != len(input_table)
-            or input_table.index.step != 1
-        ):
-            input_table_view = table_view_from_table(input_table)
-            p.index_names = input_table._index_names
-        else:
-            input_table_view = table_view_from_table(
-                input_table, ignore_index=True)
-
-        p.column_names = input_table._column_names
-        p.column_dtypes = {}
-        for name, col in input_table._column_labels_and_values:
-            if isinstance(col.dtype, cudf.core.dtypes._BaseDtype):
-                p.column_dtypes[name] = col.dtype
-
-        p.c_obj = move(cpp_contiguous_split.pack(input_table_view))
+class PackedColumns(Serializable):
+    """
+    A packed representation of a Frame, with all columns residing
+    in a single GPU memory buffer.
+    """

-        return p
+    def __init__(
+        self,
+        PlcPackedColumns data,
+        object column_names = None,
+        object index_names = None,
+        object column_dtypes = None
+    ):
+        self._metadata, self._gpu_data = data.release()
+        self.column_names=column_names
+        self.index_names=index_names
+        self.column_dtypes=column_dtypes

-    @property
-    def gpu_data_ptr(self):
-        return int(self.c_obj.gpu_data.get()[0].data())
+    def __reduce__(self):
+        return self.deserialize, self.serialize()

     @property
-    def gpu_data_size(self):
-        return int(self.c_obj.gpu_data.get()[0].size())
+    def __cuda_array_interface__(self):
+        return self._gpu_data.__cuda_array_interface__

     def serialize(self):
         header = {}
         frames = []
-        gpu_data = as_buffer(
-            data=self.gpu_data_ptr,
-            size=self.gpu_data_size,
+        gpu_data = as_buffer(
+            data=self._gpu_data.obj.ptr,
+            size=self._gpu_data.obj.size,
             owner=self,
             exposed=True
         )
@@ -416,65 +363,82 @@ cdef class _CPackedColumns:
         header["column-names"] = self.column_names
         header["index-names"] = self.index_names
-        if self.c_obj.metadata.get()[0].data() != NULL:
-            header["metadata"] = list(
-                <uint8_t[:self.c_obj.metadata.get()[0].size()]>
-                self.c_obj.metadata.get()[0].data()
-            )
-
-        column_dtypes = {}
+        header["metadata"] = self._metadata.tobytes()
         for name, dtype in self.column_dtypes.items():
-            dtype_header, dtype_frames = dtype.serialize()
-            column_dtypes[name] = (
+            dtype_header, dtype_frames = dtype.device_serialize()
+            self.column_dtypes[name] = (
                 dtype_header,
                 (len(frames), len(frames) + len(dtype_frames)),
             )
             frames.extend(dtype_frames)
-        header["column-dtypes"] = column_dtypes
-
+        header["column-dtypes"] = self.column_dtypes
         return header, frames

-    @staticmethod
-    def deserialize(header, frames):
-        cdef _CPackedColumns p = _CPackedColumns.__new__(_CPackedColumns)
-
-        gpu_data = Buffer.deserialize(header["data"], frames)
-
-        dbuf = DeviceBuffer(
-            ptr=gpu_data.get_ptr(mode="write"),
-            size=gpu_data.nbytes
-        )
-
-        cdef cpp_contiguous_split.packed_columns data
-        data.metadata = move(
-            make_unique[vector[uint8_t]](
-                move(header.get("metadata", []))
+    @classmethod
+    def deserialize(cls, header, frames):
+        column_dtypes = {}
+        for name, dtype in header["column-dtypes"].items():
+            dtype_header, 
(start, stop) = dtype + column_dtypes[name] = Serializable.device_deserialize( + dtype_header, frames[start:stop] ) + return cls( + plc.contiguous_split.pack( + plc.contiguous_split.unpack_from_memoryviews( + memoryview(header["metadata"]), + plc.gpumemoryview(frames[0]), + ) + ), + header["column-names"], + header["index-names"], + column_dtypes, ) - data.gpu_data = move(dbuf.c_obj) - p.c_obj = move(data) - p.column_names = header["column-names"] - p.index_names = header["index-names"] + @classmethod + def from_py_table(cls, input_table, keep_index=True): + if keep_index and ( + not isinstance(input_table.index, cudf.RangeIndex) + or input_table.index.start != 0 + or input_table.index.stop != len(input_table) + or input_table.index.step != 1 + ): + columns = input_table._index._columns + input_table._columns + index_names = input_table._index_names + else: + columns = input_table._columns + index_names = None + column_names = input_table._column_names column_dtypes = {} - for name, dtype in header["column-dtypes"].items(): - dtype_header, (start, stop) = dtype - column_dtypes[name] = pickle.loads( - dtype_header["type-serialized"] - ).deserialize(dtype_header, frames[start:stop]) - p.column_dtypes = column_dtypes - - return p + for name, col in input_table._column_labels_and_values: + if isinstance( + col.dtype, + (cudf.core.dtypes._BaseDtype, cudf.core.dtypes.CategoricalDtype) + ): + column_dtypes[name] = col.dtype + + return cls( + plc.contiguous_split.pack( + plc.Table( + [ + col.to_pylibcudf(mode="read") for col in columns + ] + ) + ), + column_names, + index_names, + column_dtypes, + ) def unpack(self): - output_table = cudf.DataFrame._from_data(*data_from_table_view( - cpp_contiguous_split.unpack(self.c_obj), - self, + output_table = cudf.DataFrame._from_data(*data_from_pylibcudf_table( + plc.contiguous_split.unpack_from_memoryviews( + self._metadata, + self._gpu_data + ), self.column_names, self.index_names )) - for name, dtype in self.column_dtypes.items(): output_table._data[name] = ( output_table._data[name]._with_type_metadata(dtype) @@ -483,46 +447,6 @@ cdef class _CPackedColumns: return output_table -class PackedColumns(Serializable): - """ - A packed representation of a Frame, with all columns residing - in a single GPU memory buffer. - """ - - def __init__(self, data): - self._data = data - - def __reduce__(self): - return self.deserialize, self.serialize() - - @property - def __cuda_array_interface__(self): - return { - "data": (self._data.gpu_data_ptr, False), - "shape": (self._data.gpu_data_size,), - "strides": None, - "typestr": "|u1", - "version": 0 - } - - def serialize(self): - header, frames = self._data.serialize() - header["type-serialized"] = pickle.dumps(type(self)) - - return header, frames - - @classmethod - def deserialize(cls, header, frames): - return cls(_CPackedColumns.deserialize(header, frames)) - - @classmethod - def from_py_table(cls, input_table, keep_index=True): - return cls(_CPackedColumns.from_py_table(input_table, keep_index)) - - def unpack(self): - return self._data.unpack() - - def pack(input_table, keep_index=True): """ Pack the columns of a cudf Frame into a single GPU memory buffer. diff --git a/python/cudf/cudf/_lib/csv.pyx b/python/cudf/cudf/_lib/csv.pyx index 9ad96f610b3..59a970263e0 100644 --- a/python/cudf/cudf/_lib/csv.pyx +++ b/python/cudf/cudf/_lib/csv.pyx @@ -1,10 +1,6 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. 
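[Editor's note] The PackedColumns.deserialize path above rebuilds a PlcPackedColumns by unpacking the received metadata and GPU frame and immediately re-packing them. A rough standalone sketch of that same pack/unpack round trip, assuming pyarrow is available for table construction (the column contents are illustrative):

import pyarrow as pa
import pylibcudf as plc

table = plc.interop.from_arrow(pa.table({"a": [1, 2, 3]}))

packed = plc.contiguous_split.pack(table)   # PackedColumns
metadata, gpu_data = packed.release()       # host memoryview + gpumemoryview

# Reconstruct a table view over the same single GPU buffer.
unpacked = plc.contiguous_split.unpack_from_memoryviews(metadata, gpu_data)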
from libcpp cimport bool -from libcpp.memory cimport unique_ptr -from libcpp.string cimport string -from libcpp.utility cimport move -from libcpp.vector cimport vector cimport pylibcudf.libcudf.types as libcudf_types @@ -23,16 +19,7 @@ from cudf.core.buffer import acquire_spill_lock from libcpp cimport bool -from pylibcudf.libcudf.io.csv cimport ( - csv_writer_options, - write_csv as cpp_write_csv, -) -from pylibcudf.libcudf.io.data_sink cimport data_sink -from pylibcudf.libcudf.io.types cimport compression_type, sink_info -from pylibcudf.libcudf.table.table_view cimport table_view - -from cudf._lib.io.utils cimport make_sink_info -from cudf._lib.utils cimport data_from_pylibcudf_io, table_view_from_table +from cudf._lib.utils cimport data_from_pylibcudf_io import pylibcudf as plc @@ -148,13 +135,13 @@ def read_csv( byte_range = (0, 0) if compression is None: - c_compression = compression_type.NONE + c_compression = plc.io.types.CompressionType.NONE else: compression_map = { - "infer": compression_type.AUTO, - "gzip": compression_type.GZIP, - "bz2": compression_type.BZIP2, - "zip": compression_type.ZIP, + "infer": plc.io.types.CompressionType.AUTO, + "gzip": plc.io.types.CompressionType.GZIP, + "bz2": plc.io.types.CompressionType.BZIP2, + "zip": plc.io.types.CompressionType.ZIP, } c_compression = compression_map[compression] @@ -318,59 +305,40 @@ def write_csv( -------- cudf.to_csv """ - cdef table_view input_table_view = table_view_from_table( - table, not index - ) - cdef bool include_header_c = header - cdef char delim_c = ord(sep) - cdef string line_term_c = lineterminator.encode() - cdef string na_c = na_rep.encode() - cdef int rows_per_chunk_c = rows_per_chunk - cdef vector[string] col_names - cdef string true_value_c = 'True'.encode() - cdef string false_value_c = 'False'.encode() - cdef unique_ptr[data_sink] data_sink_c - cdef sink_info sink_info_c = make_sink_info(path_or_buf, data_sink_c) - - if header is True: - all_names = columns_apply_na_rep(table._column_names, na_rep) - if index is True: - all_names = table._index.names + all_names - - if len(all_names) > 0: - col_names.reserve(len(all_names)) - if len(all_names) == 1: - if all_names[0] in (None, ''): - col_names.push_back('""'.encode()) - else: - col_names.push_back( - str(all_names[0]).encode() - ) - else: - for idx, col_name in enumerate(all_names): - if col_name is None: - col_names.push_back(''.encode()) - else: - col_names.push_back( - str(col_name).encode() - ) - - cdef csv_writer_options options = move( - csv_writer_options.builder(sink_info_c, input_table_view) - .names(col_names) - .na_rep(na_c) - .include_header(include_header_c) - .rows_per_chunk(rows_per_chunk_c) - .line_terminator(line_term_c) - .inter_column_delimiter(delim_c) - .true_value(true_value_c) - .false_value(false_value_c) - .build() - ) - + index_and_not_empty = index is True and table.index is not None + columns = [ + col.to_pylibcudf(mode="read") for col in table.index._columns + ] if index_and_not_empty else [] + columns.extend(col.to_pylibcudf(mode="read") for col in table._columns) + col_names = [] + if header: + all_names = list(table.index.names) if index_and_not_empty else [] + all_names.extend( + na_rep if name is None or pd.isnull(name) + else name for name in table._column_names + ) + col_names = [ + '""' if (name in (None, '') and len(all_names) == 1) + else (str(name) if name not in (None, '') else '') + for name in all_names + ] try: - with nogil: - cpp_write_csv(options) + plc.io.csv.write_csv( + ( + 
plc.io.csv.CsvWriterOptions.builder( + plc.io.SinkInfo([path_or_buf]), plc.Table(columns) + ) + .names(col_names) + .na_rep(na_rep) + .include_header(header) + .rows_per_chunk(rows_per_chunk) + .line_terminator(str(lineterminator)) + .inter_column_delimiter(str(sep)) + .true_value("True") + .false_value("False") + .build() + ) + ) except OverflowError: raise OverflowError( f"Writing CSV file with chunksize={rows_per_chunk} failed. " @@ -419,11 +387,3 @@ cdef DataType _get_plc_data_type_from_dtype(object dtype) except *: dtype = cudf.dtype(dtype) return dtype_to_pylibcudf_type(dtype) - - -def columns_apply_na_rep(column_names, na_rep): - return tuple( - na_rep if pd.isnull(col_name) - else col_name - for col_name in column_names - ) diff --git a/python/cudf/cudf/_lib/datetime.pyx b/python/cudf/cudf/_lib/datetime.pyx index bc5e085ec39..7e8f29dac93 100644 --- a/python/cudf/cudf/_lib/datetime.pyx +++ b/python/cudf/cudf/_lib/datetime.pyx @@ -4,57 +4,64 @@ import warnings from cudf.core.buffer import acquire_spill_lock -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - cimport pylibcudf.libcudf.datetime as libcudf_datetime -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.filling cimport calendrical_month_sequence -from pylibcudf.libcudf.scalar.scalar cimport scalar from pylibcudf.libcudf.types cimport size_type +from pylibcudf.datetime import DatetimeComponent, RoundingFrequency from cudf._lib.column cimport Column from cudf._lib.scalar cimport DeviceScalar - import pylibcudf as plc @acquire_spill_lock() def add_months(Column col, Column months): # months must be int16 dtype - cdef unique_ptr[column] c_result - cdef column_view col_view = col.view() - cdef column_view months_view = months.view() - - with nogil: - c_result = move( - libcudf_datetime.add_calendrical_months( - col_view, - months_view - ) + return Column.from_pylibcudf( + plc.datetime.add_calendrical_months( + col.to_pylibcudf(mode="read"), + months.to_pylibcudf(mode="read") ) - - return Column.from_unique_ptr(move(c_result)) + ) @acquire_spill_lock() def extract_datetime_component(Column col, object field): - result = Column.from_pylibcudf( - plc.datetime.extract_datetime_component(col.to_pylibcudf(mode="read"), field) - ) - - if field == "weekday": - # Pandas counts Monday-Sunday as 0-6 - # while libcudf counts Monday-Sunday as 1-7 - result = result - result.dtype.type(1) + component_names = { + "year": DatetimeComponent.YEAR, + "month": DatetimeComponent.MONTH, + "day": DatetimeComponent.DAY, + "weekday": DatetimeComponent.WEEKDAY, + "hour": DatetimeComponent.HOUR, + "minute": DatetimeComponent.MINUTE, + "second": DatetimeComponent.SECOND, + "millisecond": DatetimeComponent.MILLISECOND, + "microsecond": DatetimeComponent.MICROSECOND, + "nanosecond": DatetimeComponent.NANOSECOND, + } + if field == "day_of_year": + result = Column.from_pylibcudf( + plc.datetime.day_of_year( + col.to_pylibcudf(mode="read") + ) + ) + elif field in component_names: + result = Column.from_pylibcudf( + plc.datetime.extract_datetime_component( + col.to_pylibcudf(mode="read"), + component_names[field], + ) + ) + if field == "weekday": + # Pandas counts Monday-Sunday as 0-6 + # while libcudf counts Monday-Sunday as 1-7 + result = result - result.dtype.type(1) + else: + raise ValueError(f"Invalid field: '{field}'") return result cdef libcudf_datetime.rounding_frequency _get_rounding_frequency(object freq): - cdef 
libcudf_datetime.rounding_frequency freq_val
-
     # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Timedelta.resolution_string.html
     old_to_new_freq_map = {
         "H": "h",
@@ -72,96 +79,75 @@ cdef libcudf_datetime.rounding_frequency _get_rounding_frequency(object freq):
             FutureWarning
         )
         freq = old_to_new_freq_map.get(freq)
-    if freq == "D":
-        freq_val = libcudf_datetime.rounding_frequency.DAY
-    elif freq == "h":
-        freq_val = libcudf_datetime.rounding_frequency.HOUR
-    elif freq == "min":
-        freq_val = libcudf_datetime.rounding_frequency.MINUTE
-    elif freq == "s":
-        freq_val = libcudf_datetime.rounding_frequency.SECOND
-    elif freq == "ms":
-        freq_val = libcudf_datetime.rounding_frequency.MILLISECOND
-    elif freq == "us":
-        freq_val = libcudf_datetime.rounding_frequency.MICROSECOND
-    elif freq == "ns":
-        freq_val = libcudf_datetime.rounding_frequency.NANOSECOND
+    rounding_frequency_map = {
+        "D": RoundingFrequency.DAY,
+        "h": RoundingFrequency.HOUR,
+        "min": RoundingFrequency.MINUTE,
+        "s": RoundingFrequency.SECOND,
+        "ms": RoundingFrequency.MILLISECOND,
+        "us": RoundingFrequency.MICROSECOND,
+        "ns": RoundingFrequency.NANOSECOND,
+    }
+    if freq in rounding_frequency_map:
+        return rounding_frequency_map[freq]
     else:
         raise ValueError(f"Invalid resolution: '{freq}'")
-    return freq_val


 @acquire_spill_lock()
 def ceil_datetime(Column col, object freq):
-    cdef unique_ptr[column] c_result
-    cdef column_view col_view = col.view()
-    cdef libcudf_datetime.rounding_frequency freq_val = \
-        _get_rounding_frequency(freq)
-
-    with nogil:
-        c_result = move(libcudf_datetime.ceil_datetimes(col_view, freq_val))
-
-    result = Column.from_unique_ptr(move(c_result))
-    return result
+    return Column.from_pylibcudf(
+        plc.datetime.ceil_datetimes(
+            col.to_pylibcudf(mode="read"),
+            _get_rounding_frequency(freq),
+        )
+    )


 @acquire_spill_lock()
 def floor_datetime(Column col, object freq):
-    cdef unique_ptr[column] c_result
-    cdef column_view col_view = col.view()
-    cdef libcudf_datetime.rounding_frequency freq_val = \
-        _get_rounding_frequency(freq)
-
-    with nogil:
-        c_result = move(libcudf_datetime.floor_datetimes(col_view, freq_val))
-
-    result = Column.from_unique_ptr(move(c_result))
-    return result
+    return Column.from_pylibcudf(
+        plc.datetime.floor_datetimes(
+            col.to_pylibcudf(mode="read"),
+            _get_rounding_frequency(freq),
+        )
+    )


 @acquire_spill_lock()
 def round_datetime(Column col, object freq):
-    cdef unique_ptr[column] c_result
-    cdef column_view col_view = col.view()
-    cdef libcudf_datetime.rounding_frequency freq_val = \
-        _get_rounding_frequency(freq)
-
-    with nogil:
-        c_result = move(libcudf_datetime.round_datetimes(col_view, freq_val))
-
-    result = Column.from_unique_ptr(move(c_result))
-    return result
+    return Column.from_pylibcudf(
+        plc.datetime.round_datetimes(
+            col.to_pylibcudf(mode="read"),
+            _get_rounding_frequency(freq),
+        )
+    )


 @acquire_spill_lock()
 def is_leap_year(Column col):
     """Returns a boolean indicator whether the year of the date is a leap year
     """
-    cdef unique_ptr[column] c_result
-    cdef column_view col_view = col.view()
-
-    with nogil:
-        c_result = move(libcudf_datetime.is_leap_year(col_view))
-
-    return Column.from_unique_ptr(move(c_result))
+    return Column.from_pylibcudf(
+        plc.datetime.is_leap_year(
+            col.to_pylibcudf(mode="read")
+        )
+    )


 @acquire_spill_lock()
 def date_range(DeviceScalar start, size_type n, offset):
-    cdef unique_ptr[column] c_result
     cdef size_type months = (
         offset.kwds.get("years", 0) * 12 +
         offset.kwds.get("months", 0)
     )
-
-    cdef const scalar* c_start = 
start.get_raw_ptr() - with nogil: - c_result = move(calendrical_month_sequence( + return Column.from_pylibcudf( + plc.filling.calendrical_month_sequence( n, - c_start[0], - months - )) - return Column.from_unique_ptr(move(c_result)) + start.c_value, + months, + ) + ) @acquire_spill_lock() @@ -170,34 +156,28 @@ def extract_quarter(Column col): Returns a column which contains the corresponding quarter of the year for every timestamp inside the input column. """ - cdef unique_ptr[column] c_result - cdef column_view col_view = col.view() - - with nogil: - c_result = move(libcudf_datetime.extract_quarter(col_view)) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf( + plc.datetime.extract_quarter( + col.to_pylibcudf(mode="read") + ) + ) @acquire_spill_lock() def days_in_month(Column col): """Extracts the number of days in the month of the date """ - cdef unique_ptr[column] c_result - cdef column_view col_view = col.view() - - with nogil: - c_result = move(libcudf_datetime.days_in_month(col_view)) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf( + plc.datetime.days_in_month( + col.to_pylibcudf(mode="read") + ) + ) @acquire_spill_lock() def last_day_of_month(Column col): - cdef unique_ptr[column] c_result - cdef column_view col_view = col.view() - - with nogil: - c_result = move(libcudf_datetime.last_day_of_month(col_view)) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf( + plc.datetime.last_day_of_month( + col.to_pylibcudf(mode="read") + ) + ) diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx index c199ed96d4f..4e712be6738 100644 --- a/python/cudf/cudf/_lib/groupby.pyx +++ b/python/cudf/cudf/_lib/groupby.pyx @@ -18,9 +18,6 @@ from cudf._lib.utils cimport columns_from_pylibcudf_table from cudf._lib.scalar import as_device_scalar -from pylibcudf.libcudf.replace cimport replace_policy -from pylibcudf.libcudf.scalar.scalar cimport scalar - import pylibcudf from cudf._lib.aggregation import make_aggregation @@ -54,8 +51,6 @@ _DECIMAL_AGGS = { "NUNIQUE", "SUM", } -# workaround for https://github.com/cython/cython/issues/3885 -ctypedef const scalar constscalar @singledispatch @@ -244,13 +239,11 @@ cdef class GroupBy: return columns_from_pylibcudf_table(shifts), columns_from_pylibcudf_table(keys) def replace_nulls(self, list values, object method): - # TODO: This is using an enum (replace_policy) that has not been exposed in - # pylibcudf yet. We'll want to fix that import once it is in pylibcudf. _, replaced = self._groupby.replace_nulls( pylibcudf.table.Table([c.to_pylibcudf(mode="read") for c in values]), [ - replace_policy.PRECEDING - if method == 'ffill' else replace_policy.FOLLOWING + pylibcudf.replace.ReplacePolicy.PRECEDING + if method == 'ffill' else pylibcudf.replace.ReplacePolicy.FOLLOWING ] * len(values), ) diff --git a/python/cudf/cudf/_lib/hash.pyx b/python/cudf/cudf/_lib/hash.pyx deleted file mode 100644 index 9b7ab0888d2..00000000000 --- a/python/cudf/cudf/_lib/hash.pyx +++ /dev/null @@ -1,68 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
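[Editor's note] hash.pyx is deleted below in its entirety; its hash_partition helper was already a thin wrapper over pylibcudf, so callers can go through pylibcudf directly. A sketch following the call shape of the deleted wrapper, assuming pyarrow for table construction (the data is illustrative):

import pyarrow as pa
import pylibcudf as plc

table = plc.interop.from_arrow(pa.table({"a": [1, 2, 3, 4]}))

# Split rows into two partitions by hashing key column 0; `offsets` marks
# the starting row of each partition in the returned table.
partitioned, offsets = plc.partitioning.hash_partition(table, [0], 2)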
- -from cudf.core.buffer import acquire_spill_lock - -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.hash cimport ( - md5, - murmurhash3_x86_32, - sha1, - sha224, - sha256, - sha384, - sha512, - xxhash_64, -) -from pylibcudf.libcudf.table.table_view cimport table_view - -from cudf._lib.column cimport Column -from cudf._lib.utils cimport table_view_from_columns - -import pylibcudf as plc - - -@acquire_spill_lock() -def hash_partition(list source_columns, list columns_to_hash, - int num_partitions): - plc_table, offsets = plc.partitioning.hash_partition( - plc.Table([col.to_pylibcudf(mode="read") for col in source_columns]), - columns_to_hash, - num_partitions - ) - return [Column.from_pylibcudf(col) for col in plc_table.columns()], offsets - - -@acquire_spill_lock() -def hash(list source_columns, str method, int seed=0): - cdef table_view c_source_view = table_view_from_columns(source_columns) - cdef unique_ptr[column] c_result - if method == "murmur3": - with nogil: - c_result = move(murmurhash3_x86_32(c_source_view, seed)) - elif method == "md5": - with nogil: - c_result = move(md5(c_source_view)) - elif method == "sha1": - with nogil: - c_result = move(sha1(c_source_view)) - elif method == "sha224": - with nogil: - c_result = move(sha224(c_source_view)) - elif method == "sha256": - with nogil: - c_result = move(sha256(c_source_view)) - elif method == "sha384": - with nogil: - c_result = move(sha384(c_source_view)) - elif method == "sha512": - with nogil: - c_result = move(sha512(c_source_view)) - elif method == "xxhash64": - with nogil: - c_result = move(xxhash_64(c_source_view, seed)) - else: - raise ValueError(f"Unsupported hash function: {method}") - return Column.from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/_lib/interop.pyx b/python/cudf/cudf/_lib/interop.pyx index 1dc586bb257..1c9d3a01b80 100644 --- a/python/cudf/cudf/_lib/interop.pyx +++ b/python/cudf/cudf/_lib/interop.pyx @@ -1,49 +1,22 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. -from cpython cimport pycapsule -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - import pylibcudf -from pylibcudf.libcudf.interop cimport ( - DLManagedTensor, - from_dlpack as cpp_from_dlpack, - to_dlpack as cpp_to_dlpack, -) -from pylibcudf.libcudf.table.table cimport table -from pylibcudf.libcudf.table.table_view cimport table_view - -from cudf._lib.utils cimport ( - columns_from_pylibcudf_table, - columns_from_unique_ptr, - table_view_from_columns, -) +from cudf._lib.utils cimport columns_from_pylibcudf_table from cudf.core.buffer import acquire_spill_lock from cudf.core.dtypes import ListDtype, StructDtype -def from_dlpack(dlpack_capsule): +def from_dlpack(object dlpack_capsule): """ Converts a DLPack Tensor PyCapsule into a list of columns. DLPack Tensor PyCapsule is expected to have the name "dltensor". 
""" - cdef DLManagedTensor* dlpack_tensor = pycapsule.\ - PyCapsule_GetPointer(dlpack_capsule, 'dltensor') - pycapsule.PyCapsule_SetName(dlpack_capsule, 'used_dltensor') - - cdef unique_ptr[table] c_result - - with nogil: - c_result = move( - cpp_from_dlpack(dlpack_tensor) - ) - - res = columns_from_unique_ptr(move(c_result)) - dlpack_tensor.deleter(dlpack_tensor) - return res + return columns_from_pylibcudf_table( + pylibcudf.interop.from_dlpack(dlpack_capsule) + ) def to_dlpack(list source_columns): @@ -52,39 +25,13 @@ def to_dlpack(list source_columns): DLPack Tensor PyCapsule will have the name "dltensor". """ - if any(column.null_count for column in source_columns): - raise ValueError( - "Cannot create a DLPack tensor with null values. \ - Input is required to have null count as zero." - ) - - cdef DLManagedTensor *dlpack_tensor - cdef table_view source_table_view = table_view_from_columns(source_columns) - - with nogil: - dlpack_tensor = cpp_to_dlpack( - source_table_view + return pylibcudf.interop.to_dlpack( + pylibcudf.Table( + [col.to_pylibcudf(mode="read") for col in source_columns] ) - - return pycapsule.PyCapsule_New( - dlpack_tensor, - 'dltensor', - dlmanaged_tensor_pycapsule_deleter ) -cdef void dlmanaged_tensor_pycapsule_deleter(object pycap_obj) noexcept: - cdef DLManagedTensor* dlpack_tensor = 0 - try: - dlpack_tensor = pycapsule.PyCapsule_GetPointer( - pycap_obj, 'used_dltensor') - return # we do not call a used capsule's deleter - except Exception: - dlpack_tensor = pycapsule.PyCapsule_GetPointer( - pycap_obj, 'dltensor') - dlpack_tensor.deleter(dlpack_tensor) - - def gather_metadata(object cols_dtypes): """ Generates a ColumnMetadata vector for each column. diff --git a/python/cudf/cudf/_lib/io/utils.pxd b/python/cudf/cudf/_lib/io/utils.pxd index 76a6e32fde0..96504ebdd66 100644 --- a/python/cudf/cudf/_lib/io/utils.pxd +++ b/python/cudf/cudf/_lib/io/utils.pxd @@ -13,7 +13,6 @@ from pylibcudf.libcudf.io.types cimport ( from cudf._lib.column cimport Column -cdef source_info make_source_info(list src) except* cdef sink_info make_sinks_info( list src, vector[unique_ptr[data_sink]] & data) except* cdef sink_info make_sink_info(src, unique_ptr[data_sink] & data) except* diff --git a/python/cudf/cudf/_lib/io/utils.pyx b/python/cudf/cudf/_lib/io/utils.pyx index 564daefbae2..f23980b387a 100644 --- a/python/cudf/cudf/_lib/io/utils.pyx +++ b/python/cudf/cudf/_lib/io/utils.pyx @@ -7,76 +7,20 @@ from libcpp.string cimport string from libcpp.utility cimport move from libcpp.vector cimport vector -from pylibcudf.io.datasource cimport Datasource from pylibcudf.libcudf.io.data_sink cimport data_sink -from pylibcudf.libcudf.io.datasource cimport datasource from pylibcudf.libcudf.io.types cimport ( column_name_info, - host_buffer, sink_info, - source_info, ) from cudf._lib.column cimport Column import codecs -import errno import io import os from cudf.core.dtypes import StructDtype - -# Converts the Python source input to libcudf IO source_info -# with the appropriate type and source values -cdef source_info make_source_info(list src) except*: - if not src: - raise ValueError("Need to pass at least one source") - - cdef const unsigned char[::1] c_buffer - cdef vector[host_buffer] c_host_buffers - cdef vector[string] c_files - cdef Datasource csrc - cdef vector[datasource*] c_datasources - empty_buffer = False - if isinstance(src[0], bytes): - empty_buffer = True - for buffer in src: - if (len(buffer) > 0): - c_buffer = buffer - c_host_buffers.push_back(host_buffer(&c_buffer[0], - 
c_buffer.shape[0])) - empty_buffer = False - elif isinstance(src[0], io.BytesIO): - for bio in src: - c_buffer = bio.getbuffer() # check if empty? - c_host_buffers.push_back(host_buffer(&c_buffer[0], - c_buffer.shape[0])) - # Otherwise src is expected to be a numeric fd, string path, or PathLike. - # TODO (ptaylor): Might need to update this check if accepted input types - # change when UCX and/or cuStreamz support is added. - elif isinstance(src[0], Datasource): - for csrc in src: - c_datasources.push_back(csrc.get_datasource()) - return source_info(c_datasources) - elif isinstance(src[0], (int, float, complex, basestring, os.PathLike)): - # If source is a file, return source_info where type=FILEPATH - if not all(os.path.isfile(file) for file in src): - raise FileNotFoundError(errno.ENOENT, - os.strerror(errno.ENOENT), - src) - - files = [ str(elem).encode() for elem in src] - c_files = files - return source_info(c_files) - else: - raise TypeError("Unrecognized input type: {}".format(type(src[0]))) - - if empty_buffer is True: - c_host_buffers.push_back(host_buffer(NULL, 0)) - - return source_info(c_host_buffers) - # Converts the Python sink input to libcudf IO sink_info. cdef sink_info make_sinks_info( list src, vector[unique_ptr[data_sink]] & sink diff --git a/python/cudf/cudf/_lib/json.pyx b/python/cudf/cudf/_lib/json.pyx index 9bbbcf60dcf..960010899c1 100644 --- a/python/cudf/cudf/_lib/json.pyx +++ b/python/cudf/cudf/_lib/json.pyx @@ -1,6 +1,5 @@ # Copyright (c) 2019-2024, NVIDIA CORPORATION. -import io import os from collections import abc @@ -9,30 +8,14 @@ from cudf.core.buffer import acquire_spill_lock from libcpp cimport bool -cimport pylibcudf.libcudf.io.types as cudf_io_types -from pylibcudf.io.types cimport compression_type -from pylibcudf.libcudf.io.json cimport json_recovery_mode_t -from pylibcudf.libcudf.io.types cimport compression_type -from pylibcudf.libcudf.types cimport data_type, type_id -from pylibcudf.types cimport DataType - from cudf._lib.column cimport Column from cudf._lib.io.utils cimport add_df_col_struct_names -from cudf._lib.types cimport dtype_to_data_type +from cudf._lib.types cimport dtype_to_pylibcudf_type from cudf._lib.utils cimport _data_from_columns, data_from_pylibcudf_io import pylibcudf as plc -cdef json_recovery_mode_t _get_json_recovery_mode(object on_bad_lines): - if on_bad_lines.lower() == "error": - return json_recovery_mode_t.FAIL - elif on_bad_lines.lower() == "recover": - return json_recovery_mode_t.RECOVER_WITH_NULL - else: - raise TypeError(f"Invalid parameter for {on_bad_lines=}") - - cpdef read_json(object filepaths_or_buffers, object dtype, bool lines, @@ -41,7 +24,7 @@ cpdef read_json(object filepaths_or_buffers, bool keep_quotes, bool mixed_types_as_string, bool prune_columns, - object on_bad_lines): + str on_bad_lines): """ Cython function to call into libcudf API, see `read_json`. 
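[Editor's note] The deleted _get_json_recovery_mode helper above is folded into read_json below as a plain if/elif over the pylibcudf enum. An equivalent standalone helper, for reference (the function name here is mine, not part of the change):

import pylibcudf as plc

def recovery_mode_from_on_bad_lines(on_bad_lines: str):
    # "error" fails the read on malformed records; "recover" nulls them out.
    mode = on_bad_lines.lower()
    if mode == "error":
        return plc.io.types.JSONRecoveryMode.FAIL
    if mode == "recover":
        return plc.io.types.JSONRecoveryMode.RECOVER_WITH_NULL
    raise TypeError(f"Invalid parameter for {on_bad_lines=}")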
@@ -55,28 +38,29 @@ cpdef read_json(object filepaths_or_buffers, # the encoded memoryview externally to ensure the encoded buffer # isn't destroyed before calling libcudf `read_json()` - for idx in range(len(filepaths_or_buffers)): - if isinstance(filepaths_or_buffers[idx], io.StringIO): - filepaths_or_buffers[idx] = \ - filepaths_or_buffers[idx].read().encode() - elif isinstance(filepaths_or_buffers[idx], str) and \ - not os.path.isfile(filepaths_or_buffers[idx]): - filepaths_or_buffers[idx] = filepaths_or_buffers[idx].encode() + for idx, source in enumerate(filepaths_or_buffers): + if isinstance(source, str) and not os.path.isfile(source): + filepaths_or_buffers[idx] = source.encode() # Setup arguments - cdef cudf_io_types.compression_type c_compression - if compression is not None: if compression == 'gzip': - c_compression = cudf_io_types.compression_type.GZIP + c_compression = plc.io.types.CompressionType.GZIP elif compression == 'bz2': - c_compression = cudf_io_types.compression_type.BZIP2 + c_compression = plc.io.types.CompressionType.BZIP2 elif compression == 'zip': - c_compression = cudf_io_types.compression_type.ZIP + c_compression = plc.io.types.CompressionType.ZIP else: - c_compression = cudf_io_types.compression_type.AUTO + c_compression = plc.io.types.CompressionType.AUTO else: - c_compression = cudf_io_types.compression_type.NONE + c_compression = plc.io.types.CompressionType.NONE + + if on_bad_lines.lower() == "error": + c_on_bad_lines = plc.io.types.JSONRecoveryMode.FAIL + elif on_bad_lines.lower() == "recover": + c_on_bad_lines = plc.io.types.JSONRecoveryMode.RECOVER_WITH_NULL + else: + raise TypeError(f"Invalid parameter for {on_bad_lines=}") processed_dtypes = None @@ -108,11 +92,11 @@ cpdef read_json(object filepaths_or_buffers, keep_quotes = keep_quotes, mixed_types_as_string = mixed_types_as_string, prune_columns = prune_columns, - recovery_mode = _get_json_recovery_mode(on_bad_lines) + recovery_mode = c_on_bad_lines ) df = cudf.DataFrame._from_data( *_data_from_columns( - columns=[Column.from_pylibcudf(plc) for plc in res_cols], + columns=[Column.from_pylibcudf(col) for col in res_cols], column_names=res_col_names, index_names=None ) @@ -130,7 +114,7 @@ cpdef read_json(object filepaths_or_buffers, keep_quotes = keep_quotes, mixed_types_as_string = mixed_types_as_string, prune_columns = prune_columns, - recovery_mode = _get_json_recovery_mode(on_bad_lines) + recovery_mode = c_on_bad_lines ) df = cudf.DataFrame._from_data( @@ -189,7 +173,7 @@ def write_json( ) -cdef _get_cudf_schema_element_from_dtype(object dtype) except *: +def _get_cudf_schema_element_from_dtype(object dtype): dtype = cudf.dtype(dtype) if isinstance(dtype, cudf.CategoricalDtype): raise NotImplementedError( @@ -197,7 +181,7 @@ cdef _get_cudf_schema_element_from_dtype(object dtype) except *: "supported in JSON reader" ) - lib_type = DataType.from_libcudf(dtype_to_data_type(dtype)) + lib_type = dtype_to_pylibcudf_type(dtype) child_types = [] if isinstance(dtype, cudf.StructDtype): @@ -210,23 +194,13 @@ cdef _get_cudf_schema_element_from_dtype(object dtype) except *: _get_cudf_schema_element_from_dtype(dtype.element_type) child_types = [ - ("offsets", DataType.from_libcudf(data_type(type_id.INT32)), []), + ("offsets", plc.DataType(plc.TypeId.INT32), []), ("element", child_lib_type, grandchild_types) ] return lib_type, child_types -cdef data_type _get_cudf_data_type_from_dtype(object dtype) except *: - dtype = cudf.dtype(dtype) - if isinstance(dtype, cudf.CategoricalDtype): - raise NotImplementedError( - 
"CategoricalDtype as dtype is not yet " - "supported in JSON reader" - ) - return dtype_to_data_type(dtype) - - def _dtype_to_names_list(col): if isinstance(col.dtype, cudf.StructDtype): return [(name, _dtype_to_names_list(child)) diff --git a/python/cudf/cudf/_lib/labeling.pyx b/python/cudf/cudf/_lib/labeling.pyx deleted file mode 100644 index 3966cce8981..00000000000 --- a/python/cudf/cudf/_lib/labeling.pyx +++ /dev/null @@ -1,24 +0,0 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. - -from libcpp cimport bool as cbool - -import pylibcudf as plc - -from cudf._lib.column cimport Column -from cudf.core.buffer import acquire_spill_lock - - -# Note that the parameter input shadows a Python built-in in the local scope, -# but I'm not too concerned about that since there's no use-case for actual -# input in this context. -@acquire_spill_lock() -def label_bins(Column input, Column left_edges, cbool left_inclusive, - Column right_edges, cbool right_inclusive): - plc_column = plc.labeling.label_bins( - input.to_pylibcudf(mode="read"), - left_edges.to_pylibcudf(mode="read"), - left_inclusive, - right_edges.to_pylibcudf(mode="read"), - right_inclusive - ) - return Column.from_pylibcudf(plc_column) diff --git a/python/cudf/cudf/_lib/lists.pyx b/python/cudf/cudf/_lib/lists.pyx index 7e8710bedb6..90a137dd546 100644 --- a/python/cudf/cudf/_lib/lists.pyx +++ b/python/cudf/cudf/_lib/lists.pyx @@ -4,20 +4,18 @@ from cudf.core.buffer import acquire_spill_lock from libcpp cimport bool -from pylibcudf.libcudf.types cimport null_order, size_type +from pylibcudf.libcudf.types cimport size_type from cudf._lib.column cimport Column from cudf._lib.utils cimport columns_from_pylibcudf_table -import pylibcudf - -from pylibcudf cimport Scalar +import pylibcudf as plc @acquire_spill_lock() def count_elements(Column col): return Column.from_pylibcudf( - pylibcudf.lists.count_elements( + plc.lists.count_elements( col.to_pylibcudf(mode="read")) ) @@ -25,8 +23,8 @@ def count_elements(Column col): @acquire_spill_lock() def explode_outer(list source_columns, int explode_column_idx): return columns_from_pylibcudf_table( - pylibcudf.lists.explode_outer( - pylibcudf.Table([c.to_pylibcudf(mode="read") for c in source_columns]), + plc.lists.explode_outer( + plc.Table([c.to_pylibcudf(mode="read") for c in source_columns]), explode_column_idx, ) ) @@ -35,10 +33,18 @@ def explode_outer(list source_columns, int explode_column_idx): @acquire_spill_lock() def distinct(Column col, bool nulls_equal, bool nans_all_equal): return Column.from_pylibcudf( - pylibcudf.lists.distinct( + plc.lists.distinct( col.to_pylibcudf(mode="read"), - nulls_equal, - nans_all_equal, + ( + plc.types.NullEquality.EQUAL + if nulls_equal + else plc.types.NullEquality.UNEQUAL + ), + ( + plc.types.NanEquality.ALL_EQUAL + if nans_all_equal + else plc.types.NanEquality.UNEQUAL + ), ) ) @@ -46,10 +52,14 @@ def distinct(Column col, bool nulls_equal, bool nans_all_equal): @acquire_spill_lock() def sort_lists(Column col, bool ascending, str na_position): return Column.from_pylibcudf( - pylibcudf.lists.sort_lists( + plc.lists.sort_lists( col.to_pylibcudf(mode="read"), - ascending, - null_order.BEFORE if na_position == "first" else null_order.AFTER, + plc.types.Order.ASCENDING if ascending else plc.types.Order.DESCENDING, + ( + plc.types.NullOrder.BEFORE + if na_position == "first" + else plc.types.NullOrder.AFTER + ), False, ) ) @@ -58,7 +68,7 @@ def sort_lists(Column col, bool ascending, str na_position): @acquire_spill_lock() def extract_element_scalar(Column col, 
size_type index): return Column.from_pylibcudf( - pylibcudf.lists.extract_list_element( + plc.lists.extract_list_element( col.to_pylibcudf(mode="read"), index, ) @@ -68,7 +78,7 @@ def extract_element_scalar(Column col, size_type index): @acquire_spill_lock() def extract_element_column(Column col, Column index): return Column.from_pylibcudf( - pylibcudf.lists.extract_list_element( + plc.lists.extract_list_element( col.to_pylibcudf(mode="read"), index.to_pylibcudf(mode="read"), ) @@ -78,9 +88,9 @@ def extract_element_column(Column col, Column index): @acquire_spill_lock() def contains_scalar(Column col, py_search_key): return Column.from_pylibcudf( - pylibcudf.lists.contains( + plc.lists.contains( col.to_pylibcudf(mode="read"), - py_search_key.device_value.c_value, + py_search_key.device_value.c_value, ) ) @@ -88,10 +98,10 @@ def contains_scalar(Column col, py_search_key): @acquire_spill_lock() def index_of_scalar(Column col, object py_search_key): return Column.from_pylibcudf( - pylibcudf.lists.index_of( + plc.lists.index_of( col.to_pylibcudf(mode="read"), - py_search_key.device_value.c_value, - True, + py_search_key.device_value.c_value, + plc.lists.DuplicateFindOption.FIND_FIRST, ) ) @@ -99,10 +109,10 @@ def index_of_scalar(Column col, object py_search_key): @acquire_spill_lock() def index_of_column(Column col, Column search_keys): return Column.from_pylibcudf( - pylibcudf.lists.index_of( + plc.lists.index_of( col.to_pylibcudf(mode="read"), search_keys.to_pylibcudf(mode="read"), - True, + plc.lists.DuplicateFindOption.FIND_FIRST, ) ) @@ -110,8 +120,8 @@ def index_of_column(Column col, Column search_keys): @acquire_spill_lock() def concatenate_rows(list source_columns): return Column.from_pylibcudf( - pylibcudf.lists.concatenate_rows( - pylibcudf.Table([ + plc.lists.concatenate_rows( + plc.Table([ c.to_pylibcudf(mode="read") for c in source_columns ]) ) @@ -121,8 +131,20 @@ def concatenate_rows(list source_columns): @acquire_spill_lock() def concatenate_list_elements(Column input_column, dropna=False): return Column.from_pylibcudf( - pylibcudf.lists.concatenate_list_elements( + plc.lists.concatenate_list_elements( input_column.to_pylibcudf(mode="read"), - dropna, + plc.lists.ConcatenateNullPolicy.IGNORE + if dropna + else plc.lists.ConcatenateNullPolicy.NULLIFY_OUTPUT_ROW, + ) + ) + + +@acquire_spill_lock() +def segmented_gather(Column source_column, Column gather_map): + return Column.from_pylibcudf( + plc.lists.segmented_gather( + source_column.to_pylibcudf(mode="read"), + gather_map.to_pylibcudf(mode="read"), ) ) diff --git a/python/cudf/cudf/_lib/nvtext/byte_pair_encode.pyx b/python/cudf/cudf/_lib/nvtext/byte_pair_encode.pyx index 0d768e24f39..2b2762eead2 100644 --- a/python/cudf/cudf/_lib/nvtext/byte_pair_encode.pyx +++ b/python/cudf/cudf/_lib/nvtext/byte_pair_encode.pyx @@ -3,49 +3,22 @@ from cudf.core.buffer import acquire_spill_lock -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.nvtext.byte_pair_encode cimport ( - bpe_merge_pairs as cpp_bpe_merge_pairs, - byte_pair_encoding as cpp_byte_pair_encoding, - load_merge_pairs as cpp_load_merge_pairs, -) -from pylibcudf.libcudf.scalar.scalar cimport string_scalar - from cudf._lib.column cimport Column -from cudf._lib.scalar cimport DeviceScalar - -cdef class BPEMergePairs: - cdef unique_ptr[cpp_bpe_merge_pairs] c_obj - - def __cinit__(self, Column merge_pairs): - cdef 
column_view c_pairs = merge_pairs.view() - with nogil: - self.c_obj = move(cpp_load_merge_pairs(c_pairs)) +from pylibcudf import nvtext +from pylibcudf.nvtext.byte_pair_encode import BPEMergePairs # no-cython-lint @acquire_spill_lock() def byte_pair_encoding( Column strings, - BPEMergePairs merge_pairs, + object merge_pairs, object separator ): - cdef column_view c_strings = strings.view() - cdef DeviceScalar d_separator = separator.device_value - cdef const string_scalar* c_separator = d_separator\ - .get_raw_ptr() - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_byte_pair_encoding( - c_strings, - merge_pairs.c_obj.get()[0], - c_separator[0] - ) + return Column.from_pylibcudf( + nvtext.byte_pair_encode.byte_pair_encoding( + strings.to_pylibcudf(mode="read"), + merge_pairs, + separator.device_value.c_value ) - - return Column.from_unique_ptr(move(c_result)) + ) diff --git a/python/cudf/cudf/_lib/nvtext/edit_distance.pyx b/python/cudf/cudf/_lib/nvtext/edit_distance.pyx index e3c2273345a..3dd99c42d76 100644 --- a/python/cudf/cudf/_lib/nvtext/edit_distance.pyx +++ b/python/cudf/cudf/_lib/nvtext/edit_distance.pyx @@ -2,37 +2,23 @@ from cudf.core.buffer import acquire_spill_lock -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.nvtext.edit_distance cimport ( - edit_distance as cpp_edit_distance, - edit_distance_matrix as cpp_edit_distance_matrix, -) +from pylibcudf cimport nvtext from cudf._lib.column cimport Column @acquire_spill_lock() def edit_distance(Column strings, Column targets): - cdef column_view c_strings = strings.view() - cdef column_view c_targets = targets.view() - cdef unique_ptr[column] c_result - - with nogil: - c_result = move(cpp_edit_distance(c_strings, c_targets)) - - return Column.from_unique_ptr(move(c_result)) + result = nvtext.edit_distance.edit_distance( + strings.to_pylibcudf(mode="read"), + targets.to_pylibcudf(mode="read") + ) + return Column.from_pylibcudf(result) @acquire_spill_lock() def edit_distance_matrix(Column strings): - cdef column_view c_strings = strings.view() - cdef unique_ptr[column] c_result - - with nogil: - c_result = move(cpp_edit_distance_matrix(c_strings)) - - return Column.from_unique_ptr(move(c_result)) + result = nvtext.edit_distance.edit_distance_matrix( + strings.to_pylibcudf(mode="read") + ) + return Column.from_pylibcudf(result) diff --git a/python/cudf/cudf/_lib/nvtext/generate_ngrams.pyx b/python/cudf/cudf/_lib/nvtext/generate_ngrams.pyx index 6591b527eec..7fdf9258b7f 100644 --- a/python/cudf/cudf/_lib/nvtext/generate_ngrams.pyx +++ b/python/cudf/cudf/_lib/nvtext/generate_ngrams.pyx @@ -2,75 +2,34 @@ from cudf.core.buffer import acquire_spill_lock -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.nvtext.generate_ngrams cimport ( - generate_character_ngrams as cpp_generate_character_ngrams, - generate_ngrams as cpp_generate_ngrams, - hash_character_ngrams as cpp_hash_character_ngrams, -) -from pylibcudf.libcudf.scalar.scalar cimport string_scalar -from pylibcudf.libcudf.types cimport size_type - from cudf._lib.column cimport Column -from cudf._lib.scalar cimport DeviceScalar + +from pylibcudf import nvtext @acquire_spill_lock() def generate_ngrams(Column strings, int ngrams, object 
py_separator): - - cdef DeviceScalar separator = py_separator.device_value - - cdef column_view c_strings = strings.view() - cdef size_type c_ngrams = ngrams - cdef const string_scalar* c_separator = separator\ - .get_raw_ptr() - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_generate_ngrams( - c_strings, - c_ngrams, - c_separator[0] - ) - ) - - return Column.from_unique_ptr(move(c_result)) + result = nvtext.generate_ngrams.generate_ngrams( + strings.to_pylibcudf(mode="read"), + ngrams, + py_separator.device_value.c_value + ) + return Column.from_pylibcudf(result) @acquire_spill_lock() def generate_character_ngrams(Column strings, int ngrams): - cdef column_view c_strings = strings.view() - cdef size_type c_ngrams = ngrams - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_generate_character_ngrams( - c_strings, - c_ngrams - ) - ) - - return Column.from_unique_ptr(move(c_result)) + result = nvtext.generate_ngrams.generate_character_ngrams( + strings.to_pylibcudf(mode="read"), + ngrams + ) + return Column.from_pylibcudf(result) @acquire_spill_lock() def hash_character_ngrams(Column strings, int ngrams): - cdef column_view c_strings = strings.view() - cdef size_type c_ngrams = ngrams - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_hash_character_ngrams( - c_strings, - c_ngrams - ) - ) - - return Column.from_unique_ptr(move(c_result)) + result = nvtext.generate_ngrams.hash_character_ngrams( + strings.to_pylibcudf(mode="read"), + ngrams + ) + return Column.from_pylibcudf(result) diff --git a/python/cudf/cudf/_lib/nvtext/jaccard.pyx b/python/cudf/cudf/_lib/nvtext/jaccard.pyx index 0ebf7c281e3..c964d0206b7 100644 --- a/python/cudf/cudf/_lib/nvtext/jaccard.pyx +++ b/python/cudf/cudf/_lib/nvtext/jaccard.pyx @@ -2,33 +2,16 @@ from cudf.core.buffer import acquire_spill_lock -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.nvtext.jaccard cimport ( - jaccard_index as cpp_jaccard_index, -) -from pylibcudf.libcudf.types cimport size_type - from cudf._lib.column cimport Column +from pylibcudf import nvtext + @acquire_spill_lock() def jaccard_index(Column input1, Column input2, int width): - cdef column_view c_input1 = input1.view() - cdef column_view c_input2 = input2.view() - cdef size_type c_width = width - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_jaccard_index( - c_input1, - c_input2, - c_width - ) - ) - - return Column.from_unique_ptr(move(c_result)) + result = nvtext.jaccard.jaccard_index( + input1.to_pylibcudf(mode="read"), + input2.to_pylibcudf(mode="read"), + width, + ) + return Column.from_pylibcudf(result) diff --git a/python/cudf/cudf/_lib/nvtext/minhash.pyx b/python/cudf/cudf/_lib/nvtext/minhash.pyx index 59cb8d51440..25cfcf99ca6 100644 --- a/python/cudf/cudf/_lib/nvtext/minhash.pyx +++ b/python/cudf/cudf/_lib/nvtext/minhash.pyx @@ -1,94 +1,73 @@ # Copyright (c) 2023-2024, NVIDIA CORPORATION. 
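The nvtext rewrites in this patch (edit_distance, generate_ngrams and jaccard above, minhash and the tokenizer files below) all reduce to the same round-trip: unwrap the cudf Column with to_pylibcudf(mode="read"), call the corresponding pylibcudf.nvtext function, and rewrap the result with Column.from_pylibcudf, replacing the hand-written column_view/unique_ptr/move boilerplate. A minimal standalone sketch of that pattern, assuming only that pylibcudf and pyarrow are installed and a GPU is available (the sample strings are illustrative, not from this patch):

import pyarrow as pa
import pylibcudf as plc

# Host -> device: plc.interop bridges pyarrow and libcudf columns.
strings = plc.interop.from_arrow(pa.array(["kitten", "sitting"]))
targets = plc.interop.from_arrow(pa.array(["sitten", "sitting"]))

# One call replaces the old view()/nogil/move() sequence.
dist = plc.nvtext.edit_distance.edit_distance(strings, targets)

# Device -> host for inspection; expected Levenshtein distances: [1, 0].
print(plc.interop.to_arrow(dist))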
-from cudf.core.buffer import acquire_spill_lock - -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move +from libc.stdint cimport uint32_t, uint64_t -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.nvtext.minhash cimport ( - minhash as cpp_minhash, - minhash64 as cpp_minhash64, - word_minhash as cpp_word_minhash, - word_minhash64 as cpp_word_minhash64, -) -from pylibcudf.libcudf.types cimport size_type +from cudf.core.buffer import acquire_spill_lock from cudf._lib.column cimport Column +from pylibcudf import nvtext -@acquire_spill_lock() -def minhash(Column strings, Column seeds, int width): - - cdef column_view c_strings = strings.view() - cdef size_type c_width = width - cdef column_view c_seeds = seeds.view() - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_minhash( - c_strings, - c_seeds, - c_width - ) - ) - return Column.from_unique_ptr(move(c_result)) +@acquire_spill_lock() +def minhash(Column input, Column seeds, int width=4): + result = nvtext.minhash.minhash( + input.to_pylibcudf(mode="read"), + seeds.to_pylibcudf(mode="read"), + width, + ) + return Column.from_pylibcudf(result) @acquire_spill_lock() -def minhash64(Column strings, Column seeds, int width): - - cdef column_view c_strings = strings.view() - cdef size_type c_width = width - cdef column_view c_seeds = seeds.view() - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_minhash64( - c_strings, - c_seeds, - c_width - ) +def minhash_permuted(Column input, uint32_t seed, Column a, Column b, int width): + return Column.from_pylibcudf( + nvtext.minhash.minhash_permuted( + input.to_pylibcudf(mode="read"), + seed, + a.to_pylibcudf(mode="read"), + b.to_pylibcudf(mode="read"), + width, ) - - return Column.from_unique_ptr(move(c_result)) + ) @acquire_spill_lock() -def word_minhash(Column input, Column seeds): +def minhash64(Column input, Column seeds, int width=4): + result = nvtext.minhash.minhash64( + input.to_pylibcudf(mode="read"), + seeds.to_pylibcudf(mode="read"), + width, + ) + return Column.from_pylibcudf(result) - cdef column_view c_input = input.view() - cdef column_view c_seeds = seeds.view() - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_word_minhash( - c_input, - c_seeds - ) +@acquire_spill_lock() +def minhash64_permuted(Column input, uint64_t seed, Column a, Column b, int width): + return Column.from_pylibcudf( + nvtext.minhash.minhash64_permuted( + input.to_pylibcudf(mode="read"), + seed, + a.to_pylibcudf(mode="read"), + b.to_pylibcudf(mode="read"), + width, ) - - return Column.from_unique_ptr(move(c_result)) + ) @acquire_spill_lock() -def word_minhash64(Column input, Column seeds): - - cdef column_view c_input = input.view() - cdef column_view c_seeds = seeds.view() - cdef unique_ptr[column] c_result +def word_minhash(Column input, Column seeds): + result = nvtext.minhash.word_minhash( + input.to_pylibcudf(mode="read"), + seeds.to_pylibcudf(mode="read"), + ) + return Column.from_pylibcudf(result) - with nogil: - c_result = move( - cpp_word_minhash64( - c_input, - c_seeds - ) - ) - return Column.from_unique_ptr(move(c_result)) +@acquire_spill_lock() +def word_minhash64(Column input, Column seeds): + result = nvtext.minhash.word_minhash64( + input.to_pylibcudf(mode="read"), + seeds.to_pylibcudf(mode="read"), + ) + return Column.from_pylibcudf(result) diff --git a/python/cudf/cudf/_lib/nvtext/ngrams_tokenize.pyx 
b/python/cudf/cudf/_lib/nvtext/ngrams_tokenize.pyx index dec4f037d98..c125d92a24e 100644 --- a/python/cudf/cudf/_lib/nvtext/ngrams_tokenize.pyx +++ b/python/cudf/cudf/_lib/nvtext/ngrams_tokenize.pyx @@ -2,48 +2,23 @@ from cudf.core.buffer import acquire_spill_lock -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.nvtext.ngrams_tokenize cimport ( - ngrams_tokenize as cpp_ngrams_tokenize, -) -from pylibcudf.libcudf.scalar.scalar cimport string_scalar -from pylibcudf.libcudf.types cimport size_type - from cudf._lib.column cimport Column -from cudf._lib.scalar cimport DeviceScalar + +from pylibcudf import nvtext @acquire_spill_lock() def ngrams_tokenize( - Column strings, + Column input, int ngrams, object py_delimiter, object py_separator ): - - cdef DeviceScalar delimiter = py_delimiter.device_value - cdef DeviceScalar separator = py_separator.device_value - - cdef column_view c_strings = strings.view() - cdef size_type c_ngrams = ngrams - cdef const string_scalar* c_separator = separator\ - .get_raw_ptr() - cdef const string_scalar* c_delimiter = delimiter\ - .get_raw_ptr() - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_ngrams_tokenize( - c_strings, - c_ngrams, - c_delimiter[0], - c_separator[0] - ) + return Column.from_pylibcudf( + nvtext.ngrams_tokenize.ngrams_tokenize( + input.to_pylibcudf(mode="read"), + ngrams, + py_delimiter.device_value.c_value, + py_separator.device_value.c_value ) - - return Column.from_unique_ptr(move(c_result)) + ) diff --git a/python/cudf/cudf/_lib/nvtext/normalize.pyx b/python/cudf/cudf/_lib/nvtext/normalize.pyx index 5e86a9ce959..cc45123dd0a 100644 --- a/python/cudf/cudf/_lib/nvtext/normalize.pyx +++ b/python/cudf/cudf/_lib/nvtext/normalize.pyx @@ -3,36 +3,26 @@ from cudf.core.buffer import acquire_spill_lock from libcpp cimport bool -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.nvtext.normalize cimport ( - normalize_characters as cpp_normalize_characters, - normalize_spaces as cpp_normalize_spaces, -) from cudf._lib.column cimport Column - -@acquire_spill_lock() -def normalize_spaces(Column strings): - cdef column_view c_strings = strings.view() - cdef unique_ptr[column] c_result - - with nogil: - c_result = move(cpp_normalize_spaces(c_strings)) - - return Column.from_unique_ptr(move(c_result)) +from pylibcudf import nvtext @acquire_spill_lock() -def normalize_characters(Column strings, bool do_lower=True): - cdef column_view c_strings = strings.view() - cdef unique_ptr[column] c_result +def normalize_spaces(Column input): + return Column.from_pylibcudf( + nvtext.normalize.normalize_spaces( + input.to_pylibcudf(mode="read") + ) + ) - with nogil: - c_result = move(cpp_normalize_characters(c_strings, do_lower)) - return Column.from_unique_ptr(move(c_result)) +@acquire_spill_lock() +def normalize_characters(Column input, bool do_lower=True): + return Column.from_pylibcudf( + nvtext.normalize.normalize_characters( + input.to_pylibcudf(mode="read"), + do_lower, + ) + ) diff --git a/python/cudf/cudf/_lib/nvtext/replace.pyx b/python/cudf/cudf/_lib/nvtext/replace.pyx index 61ae3da5782..bec56ade83c 100644 --- a/python/cudf/cudf/_lib/nvtext/replace.pyx +++ b/python/cudf/cudf/_lib/nvtext/replace.pyx @@ -2,20 +2,10 
@@ from cudf.core.buffer import acquire_spill_lock -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.nvtext.replace cimport ( - filter_tokens as cpp_filter_tokens, - replace_tokens as cpp_replace_tokens, -) -from pylibcudf.libcudf.scalar.scalar cimport string_scalar from pylibcudf.libcudf.types cimport size_type from cudf._lib.column cimport Column -from cudf._lib.scalar cimport DeviceScalar +from pylibcudf import nvtext @acquire_spill_lock() @@ -30,27 +20,14 @@ def replace_tokens(Column strings, provided. """ - cdef DeviceScalar delimiter = py_delimiter.device_value - - cdef column_view c_strings = strings.view() - cdef column_view c_targets = targets.view() - cdef column_view c_replacements = replacements.view() - - cdef const string_scalar* c_delimiter = delimiter\ - .get_raw_ptr() - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_replace_tokens( - c_strings, - c_targets, - c_replacements, - c_delimiter[0], - ) + return Column.from_pylibcudf( + nvtext.replace.replace_tokens( + strings.to_pylibcudf(mode="read"), + targets.to_pylibcudf(mode="read"), + replacements.to_pylibcudf(mode="read"), + py_delimiter.device_value.c_value, ) - - return Column.from_unique_ptr(move(c_result)) + ) @acquire_spill_lock() @@ -65,24 +42,11 @@ def filter_tokens(Column strings, character provided. """ - cdef DeviceScalar replacement = py_replacement.device_value - cdef DeviceScalar delimiter = py_delimiter.device_value - - cdef column_view c_strings = strings.view() - cdef const string_scalar* c_repl = replacement\ - .get_raw_ptr() - cdef const string_scalar* c_delimiter = delimiter\ - .get_raw_ptr() - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_filter_tokens( - c_strings, - min_token_length, - c_repl[0], - c_delimiter[0], - ) + return Column.from_pylibcudf( + nvtext.replace.filter_tokens( + strings.to_pylibcudf(mode="read"), + min_token_length, + py_replacement.device_value.c_value, + py_delimiter.device_value.c_value, ) - - return Column.from_unique_ptr(move(c_result)) + ) diff --git a/python/cudf/cudf/_lib/nvtext/stemmer.pyx b/python/cudf/cudf/_lib/nvtext/stemmer.pyx index 5bf25562fed..63a389b64d5 100644 --- a/python/cudf/cudf/_lib/nvtext/stemmer.pyx +++ b/python/cudf/cudf/_lib/nvtext/stemmer.pyx @@ -1,24 +1,19 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. 
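In the stemmer rewrite below, the old cast through letter_type becomes a plain boolean argument: pylibcudf's is_letter receives ltype == LetterType.VOWEL, i.e. True to test for vowels and False to test for consonants, for both the scalar-index and column-index variants. A small sketch of that mapping (enum values assumed to mirror libcudf's letter_type):

from enum import IntEnum

class LetterType(IntEnum):
    # Values assumed; the class kept in this file takes them from
    # pylibcudf.libcudf.nvtext.stemmer.letter_type.
    CONSONANT = 0
    VOWEL = 1

def check_vowels(ltype):
    # Mirrors the `ltype == LetterType.VOWEL` expression in the diff below.
    return ltype == LetterType.VOWEL

assert check_vowels(LetterType.VOWEL)
assert not check_vowels(LetterType.CONSONANT)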
-from cudf.core.buffer import acquire_spill_lock - -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - from enum import IntEnum -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view +from cudf.core.buffer import acquire_spill_lock + from pylibcudf.libcudf.nvtext.stemmer cimport ( - is_letter as cpp_is_letter, letter_type, - porter_stemmer_measure as cpp_porter_stemmer_measure, underlying_type_t_letter_type, ) from pylibcudf.libcudf.types cimport size_type from cudf._lib.column cimport Column +from pylibcudf import nvtext + class LetterType(IntEnum): CONSONANT = letter_type.CONSONANT @@ -27,43 +22,34 @@ class LetterType(IntEnum): @acquire_spill_lock() def porter_stemmer_measure(Column strings): - cdef column_view c_strings = strings.view() - cdef unique_ptr[column] c_result - - with nogil: - c_result = move(cpp_porter_stemmer_measure(c_strings)) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf( + nvtext.stemmer.porter_stemmer_measure( + strings.to_pylibcudf(mode="read"), + ) + ) @acquire_spill_lock() def is_letter(Column strings, object ltype, size_type index): - cdef column_view c_strings = strings.view() - cdef letter_type c_ltype = ( - ltype + return Column.from_pylibcudf( + nvtext.stemmer.is_letter( + strings.to_pylibcudf(mode="read"), + ltype==LetterType.VOWEL, + index, + ) ) - cdef unique_ptr[column] c_result - - with nogil: - c_result = move(cpp_is_letter(c_strings, c_ltype, index)) - - return Column.from_unique_ptr(move(c_result)) @acquire_spill_lock() def is_letter_multi(Column strings, object ltype, Column indices): - cdef column_view c_strings = strings.view() - cdef column_view c_indices = indices.view() - cdef letter_type c_ltype = ( - ltype + return Column.from_pylibcudf( + nvtext.stemmer.is_letter( + strings.to_pylibcudf(mode="read"), + ltype==LetterType.VOWEL, + indices.to_pylibcudf(mode="read"), + ) ) - cdef unique_ptr[column] c_result - - with nogil: - c_result = move(cpp_is_letter(c_strings, c_ltype, c_indices)) - - return Column.from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/_lib/nvtext/subword_tokenize.pyx b/python/cudf/cudf/_lib/nvtext/subword_tokenize.pyx index ee442ece5c6..5e0bfb74705 100644 --- a/python/cudf/cudf/_lib/nvtext/subword_tokenize.pyx +++ b/python/cudf/cudf/_lib/nvtext/subword_tokenize.pyx @@ -5,35 +5,16 @@ from libc.stdint cimport uint32_t from cudf.core.buffer import acquire_spill_lock from libcpp cimport bool -from libcpp.memory cimport unique_ptr -from libcpp.string cimport string -from libcpp.utility cimport move - -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.nvtext.subword_tokenize cimport ( - hashed_vocabulary as cpp_hashed_vocabulary, - load_vocabulary_file as cpp_load_vocabulary_file, - move as tr_move, - subword_tokenize as cpp_subword_tokenize, - tokenizer_result as cpp_tokenizer_result, -) from cudf._lib.column cimport Column - -cdef class Hashed_Vocabulary: - cdef unique_ptr[cpp_hashed_vocabulary] c_obj - - def __cinit__(self, hash_file): - cdef string c_hash_file = str(hash_file).encode() - with nogil: - self.c_obj = move(cpp_load_vocabulary_file(c_hash_file)) +from pylibcudf import nvtext @acquire_spill_lock() def subword_tokenize_inmem_hash( Column strings, - Hashed_Vocabulary hashed_vocabulary, + object hashed_vocabulary, uint32_t max_sequence_length=64, uint32_t stride=48, bool do_lower=True, @@ -42,21 +23,16 @@ def subword_tokenize_inmem_hash( """ Subword tokenizes 
text series by using the pre-loaded hashed vocabulary """ - cdef column_view c_strings = strings.view() - cdef cpp_tokenizer_result c_result - with nogil: - c_result = tr_move( - cpp_subword_tokenize( - c_strings, - hashed_vocabulary.c_obj.get()[0], - max_sequence_length, - stride, - do_lower, - do_truncate, - ) - ) + result = nvtext.subword_tokenize.subword_tokenize( + strings.to_pylibcudf(mode="read"), + hashed_vocabulary, + max_sequence_length, + stride, + do_lower, + do_truncate, + ) # return the 3 tensor components - tokens = Column.from_unique_ptr(move(c_result.tensor_token_ids)) - masks = Column.from_unique_ptr(move(c_result.tensor_attention_mask)) - metadata = Column.from_unique_ptr(move(c_result.tensor_metadata)) + tokens = Column.from_pylibcudf(result[0]) + masks = Column.from_pylibcudf(result[1]) + metadata = Column.from_pylibcudf(result[2]) return tokens, masks, metadata diff --git a/python/cudf/cudf/_lib/nvtext/tokenize.pyx b/python/cudf/cudf/_lib/nvtext/tokenize.pyx index a7e63f1e9ae..f473c48e2f7 100644 --- a/python/cudf/cudf/_lib/nvtext/tokenize.pyx +++ b/python/cudf/cudf/_lib/nvtext/tokenize.pyx @@ -2,162 +2,85 @@ from cudf.core.buffer import acquire_spill_lock -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.nvtext.tokenize cimport ( - character_tokenize as cpp_character_tokenize, - count_tokens as cpp_count_tokens, - detokenize as cpp_detokenize, - load_vocabulary as cpp_load_vocabulary, - tokenize as cpp_tokenize, - tokenize_vocabulary as cpp_tokenize_vocabulary, - tokenize_with_vocabulary as cpp_tokenize_with_vocabulary, -) -from pylibcudf.libcudf.scalar.scalar cimport string_scalar from pylibcudf.libcudf.types cimport size_type +from pylibcudf.nvtext.tokenize import TokenizeVocabulary # no-cython-lint + from cudf._lib.column cimport Column -from cudf._lib.scalar cimport DeviceScalar + +from pylibcudf import nvtext @acquire_spill_lock() def _tokenize_scalar(Column strings, object py_delimiter): - - cdef DeviceScalar delimiter = py_delimiter.device_value - - cdef column_view c_strings = strings.view() - cdef const string_scalar* c_delimiter = delimiter\ - .get_raw_ptr() - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_tokenize( - c_strings, - c_delimiter[0], - ) + return Column.from_pylibcudf( + nvtext.tokenize.tokenize_scalar( + strings.to_pylibcudf(mode="read"), + py_delimiter.device_value.c_value ) - - return Column.from_unique_ptr(move(c_result)) + ) @acquire_spill_lock() def _tokenize_column(Column strings, Column delimiters): - cdef column_view c_strings = strings.view() - cdef column_view c_delimiters = delimiters.view() - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_tokenize( - c_strings, - c_delimiters - ) + return Column.from_pylibcudf( + nvtext.tokenize.tokenize_column( + strings.to_pylibcudf(mode="read"), + delimiters.to_pylibcudf(mode="read"), ) - - return Column.from_unique_ptr(move(c_result)) + ) @acquire_spill_lock() def _count_tokens_scalar(Column strings, object py_delimiter): - - cdef DeviceScalar delimiter = py_delimiter.device_value - - cdef column_view c_strings = strings.view() - cdef const string_scalar* c_delimiter = delimiter\ - .get_raw_ptr() - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_count_tokens( - c_strings, - c_delimiter[0] - ) + return Column.from_pylibcudf( + 
nvtext.tokenize.count_tokens_scalar( + strings.to_pylibcudf(mode="read"), + py_delimiter.device_value.c_value ) - - return Column.from_unique_ptr(move(c_result)) + ) @acquire_spill_lock() def _count_tokens_column(Column strings, Column delimiters): - cdef column_view c_strings = strings.view() - cdef column_view c_delimiters = delimiters.view() - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_count_tokens( - c_strings, - c_delimiters - ) + return Column.from_pylibcudf( + nvtext.tokenize.count_tokens_column( + strings.to_pylibcudf(mode="read"), + delimiters.to_pylibcudf(mode="read") ) - - return Column.from_unique_ptr(move(c_result)) + ) @acquire_spill_lock() def character_tokenize(Column strings): - cdef column_view c_strings = strings.view() - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_character_tokenize(c_strings) + return Column.from_pylibcudf( + nvtext.tokenize.character_tokenize( + strings.to_pylibcudf(mode="read") ) - - return Column.from_unique_ptr(move(c_result)) + ) @acquire_spill_lock() def detokenize(Column strings, Column indices, object py_separator): - - cdef DeviceScalar separator = py_separator.device_value - - cdef column_view c_strings = strings.view() - cdef column_view c_indices = indices.view() - cdef const string_scalar* c_separator = separator\ - .get_raw_ptr() - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_detokenize(c_strings, c_indices, c_separator[0]) + return Column.from_pylibcudf( + nvtext.tokenize.detokenize( + strings.to_pylibcudf(mode="read"), + indices.to_pylibcudf(mode="read"), + py_separator.device_value.c_value ) - - return Column.from_unique_ptr(move(c_result)) - - -cdef class TokenizeVocabulary: - cdef unique_ptr[cpp_tokenize_vocabulary] c_obj - - def __cinit__(self, Column vocab): - cdef column_view c_vocab = vocab.view() - with nogil: - self.c_obj = move(cpp_load_vocabulary(c_vocab)) + ) @acquire_spill_lock() def tokenize_with_vocabulary(Column strings, - TokenizeVocabulary vocabulary, + object vocabulary, object py_delimiter, size_type default_id): - - cdef DeviceScalar delimiter = py_delimiter.device_value - cdef column_view c_strings = strings.view() - cdef const string_scalar* c_delimiter = delimiter\ - .get_raw_ptr() - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_tokenize_with_vocabulary( - c_strings, - vocabulary.c_obj.get()[0], - c_delimiter[0], - default_id - ) + return Column.from_pylibcudf( + nvtext.tokenize.tokenize_with_vocabulary( + strings.to_pylibcudf(mode="read"), + vocabulary, + py_delimiter.device_value.c_value, + default_id ) - - return Column.from_unique_ptr(move(c_result)) + ) diff --git a/python/cudf/cudf/_lib/orc.pyx b/python/cudf/cudf/_lib/orc.pyx index f88c48ce989..32a5e463916 100644 --- a/python/cudf/cudf/_lib/orc.pyx +++ b/python/cudf/cudf/_lib/orc.pyx @@ -15,7 +15,6 @@ try: except ImportError: import json -cimport pylibcudf.libcudf.io.types as cudf_io_types cimport pylibcudf.libcudf.lists.lists_column_view as cpp_lists_column_view from pylibcudf.libcudf.io.data_sink cimport data_sink from pylibcudf.libcudf.io.orc cimport ( @@ -26,7 +25,6 @@ from pylibcudf.libcudf.io.orc cimport ( ) from pylibcudf.libcudf.io.types cimport ( column_in_metadata, - compression_type, sink_info, table_input_metadata, ) @@ -137,22 +135,23 @@ cpdef read_orc(object filepaths_or_buffers, return data, index -cdef compression_type _get_comp_type(object compression): +def _get_comp_type(object compression): if compression is None or 
compression is False: - return compression_type.NONE + return plc.io.types.CompressionType.NONE compression = str(compression).upper() if compression == "SNAPPY": - return compression_type.SNAPPY + return plc.io.types.CompressionType.SNAPPY elif compression == "ZLIB": - return compression_type.ZLIB + return plc.io.types.CompressionType.ZLIB elif compression == "ZSTD": - return compression_type.ZSTD + return plc.io.types.CompressionType.ZSTD elif compression == "LZ4": - return compression_type.LZ4 + return plc.io.types.CompressionType.LZ4 else: raise ValueError(f"Unsupported `compression` type {compression}") + cdef tuple _get_index_from_metadata( vector[map[string, string]] user_data, object names, @@ -210,7 +209,8 @@ cdef tuple _get_index_from_metadata( range_idx ) -cdef cudf_io_types.statistics_freq _get_orc_stat_freq(object statistics): + +def _get_orc_stat_freq(str statistics): """ Convert ORC statistics terms to CUDF convention: - ORC "STRIPE" == CUDF "ROWGROUP" @@ -218,11 +218,11 @@ cdef cudf_io_types.statistics_freq _get_orc_stat_freq(object statistics): """ statistics = str(statistics).upper() if statistics == "NONE": - return cudf_io_types.statistics_freq.STATISTICS_NONE + return plc.io.types.StatisticsFreq.STATISTICS_NONE elif statistics == "STRIPE": - return cudf_io_types.statistics_freq.STATISTICS_ROWGROUP + return plc.io.types.StatisticsFreq.STATISTICS_ROWGROUP elif statistics == "ROWGROUP": - return cudf_io_types.statistics_freq.STATISTICS_PAGE + return plc.io.types.StatisticsFreq.STATISTICS_PAGE else: raise ValueError(f"Unsupported `statistics_freq` type {statistics}") @@ -232,7 +232,7 @@ def write_orc( table, object path_or_buf, object compression="snappy", - object statistics="ROWGROUP", + str statistics="ROWGROUP", object stripe_size_bytes=None, object stripe_size_rows=None, object row_index_stride=None, @@ -246,7 +246,6 @@ def write_orc( -------- cudf.read_orc """ - cdef compression_type compression_ = _get_comp_type(compression) cdef unique_ptr[data_sink] data_sink_c cdef sink_info sink_info_c = make_sink_info(path_or_buf, data_sink_c) cdef table_input_metadata tbl_meta @@ -289,7 +288,7 @@ def write_orc( sink_info_c, tv ).metadata(tbl_meta) .key_value_metadata(move(user_data)) - .compression(compression_) + .compression(_get_comp_type(compression)) .enable_statistics(_get_orc_stat_freq(statistics)) .build() ) @@ -330,8 +329,8 @@ cdef class ORCWriter: cdef unique_ptr[orc_chunked_writer] writer cdef sink_info sink cdef unique_ptr[data_sink] _data_sink - cdef cudf_io_types.statistics_freq stat_freq - cdef compression_type comp_type + cdef str statistics + cdef object compression cdef object index cdef table_input_metadata tbl_meta cdef object cols_as_map_type @@ -343,15 +342,15 @@ cdef class ORCWriter: object path, object index=None, object compression="snappy", - object statistics="ROWGROUP", + str statistics="ROWGROUP", object cols_as_map_type=None, object stripe_size_bytes=None, object stripe_size_rows=None, object row_index_stride=None): self.sink = make_sink_info(path, self._data_sink) - self.stat_freq = _get_orc_stat_freq(statistics) - self.comp_type = _get_comp_type(compression) + self.statistics = statistics + self.compression = compression self.index = index self.cols_as_map_type = cols_as_map_type \ if cols_as_map_type is None else set(cols_as_map_type) @@ -429,8 +428,8 @@ cdef class ORCWriter: chunked_orc_writer_options.builder(self.sink) .metadata(self.tbl_meta) .key_value_metadata(move(user_data)) - .compression(self.comp_type) - 
.enable_statistics(self.stat_freq) + .compression(_get_comp_type(self.compression)) + .enable_statistics(_get_orc_stat_freq(self.statistics)) .build() ) if self.stripe_size_bytes is not None: diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index fa2690c7f21..d4bd0cd306c 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -27,14 +27,12 @@ from libcpp cimport bool from libcpp.map cimport map from libcpp.memory cimport make_unique, unique_ptr from libcpp.string cimport string -from libcpp.unordered_map cimport unordered_map from libcpp.utility cimport move from libcpp.vector cimport vector -cimport pylibcudf.libcudf.io.data_sink as cudf_io_data_sink -cimport pylibcudf.libcudf.io.types as cudf_io_types from pylibcudf.expressions cimport Expression from pylibcudf.io.parquet cimport ChunkedParquetReader +from pylibcudf.libcudf.io.data_sink cimport data_sink from pylibcudf.libcudf.io.parquet cimport ( chunked_parquet_writer_options, merge_row_group_metadata as parquet_merge_metadata, @@ -42,13 +40,14 @@ from pylibcudf.libcudf.io.parquet cimport ( parquet_writer_options, write_parquet as parquet_writer, ) -from pylibcudf.libcudf.io.parquet_metadata cimport ( - parquet_metadata, - read_parquet_metadata as parquet_metadata_reader, -) from pylibcudf.libcudf.io.types cimport ( + sink_info, column_in_metadata, table_input_metadata, + partition_info, + statistics_freq, + compression_type, + dictionary_policy, ) from pylibcudf.libcudf.table.table_view cimport table_view from pylibcudf.libcudf.types cimport size_type @@ -57,7 +56,6 @@ from cudf._lib.column cimport Column from cudf._lib.io.utils cimport ( add_df_col_struct_names, make_sinks_info, - make_source_info, ) from cudf._lib.utils cimport table_view_from_table @@ -368,7 +366,7 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None, nrows=nrows, skip_rows=skip_rows) return df -cpdef read_parquet_metadata(filepaths_or_buffers): +cpdef read_parquet_metadata(list filepaths_or_buffers): """ Cython function to call into libcudf API, see `read_parquet_metadata`. 
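The rewritten body below builds the same 5-tuple as before, but from the pylibcudf ParquetMetadata object rather than the raw libcudf accessors, so the decode() calls and the C++ footer map disappear. A hedged usage sketch of those accessors (the file name is hypothetical, and metadata() is assumed to behave as a plain dict, as its subscripting in the diff suggests):

import pylibcudf as plc

meta = plc.io.parquet_metadata.read_parquet_metadata(
    plc.io.SourceInfo(["example.parquet"])  # hypothetical path
)

num_rows = meta.num_rows()
num_rowgroups = meta.num_rowgroups()
# Column names arrive as Python str, no .decode() needed.
col_names = [info.name() for info in meta.schema().root().children()]
# The 'pandas' footer entry holds the index metadata as a JSON string.
pandas_json = meta.metadata().get("pandas", "")
# Per-row-group metadata comes back as a list of dicts.
row_groups = meta.rowgroup_metadata()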
@@ -377,56 +375,40 @@ cpdef read_parquet_metadata(filepaths_or_buffers): cudf.io.parquet.read_parquet cudf.io.parquet.to_parquet """ - cdef cudf_io_types.source_info source = make_source_info(filepaths_or_buffers) - - args = move(source) - - cdef parquet_metadata c_result - - # Read Parquet metadata - with nogil: - c_result = move(parquet_metadata_reader(args)) - - # access and return results - num_rows = c_result.num_rows() - num_rowgroups = c_result.num_rowgroups() - - # extract row group metadata and sanitize keys - row_group_metadata = [{k.decode(): v for k, v in metadata} - for metadata in c_result.rowgroup_metadata()] + parquet_metadata = plc.io.parquet_metadata.read_parquet_metadata( + plc.io.SourceInfo(filepaths_or_buffers) + ) # read all column names including index column, if any - col_names = [info.name().decode() for info in c_result.schema().root().children()] + col_names = [info.name() for info in parquet_metadata.schema().root().children()] - # access the Parquet file_footer to find the index - index_col = None - cdef unordered_map[string, string] file_footer = c_result.metadata() - - # get index column name(s) - index_col_names = None - json_str = file_footer[b'pandas'].decode('utf-8') - meta = None + index_col_names = set() + json_str = parquet_metadata.metadata().get('pandas', '') if json_str != "": meta = json.loads(json_str) file_is_range_index, index_col, _ = _parse_metadata(meta) - if not file_is_range_index and index_col is not None \ - and index_col_names is None: - index_col_names = {} + if ( + not file_is_range_index + and index_col is not None + ): + columns = meta['columns'] for idx_col in index_col: - for c in meta['columns']: + for c in columns: if c['field_name'] == idx_col: - index_col_names[idx_col] = c['name'] + index_col_names.add(idx_col) # remove the index column from the list of column names - # only if index_col_names is not None + # only if index_col_names is non-empty - if index_col_names is not None: + if index_col_names: col_names = [name for name in col_names if name not in index_col_names] - # num_columns = length of list(col_names) - num_columns = len(col_names) - - # return the metadata - return num_rows, num_rowgroups, col_names, num_columns, row_group_metadata + return ( + parquet_metadata.num_rows(), + parquet_metadata.num_rowgroups(), + col_names, + len(col_names), + parquet_metadata.rowgroup_metadata() + ) @acquire_spill_lock() @@ -466,8 +448,8 @@ def write_parquet( cdef vector[map[string, string]] user_data cdef table_view tv - cdef vector[unique_ptr[cudf_io_data_sink.data_sink]] _data_sinks - cdef cudf_io_types.sink_info sink = make_sinks_info( + cdef vector[unique_ptr[data_sink]] _data_sinks + cdef sink_info sink = make_sinks_info( filepaths_or_buffers, _data_sinks ) @@ -531,19 +513,19 @@ def write_parquet( "Valid values are '1.0' and '2.0'" ) - cdef cudf_io_types.dictionary_policy dict_policy = ( - cudf_io_types.dictionary_policy.ADAPTIVE + dict_policy = ( + plc.io.types.DictionaryPolicy.ADAPTIVE if use_dictionary - else cudf_io_types.dictionary_policy.NEVER + else plc.io.types.DictionaryPolicy.NEVER ) - cdef cudf_io_types.compression_type comp_type = _get_comp_type(compression) - cdef cudf_io_types.statistics_freq stat_freq = _get_stat_freq(statistics) + comp_type = _get_comp_type(compression) + stat_freq = _get_stat_freq(statistics) cdef unique_ptr[vector[uint8_t]] out_metadata_c cdef vector[string] c_column_chunks_file_paths cdef bool _int96_timestamps = int96_timestamps - cdef vector[cudf_io_types.partition_info] partitions + cdef vector[partition_info] partitions # 
Perform write cdef parquet_writer_options args = move( @@ -563,7 +545,7 @@ def write_parquet( partitions.reserve(len(partitions_info)) for part in partitions_info: partitions.push_back( - cudf_io_types.partition_info(part[0], part[1]) + partition_info(part[0], part[1]) ) args.set_partitions(move(partitions)) if metadata_file_path is not None: @@ -646,17 +628,17 @@ cdef class ParquetWriter: cdef bool initialized cdef unique_ptr[cpp_parquet_chunked_writer] writer cdef table_input_metadata tbl_meta - cdef cudf_io_types.sink_info sink - cdef vector[unique_ptr[cudf_io_data_sink.data_sink]] _data_sink - cdef cudf_io_types.statistics_freq stat_freq - cdef cudf_io_types.compression_type comp_type + cdef sink_info sink + cdef vector[unique_ptr[data_sink]] _data_sink + cdef str statistics + cdef object compression cdef object index cdef size_t row_group_size_bytes cdef size_type row_group_size_rows cdef size_t max_page_size_bytes cdef size_type max_page_size_rows cdef size_t max_dictionary_size - cdef cudf_io_types.dictionary_policy dict_policy + cdef bool use_dictionary cdef bool write_arrow_schema def __cinit__(self, object filepath_or_buffer, object index=None, @@ -674,8 +656,8 @@ cdef class ParquetWriter: else [filepath_or_buffer] ) self.sink = make_sinks_info(filepaths_or_buffers, self._data_sink) - self.stat_freq = _get_stat_freq(statistics) - self.comp_type = _get_comp_type(compression) + self.statistics = statistics + self.compression = compression self.index = index self.initialized = False self.row_group_size_bytes = row_group_size_bytes @@ -683,11 +665,7 @@ cdef class ParquetWriter: self.max_page_size_bytes = max_page_size_bytes self.max_page_size_rows = max_page_size_rows self.max_dictionary_size = max_dictionary_size - self.dict_policy = ( - cudf_io_types.dictionary_policy.ADAPTIVE - if use_dictionary - else cudf_io_types.dictionary_policy.NEVER - ) + self.use_dictionary = use_dictionary self.write_arrow_schema = store_schema def write_table(self, table, object partitions_info=None): @@ -706,11 +684,11 @@ cdef class ParquetWriter: else: tv = table_view_from_table(table, ignore_index=True) - cdef vector[cudf_io_types.partition_info] partitions + cdef vector[partition_info] partitions if partitions_info is not None: for part in partitions_info: partitions.push_back( - cudf_io_types.partition_info(part[0], part[1]) + partition_info(part[0], part[1]) ) with nogil: @@ -795,13 +773,20 @@ cdef class ParquetWriter: user_data = vector[map[string, string]](num_partitions, tmp_user_data) cdef chunked_parquet_writer_options args + cdef compression_type comp_type = _get_comp_type(self.compression) + cdef statistics_freq stat_freq = _get_stat_freq(self.statistics) + cdef dictionary_policy dict_policy = ( + plc.io.types.DictionaryPolicy.ADAPTIVE + if self.use_dictionary + else plc.io.types.DictionaryPolicy.NEVER + ) with nogil: args = move( chunked_parquet_writer_options.builder(self.sink) .metadata(self.tbl_meta) .key_value_metadata(move(user_data)) - .compression(self.comp_type) - .stats_level(self.stat_freq) + .compression(comp_type) + .stats_level(stat_freq) .row_group_size_bytes(self.row_group_size_bytes) .row_group_size_rows(self.row_group_size_rows) .max_page_size_bytes(self.max_page_size_bytes) @@ -810,7 +795,7 @@ cdef class ParquetWriter: .write_arrow_schema(self.write_arrow_schema) .build() ) - args.set_dictionary_policy(self.dict_policy) + args.set_dictionary_policy(dict_policy) self.writer.reset(new cpp_parquet_chunked_writer(args)) self.initialized = True @@ -838,56 +823,28 @@ cpdef 
merge_filemetadata(object filemetadata_list): return np.asarray(out_metadata_py) -cdef cudf_io_types.statistics_freq _get_stat_freq(object statistics): - statistics = str(statistics).upper() - if statistics == "NONE": - return cudf_io_types.statistics_freq.STATISTICS_NONE - elif statistics == "ROWGROUP": - return cudf_io_types.statistics_freq.STATISTICS_ROWGROUP - elif statistics == "PAGE": - return cudf_io_types.statistics_freq.STATISTICS_PAGE - elif statistics == "COLUMN": - return cudf_io_types.statistics_freq.STATISTICS_COLUMN - else: +cdef statistics_freq _get_stat_freq(str statistics): + result = getattr( + plc.io.types.StatisticsFreq, + f"STATISTICS_{statistics.upper()}", + None + ) + if result is None: raise ValueError("Unsupported `statistics_freq` type") + return result -cdef cudf_io_types.compression_type _get_comp_type(object compression): +cdef compression_type _get_comp_type(object compression): if compression is None: - return cudf_io_types.compression_type.NONE - - compression = str(compression).upper() - if compression == "SNAPPY": - return cudf_io_types.compression_type.SNAPPY - elif compression == "ZSTD": - return cudf_io_types.compression_type.ZSTD - elif compression == "LZ4": - return cudf_io_types.compression_type.LZ4 - else: + return plc.io.types.CompressionType.NONE + result = getattr( + plc.io.types.CompressionType, + str(compression).upper(), + None + ) + if result is None: raise ValueError("Unsupported `compression` type") - - -cdef cudf_io_types.column_encoding _get_encoding_type(object encoding): - if encoding is None: - return cudf_io_types.column_encoding.USE_DEFAULT - - enc = str(encoding).upper() - if enc == "PLAIN": - return cudf_io_types.column_encoding.PLAIN - elif enc == "DICTIONARY": - return cudf_io_types.column_encoding.DICTIONARY - elif enc == "DELTA_BINARY_PACKED": - return cudf_io_types.column_encoding.DELTA_BINARY_PACKED - elif enc == "DELTA_LENGTH_BYTE_ARRAY": - return cudf_io_types.column_encoding.DELTA_LENGTH_BYTE_ARRAY - elif enc == "DELTA_BYTE_ARRAY": - return cudf_io_types.column_encoding.DELTA_BYTE_ARRAY - elif enc == "BYTE_STREAM_SPLIT": - return cudf_io_types.column_encoding.BYTE_STREAM_SPLIT - elif enc == "USE_DEFAULT": - return cudf_io_types.column_encoding.USE_DEFAULT - else: - raise ValueError("Unsupported `column_encoding` type") + return result cdef _set_col_metadata( @@ -914,7 +871,15 @@ cdef _set_col_metadata( col_meta.set_skip_compression(True) if column_encoding is not None and full_path in column_encoding: - col_meta.set_encoding(_get_encoding_type(column_encoding[full_path])) + encoding = column_encoding[full_path] + if encoding is None: + c_encoding = plc.io.types.ColumnEncoding.USE_DEFAULT + else: + enc = str(encoding).upper() + c_encoding = getattr(plc.io.types.ColumnEncoding, enc, None) + if c_encoding is None: + raise ValueError("Unsupported `column_encoding` type") + col_meta.set_encoding(c_encoding) if column_type_length is not None and full_path in column_type_length: col_meta.set_output_as_binary(True) diff --git a/python/cudf/cudf/_lib/quantiles.pyx b/python/cudf/cudf/_lib/quantiles.pyx index 7666b7ff8da..509cfe5e9f8 100644 --- a/python/cudf/cudf/_lib/quantiles.pyx +++ b/python/cudf/cudf/_lib/quantiles.pyx @@ -6,14 +6,6 @@ from libcpp cimport bool from libcpp.vector cimport vector from cudf._lib.column cimport Column -from cudf._lib.types cimport ( - underlying_type_t_interpolation, - underlying_type_t_sorted, -) - -from cudf._lib.types import Interpolation - -from pylibcudf.libcudf.types cimport interpolation, 
sorted from cudf._lib.utils cimport columns_from_pylibcudf_table @@ -28,17 +20,13 @@ def quantile( Column ordered_indices, bool exact, ): - cdef interpolation c_interp = ( - Interpolation[interp.upper()] - ) - return Column.from_pylibcudf( plc.quantiles.quantile( input.to_pylibcudf(mode="read"), q, - c_interp, + plc.types.Interpolation[interp.upper()], ordered_indices.to_pylibcudf(mode="read"), - exact + exact ) ) @@ -51,22 +39,14 @@ def quantile_table( list column_order, list null_precedence, ): - - cdef interpolation c_interp = ( - interp - ) - cdef sorted c_is_input_sorted = ( - is_input_sorted - ) - return columns_from_pylibcudf_table( plc.quantiles.quantiles( plc.Table([ c.to_pylibcudf(mode="read") for c in source_columns ]), q, - c_interp, - c_is_input_sorted, + interp, + is_input_sorted, column_order, null_precedence ) diff --git a/python/cudf/cudf/_lib/scalar.pxd b/python/cudf/cudf/_lib/scalar.pxd index 27095ca02d4..0f9820ed1db 100644 --- a/python/cudf/cudf/_lib/scalar.pxd +++ b/python/cudf/cudf/_lib/scalar.pxd @@ -4,7 +4,7 @@ from libcpp cimport bool from libcpp.memory cimport unique_ptr from pylibcudf.libcudf.scalar.scalar cimport scalar -from rmm._lib.memory_resource cimport DeviceMemoryResource +from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource cdef class DeviceScalar: diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx index 0dde91316fb..56712402919 100644 --- a/python/cudf/cudf/_lib/scalar.pyx +++ b/python/cudf/cudf/_lib/scalar.pyx @@ -6,7 +6,6 @@ import numpy as np import pandas as pd import pyarrow as pa -from libc.stdint cimport int64_t from libcpp cimport bool from libcpp.memory cimport unique_ptr from libcpp.utility cimport move @@ -25,25 +24,7 @@ cimport pylibcudf.libcudf.types as libcudf_types # DeviceScalar is phased out entirely from cuDF Cython (at which point # cudf.Scalar will be directly backed by pylibcudf.Scalar). from pylibcudf cimport Scalar as plc_Scalar -from pylibcudf.libcudf.scalar.scalar cimport ( - duration_scalar, - list_scalar, - scalar, - struct_scalar, - timestamp_scalar, -) -from pylibcudf.libcudf.wrappers.durations cimport ( - duration_ms, - duration_ns, - duration_s, - duration_us, -) -from pylibcudf.libcudf.wrappers.timestamps cimport ( - timestamp_ms, - timestamp_ns, - timestamp_s, - timestamp_us, -) +from pylibcudf.libcudf.scalar.scalar cimport list_scalar, scalar, struct_scalar from cudf._lib.types cimport dtype_from_column_view, underlying_type_t_type_id @@ -284,62 +265,6 @@ cdef class DeviceScalar: ] -# TODO: Currently the only uses of this function and the one below are in -# _create_proxy_nat_scalar. See if that code path can be simplified to excise -# or at least simplify these implementations. 
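The helpers deleted below (together with _create_proxy_nat_scalar, removed further down) existed only to build NaT timestamp/duration scalars through raw libcudf constructors. A sketch of how an equivalent null scalar can be obtained without them; both routes are assumptions based on the usual convention that NaT maps to a null scalar, not code taken from this patch:

import numpy as np
import pyarrow as pa
import pylibcudf as plc

# Assumed route 1: a null pyarrow scalar of the target type, converted
# through pylibcudf's interop layer.
nat_plc = plc.interop.from_arrow(pa.scalar(None, type=pa.timestamp("ns")))

# Assumed route 2: let cudf's own scalar machinery interpret NaT.
from cudf._lib.scalar import as_device_scalar
nat_dev = as_device_scalar(np.datetime64("NaT", "ns"))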
-cdef _set_datetime64_from_np_scalar(unique_ptr[scalar]& s, - object value, - object dtype, - bool valid=True): - - value = value if valid else 0 - - if dtype == "datetime64[s]": - s.reset( - new timestamp_scalar[timestamp_s](np.int64(value), valid) - ) - elif dtype == "datetime64[ms]": - s.reset( - new timestamp_scalar[timestamp_ms](np.int64(value), valid) - ) - elif dtype == "datetime64[us]": - s.reset( - new timestamp_scalar[timestamp_us](np.int64(value), valid) - ) - elif dtype == "datetime64[ns]": - s.reset( - new timestamp_scalar[timestamp_ns](np.int64(value), valid) - ) - else: - raise ValueError(f"dtype not supported: {dtype}") - -cdef _set_timedelta64_from_np_scalar(unique_ptr[scalar]& s, - object value, - object dtype, - bool valid=True): - - value = value if valid else 0 - - if dtype == "timedelta64[s]": - s.reset( - new duration_scalar[duration_s](np.int64(value), valid) - ) - elif dtype == "timedelta64[ms]": - s.reset( - new duration_scalar[duration_ms](np.int64(value), valid) - ) - elif dtype == "timedelta64[us]": - s.reset( - new duration_scalar[duration_us](np.int64(value), valid) - ) - elif dtype == "timedelta64[ns]": - s.reset( - new duration_scalar[duration_ns](np.int64(value), valid) - ) - else: - raise ValueError(f"dtype not supported: {dtype}") - - def as_device_scalar(val, dtype=None): if isinstance(val, (cudf.Scalar, DeviceScalar)): if dtype == val.dtype or dtype is None: @@ -361,22 +286,3 @@ def _is_null_host_scalar(slr): return True else: return False - - -def _create_proxy_nat_scalar(dtype): - cdef DeviceScalar result = DeviceScalar.__new__(DeviceScalar) - - dtype = cudf.dtype(dtype) - if dtype.char in 'mM': - nat = dtype.type('NaT').astype(dtype) - if dtype.type == np.datetime64: - _set_datetime64_from_np_scalar( - ( result.c_value).c_obj, nat, dtype, True - ) - elif dtype.type == np.timedelta64: - _set_timedelta64_from_np_scalar( - ( result.c_value).c_obj, nat, dtype, True - ) - return result - else: - raise TypeError('NAT only valid for datetime and timedelta') diff --git a/python/cudf/cudf/_lib/sort.pyx b/python/cudf/cudf/_lib/sort.pyx index 185552ede82..eefe37d9880 100644 --- a/python/cudf/cudf/_lib/sort.pyx +++ b/python/cudf/cudf/_lib/sort.pyx @@ -5,21 +5,10 @@ from itertools import repeat from cudf.core.buffer import acquire_spill_lock from libcpp cimport bool -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move -from libcpp.vector cimport vector from pylibcudf.libcudf.aggregation cimport rank_method -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.search cimport lower_bound, upper_bound -from pylibcudf.libcudf.table.table_view cimport table_view -from pylibcudf.libcudf.types cimport null_order, order as cpp_order - from cudf._lib.column cimport Column -from cudf._lib.utils cimport ( - columns_from_pylibcudf_table, - table_view_from_columns, -) +from cudf._lib.utils cimport columns_from_pylibcudf_table import pylibcudf @@ -311,44 +300,19 @@ def digitize(list source_columns, list bins, bool right=False): right : Indicating whether the intervals include the right or the left bin edge. 
""" - - cdef table_view bins_view = table_view_from_columns(bins) - cdef table_view source_table_view = table_view_from_columns( - source_columns - ) - cdef vector[cpp_order] column_order = ( - vector[cpp_order]( - bins_view.num_columns(), - cpp_order.ASCENDING - ) - ) - cdef vector[null_order] null_precedence = ( - vector[null_order]( - bins_view.num_columns(), - null_order.BEFORE + return Column.from_pylibcudf( + getattr(pylibcudf.search, "lower_bound" if right else "upper_bound")( + pylibcudf.Table( + [c.to_pylibcudf(mode="read") for c in bins] + ), + pylibcudf.Table( + [c.to_pylibcudf(mode="read") for c in source_columns] + ), + [pylibcudf.types.Order.ASCENDING]*len(bins), + [pylibcudf.types.NullOrder.BEFORE]*len(bins) ) ) - cdef unique_ptr[column] c_result - if right: - with nogil: - c_result = move(lower_bound( - bins_view, - source_table_view, - column_order, - null_precedence) - ) - else: - with nogil: - c_result = move(upper_bound( - bins_view, - source_table_view, - column_order, - null_precedence) - ) - - return Column.from_unique_ptr(move(c_result)) - @acquire_spill_lock() def rank_columns(list source_columns, rank_method method, str na_option, diff --git a/python/cudf/cudf/_lib/string_casting.pyx b/python/cudf/cudf/_lib/string_casting.pyx index 60a6795a402..06ee07d8e2b 100644 --- a/python/cudf/cudf/_lib/string_casting.pyx +++ b/python/cudf/cudf/_lib/string_casting.pyx @@ -2,80 +2,27 @@ from cudf._lib.column cimport Column -from cudf._lib.scalar import as_device_scalar - -from cudf._lib.scalar cimport DeviceScalar - -from cudf._lib.types import SUPPORTED_NUMPY_TO_LIBCUDF_TYPES - -from libcpp.memory cimport unique_ptr -from libcpp.string cimport string -from libcpp.utility cimport move - -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.scalar.scalar cimport string_scalar -from pylibcudf.libcudf.strings.convert.convert_booleans cimport ( - from_booleans as cpp_from_booleans, - to_booleans as cpp_to_booleans, -) -from pylibcudf.libcudf.strings.convert.convert_datetime cimport ( - is_timestamp as cpp_is_timestamp, -) -from pylibcudf.libcudf.strings.convert.convert_floats cimport ( - from_floats as cpp_from_floats, - to_floats as cpp_to_floats, -) -from pylibcudf.libcudf.strings.convert.convert_integers cimport ( - from_integers as cpp_from_integers, - hex_to_integers as cpp_hex_to_integers, - integers_to_hex as cpp_integers_to_hex, - is_hex as cpp_is_hex, - to_integers as cpp_to_integers, -) -from pylibcudf.libcudf.strings.convert.convert_ipv4 cimport ( - integers_to_ipv4 as cpp_integers_to_ipv4, - ipv4_to_integers as cpp_ipv4_to_integers, - is_ipv4 as cpp_is_ipv4, -) -from pylibcudf.libcudf.types cimport data_type, type_id - -from cudf._lib.types cimport underlying_type_t_type_id - import pylibcudf as plc +from pylibcudf.types cimport DataType -import cudf +from cudf._lib.scalar import as_device_scalar from cudf._lib.types cimport dtype_to_pylibcudf_type def floating_to_string(Column input_col): - cdef column_view input_column_view = input_col.view() - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_from_floats( - input_column_view)) - - return Column.from_unique_ptr(move(c_result)) - - -def string_to_floating(Column input_col, object out_type): - cdef column_view input_column_view = input_col.view() - cdef unique_ptr[column] c_result - cdef type_id tid = ( - ( - SUPPORTED_NUMPY_TO_LIBCUDF_TYPES[out_type] - ) + plc_column = 
plc.strings.convert.convert_floats.from_floats( + input_col.to_pylibcudf(mode="read"), ) - cdef data_type c_out_type = data_type(tid) - with nogil: - c_result = move( - cpp_to_floats( - input_column_view, - c_out_type)) + return Column.from_pylibcudf(plc_column) + - return Column.from_unique_ptr(move(c_result)) +def string_to_floating(Column input_col, DataType out_type): + plc_column = plc.strings.convert.convert_floats.to_floats( + input_col.to_pylibcudf(mode="read"), + out_type + ) + return Column.from_pylibcudf(plc_column) def dtos(Column input_col): @@ -107,7 +54,7 @@ def stod(Column input_col): A Column with strings cast to double """ - return string_to_floating(input_col, cudf.dtype("float64")) + return string_to_floating(input_col, plc.DataType(plc.TypeId.FLOAT64)) def ftos(Column input_col): @@ -139,36 +86,22 @@ def stof(Column input_col): A Column with strings cast to float """ - return string_to_floating(input_col, cudf.dtype("float32")) + return string_to_floating(input_col, plc.DataType(plc.TypeId.FLOAT32)) def integer_to_string(Column input_col): - cdef column_view input_column_view = input_col.view() - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_from_integers( - input_column_view)) - - return Column.from_unique_ptr(move(c_result)) - - -def string_to_integer(Column input_col, object out_type): - cdef column_view input_column_view = input_col.view() - cdef unique_ptr[column] c_result - cdef type_id tid = ( - ( - SUPPORTED_NUMPY_TO_LIBCUDF_TYPES[out_type] - ) + plc_column = plc.strings.convert.convert_integers.from_integers( + input_col.to_pylibcudf(mode="read"), ) - cdef data_type c_out_type = data_type(tid) - with nogil: - c_result = move( - cpp_to_integers( - input_column_view, - c_out_type)) + return Column.from_pylibcudf(plc_column) - return Column.from_unique_ptr(move(c_result)) + +def string_to_integer(Column input_col, DataType out_type): + plc_column = plc.strings.convert.convert_integers.to_integers( + input_col.to_pylibcudf(mode="read"), + out_type + ) + return Column.from_pylibcudf(plc_column) def i8tos(Column input_col): @@ -200,7 +133,7 @@ def stoi8(Column input_col): A Column with strings cast to int8 """ - return string_to_integer(input_col, cudf.dtype("int8")) + return string_to_integer(input_col, plc.DataType(plc.TypeId.INT8)) def i16tos(Column input_col): @@ -232,7 +165,7 @@ def stoi16(Column input_col): A Column with strings cast to int16 """ - return string_to_integer(input_col, cudf.dtype("int16")) + return string_to_integer(input_col, plc.DataType(plc.TypeId.INT16)) def itos(Column input_col): @@ -264,7 +197,7 @@ def stoi(Column input_col): A Column with strings cast to int32 """ - return string_to_integer(input_col, cudf.dtype("int32")) + return string_to_integer(input_col, plc.DataType(plc.TypeId.INT32)) def ltos(Column input_col): @@ -296,7 +229,7 @@ def stol(Column input_col): A Column with strings cast to int64 """ - return string_to_integer(input_col, cudf.dtype("int64")) + return string_to_integer(input_col, plc.DataType(plc.TypeId.INT64)) def ui8tos(Column input_col): @@ -328,7 +261,7 @@ def stoui8(Column input_col): A Column with strings cast to uint8 """ - return string_to_integer(input_col, cudf.dtype("uint8")) + return string_to_integer(input_col, plc.DataType(plc.TypeId.UINT8)) def ui16tos(Column input_col): @@ -360,7 +293,7 @@ def stoui16(Column input_col): A Column with strings cast to uint16 """ - return string_to_integer(input_col, cudf.dtype("uint16")) + return string_to_integer(input_col, 
plc.DataType(plc.TypeId.UINT16)) def uitos(Column input_col): @@ -392,7 +325,7 @@ def stoui(Column input_col): A Column with strings cast to uint32 """ - return string_to_integer(input_col, cudf.dtype("uint32")) + return string_to_integer(input_col, plc.DataType(plc.TypeId.UINT32)) def ultos(Column input_col): @@ -424,80 +357,24 @@ def stoul(Column input_col): A Column with strings cast to uint64 """ - return string_to_integer(input_col, cudf.dtype("uint64")) - - -def _to_booleans(Column input_col, object string_true="True"): - """ - Converting/Casting input column of type string to boolean column - - Parameters - ---------- - input_col : input column of type string - string_true : string that represents True - - Returns - ------- - A Column with string values cast to boolean - """ - - cdef DeviceScalar str_true = as_device_scalar(string_true) - cdef column_view input_column_view = input_col.view() - cdef const string_scalar* string_scalar_true = ( - str_true.get_raw_ptr()) - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_to_booleans( - input_column_view, - string_scalar_true[0])) - - return Column.from_unique_ptr(move(c_result)) + return string_to_integer(input_col, plc.DataType(plc.TypeId.UINT64)) def to_booleans(Column input_col): - - return _to_booleans(input_col) - - -def _from_booleans( - Column input_col, - object string_true="True", - object string_false="False"): - """ - Converting/Casting input column of type boolean to string column - - Parameters - ---------- - input_col : input column of type boolean - string_true : string that represents True - string_false : string that represents False - - Returns - ------- - A Column with boolean values cast to string - """ - - cdef DeviceScalar str_true = as_device_scalar(string_true) - cdef DeviceScalar str_false = as_device_scalar(string_false) - cdef column_view input_column_view = input_col.view() - cdef const string_scalar* string_scalar_true = ( - str_true.get_raw_ptr()) - cdef const string_scalar* string_scalar_false = ( - str_false.get_raw_ptr()) - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_from_booleans( - input_column_view, - string_scalar_true[0], - string_scalar_false[0])) - - return Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.convert.convert_booleans.to_booleans( + input_col.to_pylibcudf(mode="read"), + as_device_scalar("True").c_value, + ) + return Column.from_pylibcudf(plc_column) def from_booleans(Column input_col): - return _from_booleans(input_col) + plc_column = plc.strings.convert.convert_booleans.from_booleans( + input_col.to_pylibcudf(mode="read"), + as_device_scalar("True").c_value, + as_device_scalar("False").c_value, + ) + return Column.from_pylibcudf(plc_column) def int2timestamp( @@ -520,11 +397,10 @@ def int2timestamp( A Column with date-time represented in string format """ - cdef string c_timestamp_format = format.encode("UTF-8") return Column.from_pylibcudf( plc.strings.convert.convert_datetime.from_timestamps( input_col.to_pylibcudf(mode="read"), - c_timestamp_format, + format, names.to_pylibcudf(mode="read") ) ) @@ -545,12 +421,11 @@ def timestamp2int(Column input_col, dtype, format): """ dtype = dtype_to_pylibcudf_type(dtype) - cdef string c_timestamp_format = format.encode('UTF-8') return Column.from_pylibcudf( plc.strings.convert.convert_datetime.to_timestamps( input_col.to_pylibcudf(mode="read"), dtype, - c_timestamp_format + format ) ) @@ -570,18 +445,11 @@ def istimestamp(Column input_col, str format): A Column of boolean 
values identifying strings that matched the format. """ - if input_col.size == 0: - return cudf.core.column.column_empty(0, dtype=cudf.dtype("bool")) - cdef column_view input_column_view = input_col.view() - cdef string c_timestamp_format = str(format).encode('UTF-8') - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_is_timestamp( - input_column_view, - c_timestamp_format)) - - return Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.convert.convert_datetime.is_timestamp( + input_col.to_pylibcudf(mode="read"), + format + ) + return Column.from_pylibcudf(plc_column) def timedelta2int(Column input_col, dtype, format): @@ -599,12 +467,11 @@ def timedelta2int(Column input_col, dtype, format): """ dtype = dtype_to_pylibcudf_type(dtype) - cdef string c_timestamp_format = format.encode('UTF-8') return Column.from_pylibcudf( plc.strings.convert.convert_durations.to_durations( input_col.to_pylibcudf(mode="read"), dtype, - c_timestamp_format + format ) ) @@ -623,12 +490,10 @@ def int2timedelta(Column input_col, str format): A Column with Timedelta represented in string format """ - - cdef string c_duration_format = format.encode('UTF-8') return Column.from_pylibcudf( plc.strings.convert.convert_durations.from_durations( input_col.to_pylibcudf(mode="read"), - c_duration_format + format ) ) @@ -646,14 +511,10 @@ def int2ip(Column input_col): A Column with integer represented in string ipv4 format """ - - cdef column_view input_column_view = input_col.view() - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_integers_to_ipv4(input_column_view)) - - return Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.convert.convert_ipv4.integers_to_ipv4( + input_col.to_pylibcudf(mode="read") + ) + return Column.from_pylibcudf(plc_column) def ip2int(Column input_col): @@ -669,14 +530,10 @@ def ip2int(Column input_col): A Column with ipv4 represented as integer """ - - cdef column_view input_column_view = input_col.view() - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_ipv4_to_integers(input_column_view)) - - return Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.convert.convert_ipv4.ipv4_to_integers( + input_col.to_pylibcudf(mode="read") + ) + return Column.from_pylibcudf(plc_column) def is_ipv4(Column source_strings): @@ -685,18 +542,13 @@ def is_ipv4(Column source_strings): that have strings in IPv4 format. This format is nnn.nnn.nnn.nnn where nnn is integer digits in [0,255]. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_is_ipv4( - source_view - )) - - return Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.convert.convert_ipv4.is_ipv4( + source_strings.to_pylibcudf(mode="read") + ) + return Column.from_pylibcudf(plc_column) -def htoi(Column input_col, **kwargs): +def htoi(Column input_col): """ Converting input column of type string having hex values to integer of out_type @@ -709,22 +561,11 @@ def htoi(Column input_col, **kwargs): ------- A Column of integers parsed from hexadecimal string values. 
""" - - cdef column_view input_column_view = input_col.view() - cdef type_id tid = ( - ( - SUPPORTED_NUMPY_TO_LIBCUDF_TYPES[cudf.dtype("int64")] - ) + plc_column = plc.strings.convert.convert_integers.hex_to_integers( + input_col.to_pylibcudf(mode="read"), + plc.DataType(plc.TypeId.INT64) ) - cdef data_type c_out_type = data_type(tid) - - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_hex_to_integers(input_column_view, - c_out_type)) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf(plc_column) def is_hex(Column source_strings): @@ -732,15 +573,10 @@ def is_hex(Column source_strings): Returns a Column of boolean values with True for `source_strings` that have hex characters. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_is_hex( - source_view - )) - - return Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.convert.convert_integers.is_hex( + source_strings.to_pylibcudf(mode="read"), + ) + return Column.from_pylibcudf(plc_column) def itoh(Column input_col): @@ -756,11 +592,7 @@ def itoh(Column input_col): ------- A Column of strings with hexadecimal characters. """ - - cdef column_view input_column_view = input_col.view() - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_integers_to_hex(input_column_view)) - - return Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.convert.convert_integers.integers_to_hex( + input_col.to_pylibcudf(mode="read"), + ) + return Column.from_pylibcudf(plc_column) diff --git a/python/cudf/cudf/_lib/strings/__init__.py b/python/cudf/cudf/_lib/strings/__init__.py index 4bf8a9b1a8f..4c0ec2d9ac5 100644 --- a/python/cudf/cudf/_lib/strings/__init__.py +++ b/python/cudf/cudf/_lib/strings/__init__.py @@ -9,6 +9,8 @@ from cudf._lib.nvtext.minhash import ( minhash, minhash64, + minhash64_permuted, + minhash_permuted, word_minhash, word_minhash64, ) @@ -71,16 +73,9 @@ startswith_multiple, ) from cudf._lib.strings.find_multiple import find_multiple -from cudf._lib.strings.findall import findall -from cudf._lib.strings.json import GetJsonObjectOptions, get_json_object -from cudf._lib.strings.padding import ( - SideType, - center, - ljust, - pad, - rjust, - zfill, -) +from cudf._lib.strings.findall import find_re, findall +from cudf._lib.strings.json import get_json_object +from cudf._lib.strings.padding import center, ljust, pad, rjust, zfill from cudf._lib.strings.repeat import repeat_scalar, repeat_sequence from cudf._lib.strings.replace import ( insert, diff --git a/python/cudf/cudf/_lib/strings/char_types.pyx b/python/cudf/cudf/_lib/strings/char_types.pyx index 376a6f8af97..a57ce29eb45 100644 --- a/python/cudf/cudf/_lib/strings/char_types.pyx +++ b/python/cudf/cudf/_lib/strings/char_types.pyx @@ -1,23 +1,12 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION. 
- from libcpp cimport bool -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move from cudf.core.buffer import acquire_spill_lock -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.scalar.scalar cimport string_scalar -from pylibcudf.libcudf.strings.char_types cimport ( - all_characters_of_type as cpp_all_characters_of_type, - filter_characters_of_type as cpp_filter_characters_of_type, - string_character_types, -) - from cudf._lib.column cimport Column -from cudf._lib.scalar cimport DeviceScalar + +from pylibcudf.strings import char_types @acquire_spill_lock() @@ -25,26 +14,15 @@ def filter_alphanum(Column source_strings, object py_repl, bool keep=True): """ Returns a Column of strings keeping only alphanumeric character types. """ - - cdef DeviceScalar repl = py_repl.device_value - - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - cdef const string_scalar* scalar_repl = ( - repl.get_raw_ptr() + plc_column = char_types.filter_characters_of_type( + source_strings.to_pylibcudf(mode="read"), + char_types.StringCharacterTypes.ALL_TYPES if keep + else char_types.StringCharacterTypes.ALPHANUM, + py_repl.device_value.c_value, + char_types.StringCharacterTypes.ALPHANUM if keep + else char_types.StringCharacterTypes.ALL_TYPES ) - - with nogil: - c_result = move(cpp_filter_characters_of_type( - source_view, - string_character_types.ALL_TYPES if keep - else string_character_types.ALPHANUM, - scalar_repl[0], - string_character_types.ALPHANUM if keep - else string_character_types.ALL_TYPES - )) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf(plc_column) @acquire_spill_lock() @@ -54,17 +32,12 @@ def is_decimal(Column source_strings): that contain only decimal characters -- those that can be used to extract base10 numbers. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_all_characters_of_type( - source_view, - string_character_types.DECIMAL, - string_character_types.ALL_TYPES - )) - - return Column.from_unique_ptr(move(c_result)) + plc_column = char_types.all_characters_of_type( + source_strings.to_pylibcudf(mode="read"), + char_types.StringCharacterTypes.DECIMAL, + char_types.StringCharacterTypes.ALL_TYPES + ) + return Column.from_pylibcudf(plc_column) @acquire_spill_lock() @@ -75,17 +48,12 @@ def is_alnum(Column source_strings): Equivalent to: is_alpha() or is_digit() or is_numeric() or is_decimal() """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_all_characters_of_type( - source_view, - string_character_types.ALPHANUM, - string_character_types.ALL_TYPES - )) - - return Column.from_unique_ptr(move(c_result)) + plc_column = char_types.all_characters_of_type( + source_strings.to_pylibcudf(mode="read"), + char_types.StringCharacterTypes.ALPHANUM, + char_types.StringCharacterTypes.ALL_TYPES + ) + return Column.from_pylibcudf(plc_column) @acquire_spill_lock() @@ -94,17 +62,12 @@ def is_alpha(Column source_strings): Returns a Column of boolean values with True for `source_strings` that contain only alphabetic characters. 
""" - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_all_characters_of_type( - source_view, - string_character_types.ALPHA, - string_character_types.ALL_TYPES - )) - - return Column.from_unique_ptr(move(c_result)) + plc_column = char_types.all_characters_of_type( + source_strings.to_pylibcudf(mode="read"), + char_types.StringCharacterTypes.ALPHA, + char_types.StringCharacterTypes.ALL_TYPES + ) + return Column.from_pylibcudf(plc_column) @acquire_spill_lock() @@ -113,17 +76,12 @@ def is_digit(Column source_strings): Returns a Column of boolean values with True for `source_strings` that contain only decimal and digit characters. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_all_characters_of_type( - source_view, - string_character_types.DIGIT, - string_character_types.ALL_TYPES - )) - - return Column.from_unique_ptr(move(c_result)) + plc_column = char_types.all_characters_of_type( + source_strings.to_pylibcudf(mode="read"), + char_types.StringCharacterTypes.DIGIT, + char_types.StringCharacterTypes.ALL_TYPES + ) + return Column.from_pylibcudf(plc_column) @acquire_spill_lock() @@ -133,17 +91,12 @@ def is_numeric(Column source_strings): that contain only numeric characters. These include digit and numeric characters. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_all_characters_of_type( - source_view, - string_character_types.NUMERIC, - string_character_types.ALL_TYPES - )) - - return Column.from_unique_ptr(move(c_result)) + plc_column = char_types.all_characters_of_type( + source_strings.to_pylibcudf(mode="read"), + char_types.StringCharacterTypes.NUMERIC, + char_types.StringCharacterTypes.ALL_TYPES + ) + return Column.from_pylibcudf(plc_column) @acquire_spill_lock() @@ -152,17 +105,12 @@ def is_upper(Column source_strings): Returns a Column of boolean values with True for `source_strings` that contain only upper-case characters. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_all_characters_of_type( - source_view, - string_character_types.UPPER, - string_character_types.CASE_TYPES - )) - - return Column.from_unique_ptr(move(c_result)) + plc_column = char_types.all_characters_of_type( + source_strings.to_pylibcudf(mode="read"), + char_types.StringCharacterTypes.UPPER, + char_types.StringCharacterTypes.CASE_TYPES + ) + return Column.from_pylibcudf(plc_column) @acquire_spill_lock() @@ -171,17 +119,12 @@ def is_lower(Column source_strings): Returns a Column of boolean values with True for `source_strings` that contain only lower-case characters. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_all_characters_of_type( - source_view, - string_character_types.LOWER, - string_character_types.CASE_TYPES - )) - - return Column.from_unique_ptr(move(c_result)) + plc_column = char_types.all_characters_of_type( + source_strings.to_pylibcudf(mode="read"), + char_types.StringCharacterTypes.LOWER, + char_types.StringCharacterTypes.CASE_TYPES + ) + return Column.from_pylibcudf(plc_column) @acquire_spill_lock() @@ -190,14 +133,9 @@ def is_space(Column source_strings): Returns a Column of boolean values with True for `source_strings` that contains all characters which are spaces only. 
""" - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_all_characters_of_type( - source_view, - string_character_types.SPACE, - string_character_types.ALL_TYPES - )) - - return Column.from_unique_ptr(move(c_result)) + plc_column = char_types.all_characters_of_type( + source_strings.to_pylibcudf(mode="read"), + char_types.StringCharacterTypes.SPACE, + char_types.StringCharacterTypes.ALL_TYPES + ) + return Column.from_pylibcudf(plc_column) diff --git a/python/cudf/cudf/_lib/strings/combine.pyx b/python/cudf/cudf/_lib/strings/combine.pyx index 76cc13db0da..0f7b27d85d7 100644 --- a/python/cudf/cudf/_lib/strings/combine.pyx +++ b/python/cudf/cudf/_lib/strings/combine.pyx @@ -2,24 +2,11 @@ from cudf.core.buffer import acquire_spill_lock -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move +from cudf._lib.column cimport Column -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.scalar.scalar cimport string_scalar -from pylibcudf.libcudf.strings.combine cimport ( - concatenate as cpp_concatenate, - join_list_elements as cpp_join_list_elements, - join_strings as cpp_join_strings, - output_if_empty_list, - separator_on_nulls, -) -from pylibcudf.libcudf.table.table_view cimport table_view +import pylibcudf as plc -from cudf._lib.column cimport Column -from cudf._lib.scalar cimport DeviceScalar -from cudf._lib.utils cimport table_view_from_columns +import cudf @acquire_spill_lock() @@ -31,26 +18,12 @@ def concatenate(list source_strings, with the specified `sep` between each column and `na`/`None` values are replaced by `na_rep` """ - cdef DeviceScalar separator = sep.device_value - cdef DeviceScalar narep = na_rep.device_value - - cdef unique_ptr[column] c_result - cdef table_view source_view = table_view_from_columns(source_strings) - - cdef const string_scalar* scalar_separator = \ - (separator.get_raw_ptr()) - cdef const string_scalar* scalar_narep = ( - narep.get_raw_ptr() + plc_column = plc.strings.combine.concatenate( + plc.Table([col.to_pylibcudf(mode="read") for col in source_strings]), + sep.device_value.c_value, + na_rep.device_value.c_value, ) - - with nogil: - c_result = move(cpp_concatenate( - source_view, - scalar_separator[0], - scalar_narep[0] - )) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf(plc_column) @acquire_spill_lock() @@ -62,27 +35,12 @@ def join(Column source_strings, with the specified `sep` between each column and `na`/`None` values are replaced by `na_rep` """ - - cdef DeviceScalar separator = sep.device_value - cdef DeviceScalar narep = na_rep.device_value - - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - cdef const string_scalar* scalar_separator = \ - (separator.get_raw_ptr()) - cdef const string_scalar* scalar_narep = ( - narep.get_raw_ptr() + plc_column = plc.strings.combine.join_strings( + source_strings.to_pylibcudf(mode="read"), + sep.device_value.c_value, + na_rep.device_value.c_value, ) - - with nogil: - c_result = move(cpp_join_strings( - source_view, - scalar_separator[0], - scalar_narep[0] - )) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf(plc_column) @acquire_spill_lock() @@ -96,29 +54,15 @@ def join_lists_with_scalar( between each string in lists and ``/`None` values are replaced by `py_narep` """ - - cdef DeviceScalar separator = py_separator.device_value - 
-    cdef DeviceScalar narep = py_narep.device_value
-
-    cdef unique_ptr[column] c_result
-    cdef column_view source_view = source_strings.view()
-
-    cdef const string_scalar* scalar_separator = \
-        <const string_scalar*>(separator.get_raw_ptr())
-    cdef const string_scalar* scalar_narep = <const string_scalar*>(
-        narep.get_raw_ptr()
+    plc_column = plc.strings.combine.join_list_elements(
+        source_strings.to_pylibcudf(mode="read"),
+        py_separator.device_value.c_value,
+        py_narep.device_value.c_value,
+        cudf._lib.scalar.DeviceScalar("", cudf.dtype("object")).c_value,
+        plc.strings.combine.SeparatorOnNulls.YES,
+        plc.strings.combine.OutputIfEmptyList.NULL_ELEMENT,
     )
-
-    with nogil:
-        c_result = move(cpp_join_list_elements(
-            source_view,
-            scalar_separator[0],
-            scalar_narep[0],
-            separator_on_nulls.YES,
-            output_if_empty_list.NULL_ELEMENT
-        ))
-
-    return Column.from_unique_ptr(move(c_result))
+    return Column.from_pylibcudf(plc_column)


 @acquire_spill_lock()
@@ -135,28 +79,12 @@ def join_lists_with_column(
     ``<NA>``/`None` values in `separator_strings` are
     replaced by `py_separator_narep`
     """
-
-    cdef DeviceScalar source_narep = py_source_narep.device_value
-    cdef DeviceScalar separator_narep = py_separator_narep.device_value
-
-    cdef unique_ptr[column] c_result
-    cdef column_view source_view = source_strings.view()
-    cdef column_view separator_view = separator_strings.view()
-
-    cdef const string_scalar* scalar_source_narep = \
-        <const string_scalar*>(source_narep.get_raw_ptr())
-    cdef const string_scalar* scalar_separator_narep = <const string_scalar*>(
-        separator_narep.get_raw_ptr()
+    plc_column = plc.strings.combine.join_list_elements(
+        source_strings.to_pylibcudf(mode="read"),
+        separator_strings.to_pylibcudf(mode="read"),
+        py_separator_narep.device_value.c_value,
+        py_source_narep.device_value.c_value,
+        plc.strings.combine.SeparatorOnNulls.YES,
+        plc.strings.combine.OutputIfEmptyList.NULL_ELEMENT,
     )
-
-    with nogil:
-        c_result = move(cpp_join_list_elements(
-            source_view,
-            separator_view,
-            scalar_separator_narep[0],
-            scalar_source_narep[0],
-            separator_on_nulls.YES,
-            output_if_empty_list.NULL_ELEMENT
-        ))
-
-    return Column.from_unique_ptr(move(c_result))
+    return Column.from_pylibcudf(plc_column)
diff --git a/python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx b/python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx
index a8df8c9a92c..96dcd021c3b 100644
--- a/python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx
+++ b/python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx
@@ -1,22 +1,11 @@
 # Copyright (c) 2021-2024, NVIDIA CORPORATION.

-import cudf
-
-from libcpp.memory cimport unique_ptr
-from libcpp.utility cimport move
-
 from cudf.core.buffer import acquire_spill_lock

-from pylibcudf.libcudf.column.column cimport column
-from pylibcudf.libcudf.column.column_view cimport column_view
-from pylibcudf.libcudf.strings.convert.convert_fixed_point cimport (
-    from_fixed_point as cpp_from_fixed_point,
-    is_fixed_point as cpp_is_fixed_point,
-    to_fixed_point as cpp_to_fixed_point,
-)
-from pylibcudf.libcudf.types cimport data_type, type_id
-
 from cudf._lib.column cimport Column
+from cudf._lib.types cimport dtype_to_pylibcudf_type
+
+import pylibcudf as plc


 @acquire_spill_lock()
@@ -32,14 +21,10 @@ def from_decimal(Column input_col):
     -------
     A column of strings representing the input decimal values.
""" - cdef column_view input_column_view = input_col.view() - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_from_fixed_point( - input_column_view)) - - return Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.convert.convert_fixed_point.from_fixed_point( + input_col.to_pylibcudf(mode="read"), + ) + return Column.from_pylibcudf(plc_column) @acquire_spill_lock() @@ -57,25 +42,11 @@ def to_decimal(Column input_col, object out_type): ------- A column of decimals parsed from the string values. """ - cdef column_view input_column_view = input_col.view() - cdef unique_ptr[column] c_result - cdef int scale = out_type.scale - cdef data_type c_out_type - if isinstance(out_type, cudf.Decimal32Dtype): - c_out_type = data_type(type_id.DECIMAL32, -scale) - elif isinstance(out_type, cudf.Decimal64Dtype): - c_out_type = data_type(type_id.DECIMAL64, -scale) - elif isinstance(out_type, cudf.Decimal128Dtype): - c_out_type = data_type(type_id.DECIMAL128, -scale) - else: - raise TypeError("should be a decimal dtype") - with nogil: - c_result = move( - cpp_to_fixed_point( - input_column_view, - c_out_type)) - - result = Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.convert.convert_fixed_point.to_fixed_point( + input_col.to_pylibcudf(mode="read"), + dtype_to_pylibcudf_type(out_type), + ) + result = Column.from_pylibcudf(plc_column) result.dtype.precision = out_type.precision return result @@ -98,14 +69,8 @@ def is_fixed_point(Column input_col, object dtype): ------- A Column of booleans indicating valid decimal conversion. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = input_col.view() - cdef int scale = dtype.scale - cdef data_type c_dtype = data_type(type_id.DECIMAL64, -scale) - with nogil: - c_result = move(cpp_is_fixed_point( - source_view, - c_dtype - )) - - return Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.convert.convert_fixed_point.is_fixed_point( + input_col.to_pylibcudf(mode="read"), + dtype_to_pylibcudf_type(dtype), + ) + return Column.from_pylibcudf(plc_column) diff --git a/python/cudf/cudf/_lib/strings/convert/convert_floats.pyx b/python/cudf/cudf/_lib/strings/convert/convert_floats.pyx index 7965b588703..5da6e3f10cc 100644 --- a/python/cudf/cudf/_lib/strings/convert/convert_floats.pyx +++ b/python/cudf/cudf/_lib/strings/convert/convert_floats.pyx @@ -1,18 +1,11 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION. -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - from cudf.core.buffer import acquire_spill_lock -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.strings.convert.convert_floats cimport ( - is_float as cpp_is_float, -) - from cudf._lib.column cimport Column +import pylibcudf as plc + @acquire_spill_lock() def is_float(Column source_strings): @@ -20,12 +13,7 @@ def is_float(Column source_strings): Returns a Column of boolean values with True for `source_strings` that have floats. 
""" - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_is_float( - source_view - )) - - return Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.convert.convert_floats.is_float( + source_strings.to_pylibcudf(mode="read") + ) + return Column.from_pylibcudf(plc_column) diff --git a/python/cudf/cudf/_lib/strings/convert/convert_integers.pyx b/python/cudf/cudf/_lib/strings/convert/convert_integers.pyx index 8b6da2bfa1c..50113347ccb 100644 --- a/python/cudf/cudf/_lib/strings/convert/convert_integers.pyx +++ b/python/cudf/cudf/_lib/strings/convert/convert_integers.pyx @@ -1,15 +1,8 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION. -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - from cudf.core.buffer import acquire_spill_lock -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.strings.convert.convert_integers cimport ( - is_integer as cpp_is_integer, -) +import pylibcudf as plc from cudf._lib.column cimport Column @@ -20,12 +13,8 @@ def is_integer(Column source_strings): Returns a Column of boolean values with True for `source_strings` that have integers. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_is_integer( - source_view - )) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf( + plc.strings.convert.convert_integers.is_integer( + source_strings.to_pylibcudf(mode="read") + ) + ) diff --git a/python/cudf/cudf/_lib/strings/convert/convert_lists.pyx b/python/cudf/cudf/_lib/strings/convert/convert_lists.pyx index 73aebf8ab35..3a2cb4bd5c7 100644 --- a/python/cudf/cudf/_lib/strings/convert/convert_lists.pyx +++ b/python/cudf/cudf/_lib/strings/convert/convert_lists.pyx @@ -1,23 +1,13 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION. 
-from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move +import pylibcudf as plc from cudf.core.buffer import acquire_spill_lock -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.scalar.scalar cimport string_scalar -from pylibcudf.libcudf.strings.convert.convert_lists cimport ( - format_list_column as cpp_format_list_column, -) - from cudf._lib.column cimport Column from cudf._lib.scalar import as_device_scalar -from cudf._lib.scalar cimport DeviceScalar - @acquire_spill_lock() def format_list_column(Column source_list, Column separators): @@ -34,19 +24,9 @@ def format_list_column(Column source_list, Column separators): ------- Formatted strings column """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_list.view() - cdef column_view separators_view = separators.view() - # Use 'None' as null-replacement string - cdef DeviceScalar str_na_rep = as_device_scalar("None") - cdef const string_scalar* string_scalar_na_rep = ( - str_na_rep.get_raw_ptr()) - - with nogil: - c_result = move(cpp_format_list_column( - source_view, string_scalar_na_rep[0], separators_view - )) - - return Column.from_unique_ptr( - move(c_result) + plc_column = plc.strings.convert.convert_lists.format_list_column( + source_list.to_pylibcudf(mode="read"), + as_device_scalar("None").c_value, + separators.to_pylibcudf(mode="read"), ) + return Column.from_pylibcudf(plc_column) diff --git a/python/cudf/cudf/_lib/strings/convert/convert_urls.pyx b/python/cudf/cudf/_lib/strings/convert/convert_urls.pyx index e52116d6247..d5c2f771970 100644 --- a/python/cudf/cudf/_lib/strings/convert/convert_urls.pyx +++ b/python/cudf/cudf/_lib/strings/convert/convert_urls.pyx @@ -1,17 +1,9 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move +import pylibcudf as plc from cudf.core.buffer import acquire_spill_lock -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.strings.convert.convert_urls cimport ( - url_decode as cpp_url_decode, - url_encode as cpp_url_encode, -) - from cudf._lib.column cimport Column @@ -28,17 +20,10 @@ def url_decode(Column source_strings): ------- URL decoded string column """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_url_decode( - source_view - )) - - return Column.from_unique_ptr( - move(c_result) + plc_column = plc.strings.convert.convert_urls.url_decode( + source_strings.to_pylibcudf(mode="read") ) + return Column.from_pylibcudf(plc_column) @acquire_spill_lock() @@ -57,14 +42,7 @@ def url_encode(Column source_strings): ------- URL encoded string column """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_url_encode( - source_view - )) - - return Column.from_unique_ptr( - move(c_result) + plc_column = plc.strings.convert.convert_urls.url_encode( + source_strings.to_pylibcudf(mode="read") ) + return Column.from_pylibcudf(plc_column) diff --git a/python/cudf/cudf/_lib/strings/find_multiple.pyx b/python/cudf/cudf/_lib/strings/find_multiple.pyx index 1358f8e3c2c..39e0013769f 100644 --- a/python/cudf/cudf/_lib/strings/find_multiple.pyx +++ b/python/cudf/cudf/_lib/strings/find_multiple.pyx @@ -1,18 +1,11 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. 
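The url_decode/url_encode wrappers above are now symmetric one-liners over pylibcudf, so encoding and then decoding should round-trip. A sketch, illustrative only, under the same pyarrow interop assumption:

    # Illustrative sketch, not part of this diff.
    import pyarrow as pa
    import pylibcudf as plc

    urls = plc.interop.from_arrow(pa.array(["https://rapids.ai/?q=cudf strings"]))
    encoded = plc.strings.convert.convert_urls.url_encode(urls)
    decoded = plc.strings.convert.convert_urls.url_decode(encoded)
    print(plc.interop.to_arrow(decoded))  # round-trips to the original input
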
-from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - from cudf.core.buffer import acquire_spill_lock -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.strings.find_multiple cimport ( - find_multiple as cpp_find_multiple, -) - from cudf._lib.column cimport Column +import pylibcudf as plc + @acquire_spill_lock() def find_multiple(Column source_strings, Column target_strings): @@ -20,14 +13,8 @@ def find_multiple(Column source_strings, Column target_strings): Returns a column with character position values where each of the `target_strings` are found in each string of `source_strings`. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - cdef column_view target_view = target_strings.view() - - with nogil: - c_result = move(cpp_find_multiple( - source_view, - target_view - )) - - return Column.from_unique_ptr(move(c_result)) + plc_result = plc.strings.find_multiple.find_multiple( + source_strings.to_pylibcudf(mode="read"), + target_strings.to_pylibcudf(mode="read") + ) + return Column.from_pylibcudf(plc_result) diff --git a/python/cudf/cudf/_lib/strings/findall.pyx b/python/cudf/cudf/_lib/strings/findall.pyx index 0e758d5b322..3e7a504d535 100644 --- a/python/cudf/cudf/_lib/strings/findall.pyx +++ b/python/cudf/cudf/_lib/strings/findall.pyx @@ -23,3 +23,19 @@ def findall(Column source_strings, object pattern, uint32_t flags): prog, ) return Column.from_pylibcudf(plc_result) + + +@acquire_spill_lock() +def find_re(Column source_strings, object pattern, uint32_t flags): + """ + Returns character positions where the pattern first matches + the elements in source_strings. + """ + prog = plc.strings.regex_program.RegexProgram.create( + str(pattern), flags + ) + plc_result = plc.strings.findall.find_re( + source_strings.to_pylibcudf(mode="read"), + prog, + ) + return Column.from_pylibcudf(plc_result) diff --git a/python/cudf/cudf/_lib/strings/json.pyx b/python/cudf/cudf/_lib/strings/json.pyx index c9b0bba088d..374a104635a 100644 --- a/python/cudf/cudf/_lib/strings/json.pyx +++ b/python/cudf/cudf/_lib/strings/json.pyx @@ -1,84 +1,26 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION. -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move +import pylibcudf as plc +from pylibcudf.json cimport GetJsonObjectOptions from cudf.core.buffer import acquire_spill_lock -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.scalar.scalar cimport string_scalar -from pylibcudf.libcudf.strings.json cimport ( - get_json_object as cpp_get_json_object, - get_json_object_options, -) - from cudf._lib.column cimport Column -from cudf._lib.scalar cimport DeviceScalar @acquire_spill_lock() def get_json_object( - Column col, object py_json_path, GetJsonObjectOptions options): + Column col, + object py_json_path, + GetJsonObjectOptions options +): """ Apply a JSONPath string to all rows in an input column of json strings. 
""" - cdef unique_ptr[column] c_result - - cdef column_view col_view = col.view() - cdef DeviceScalar json_path = py_json_path.device_value - - cdef const string_scalar* scalar_json_path = ( - json_path.get_raw_ptr() + plc_column = plc.json.get_json_object( + col.to_pylibcudf(mode="read"), + py_json_path.device_value.c_value, + options ) - - with nogil: - c_result = move(cpp_get_json_object( - col_view, - scalar_json_path[0], - options.options, - )) - - return Column.from_unique_ptr(move(c_result)) - - -cdef class GetJsonObjectOptions: - cdef get_json_object_options options - - def __init__( - self, - *, - allow_single_quotes=False, - strip_quotes_from_single_strings=True, - missing_fields_as_nulls=False - ): - self.options.set_allow_single_quotes(allow_single_quotes) - self.options.set_strip_quotes_from_single_strings( - strip_quotes_from_single_strings - ) - self.options.set_missing_fields_as_nulls(missing_fields_as_nulls) - - @property - def allow_single_quotes(self): - return self.options.get_allow_single_quotes() - - @property - def strip_quotes_from_single_strings(self): - return self.options.get_strip_quotes_from_single_strings() - - @property - def missing_fields_as_nulls(self): - return self.options.get_missing_fields_as_nulls() - - @allow_single_quotes.setter - def allow_single_quotes(self, val): - self.options.set_allow_single_quotes(val) - - @strip_quotes_from_single_strings.setter - def strip_quotes_from_single_strings(self, val): - self.options.set_strip_quotes_from_single_strings(val) - - @missing_fields_as_nulls.setter - def missing_fields_as_nulls(self, val): - self.options.set_missing_fields_as_nulls(val) + return Column.from_pylibcudf(plc_column) diff --git a/python/cudf/cudf/_lib/strings/padding.pyx b/python/cudf/cudf/_lib/strings/padding.pyx index d0239e91ec3..015a2ebab8a 100644 --- a/python/cudf/cudf/_lib/strings/padding.pyx +++ b/python/cudf/cudf/_lib/strings/padding.pyx @@ -1,64 +1,31 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from libcpp.memory cimport unique_ptr -from libcpp.string cimport string -from libcpp.utility cimport move - from cudf.core.buffer import acquire_spill_lock -from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.types cimport size_type from cudf._lib.column cimport Column -from enum import IntEnum - -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.strings.padding cimport ( - pad as cpp_pad, - zfill as cpp_zfill, -) -from pylibcudf.libcudf.strings.side_type cimport ( - side_type, - underlying_type_t_side_type, -) - - -class SideType(IntEnum): - LEFT = side_type.LEFT - RIGHT = side_type.RIGHT - BOTH = side_type.BOTH +import pylibcudf as plc @acquire_spill_lock() def pad(Column source_strings, size_type width, fill_char, - side=SideType.LEFT): + side=plc.strings.side_type.SideType.LEFT): """ Returns a Column by padding strings in `source_strings` up to the given `width`. Direction of padding is to be specified by `side`. The additional characters being filled can be changed by specifying `fill_char`. 
""" - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - cdef string f_char = str(fill_char).encode() - - cdef side_type pad_direction = ( - side + plc_result = plc.strings.padding.pad( + source_strings.to_pylibcudf(mode="read"), + width, + side, + fill_char, ) - - with nogil: - c_result = move(cpp_pad( - source_view, - width, - pad_direction, - f_char - )) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf(plc_result) @acquire_spill_lock() @@ -68,19 +35,13 @@ def zfill(Column source_strings, Returns a Column by prepending strings in `source_strings` with '0' characters up to the given `width`. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_zfill( - source_view, - width - )) - - return Column.from_unique_ptr(move(c_result)) + plc_result = plc.strings.padding.zfill( + source_strings.to_pylibcudf(mode="read"), + width + ) + return Column.from_pylibcudf(plc_result) -@acquire_spill_lock() def center(Column source_strings, size_type width, fill_char): @@ -89,23 +50,9 @@ def center(Column source_strings, in `source_strings` with additional character, `fill_char` up to the given `width`. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - cdef string f_char = str(fill_char).encode() - - with nogil: - c_result = move(cpp_pad( - source_view, - width, - side_type.BOTH, - f_char - )) + return pad(source_strings, width, fill_char, plc.strings.side_type.SideType.BOTH) - return Column.from_unique_ptr(move(c_result)) - -@acquire_spill_lock() def ljust(Column source_strings, size_type width, fill_char): @@ -113,23 +60,9 @@ def ljust(Column source_strings, Returns a Column by filling right side of strings in `source_strings` with additional character, `fill_char` up to the given `width`. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - cdef string f_char = str(fill_char).encode() + return pad(source_strings, width, fill_char, plc.strings.side_type.SideType.RIGHT) - with nogil: - c_result = move(cpp_pad( - source_view, - width, - side_type.RIGHT, - f_char - )) - return Column.from_unique_ptr(move(c_result)) - - -@acquire_spill_lock() def rjust(Column source_strings, size_type width, fill_char): @@ -137,17 +70,4 @@ def rjust(Column source_strings, Returns a Column by filling left side of strings in `source_strings` with additional character, `fill_char` up to the given `width`. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - cdef string f_char = str(fill_char).encode() - - with nogil: - c_result = move(cpp_pad( - source_view, - width, - side_type.LEFT, - f_char - )) - - return Column.from_unique_ptr(move(c_result)) + return pad(source_strings, width, fill_char, plc.strings.side_type.SideType.LEFT) diff --git a/python/cudf/cudf/_lib/strings/replace_re.pyx b/python/cudf/cudf/_lib/strings/replace_re.pyx index fffc8b7c3f6..462d5c903e8 100644 --- a/python/cudf/cudf/_lib/strings/replace_re.pyx +++ b/python/cudf/cudf/_lib/strings/replace_re.pyx @@ -1,26 +1,11 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. 
-from cython.operator cimport dereference
-from libcpp.memory cimport unique_ptr
-from libcpp.string cimport string
-from libcpp.utility cimport move
-from libcpp.vector cimport vector
+from pylibcudf.libcudf.types cimport size_type
+import pylibcudf as plc

 from cudf.core.buffer import acquire_spill_lock

-from pylibcudf.libcudf.column.column cimport column
-from pylibcudf.libcudf.column.column_view cimport column_view
-from pylibcudf.libcudf.scalar.scalar cimport string_scalar
-from pylibcudf.libcudf.strings.regex_flags cimport regex_flags
-from pylibcudf.libcudf.strings.regex_program cimport regex_program
-from pylibcudf.libcudf.strings.replace_re cimport (
-    replace_re as cpp_replace_re,
-    replace_with_backrefs as cpp_replace_with_backrefs,
-)
-from pylibcudf.libcudf.types cimport size_type
-
 from cudf._lib.column cimport Column
-from cudf._lib.scalar cimport DeviceScalar


 @acquire_spill_lock()
@@ -34,28 +19,16 @@ def replace_re(Column source_strings,
     `n` indicates the number of replacements to be made from start.
     (-1 indicates all)
     """
-
-    cdef DeviceScalar repl = py_repl.device_value
-
-    cdef unique_ptr[column] c_result
-    cdef column_view source_view = source_strings.view()
-
-    cdef string pattern_string = str(pattern).encode()
-    cdef const string_scalar* scalar_repl = \
-        <const string_scalar*>(repl.get_raw_ptr())
-    cdef regex_flags c_flags = regex_flags.DEFAULT
-    cdef unique_ptr[regex_program] c_prog
-
-    with nogil:
-        c_prog = move(regex_program.create(pattern_string, c_flags))
-        c_result = move(cpp_replace_re(
-            source_view,
-            dereference(c_prog),
-            scalar_repl[0],
-            n
-        ))
-
-    return Column.from_unique_ptr(move(c_result))
+    plc_column = plc.strings.replace_re.replace_re(
+        source_strings.to_pylibcudf(mode="read"),
+        plc.strings.regex_program.RegexProgram.create(
+            str(pattern),
+            plc.strings.regex_flags.RegexFlags.DEFAULT
+        ),
+        py_repl.device_value.c_value,
+        n
+    )
+    return Column.from_pylibcudf(plc_column)


 @acquire_spill_lock()
@@ -68,50 +41,29 @@ def replace_with_backrefs(
     new string with the extracted elements found using
     `pattern` regular expression in `source_strings`.
     """
-    cdef unique_ptr[column] c_result
-    cdef column_view source_view = source_strings.view()
-
-    cdef string pattern_string = str(pattern).encode()
-    cdef string repl_string = str(repl).encode()
-    cdef regex_flags c_flags = regex_flags.DEFAULT
-    cdef unique_ptr[regex_program] c_prog
-
-    with nogil:
-        c_prog = move(regex_program.create(pattern_string, c_flags))
-        c_result = move(cpp_replace_with_backrefs(
-            source_view,
-            dereference(c_prog),
-            repl_string
-        ))
-
-    return Column.from_unique_ptr(move(c_result))
+    plc_column = plc.strings.replace_re.replace_with_backrefs(
+        source_strings.to_pylibcudf(mode="read"),
+        plc.strings.regex_program.RegexProgram.create(
+            str(pattern),
+            plc.strings.regex_flags.RegexFlags.DEFAULT
+        ),
+        repl
+    )
+    return Column.from_pylibcudf(plc_column)


 @acquire_spill_lock()
 def replace_multi_re(Column source_strings,
-                     object patterns,
+                     list patterns,
                      Column repl_strings):
     """
     Returns a Column after replacing occurrences of multiple
     regular expressions `patterns` with their corresponding
     strings in `repl_strings` in `source_strings`.
""" - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - cdef column_view repl_view = repl_strings.view() - - cdef int pattern_size = len(patterns) - cdef vector[string] patterns_vector - patterns_vector.reserve(pattern_size) - - for pattern in patterns: - patterns_vector.push_back(str.encode(pattern)) - - with nogil: - c_result = move(cpp_replace_re( - source_view, - patterns_vector, - repl_view - )) - - return Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.replace_re.replace_re( + source_strings.to_pylibcudf(mode="read"), + patterns, + repl_strings.to_pylibcudf(mode="read") + ) + return Column.from_pylibcudf(plc_column) diff --git a/python/cudf/cudf/_lib/strings/split/partition.pyx b/python/cudf/cudf/_lib/strings/split/partition.pyx index a81fb18e752..5319addc41c 100644 --- a/python/cudf/cudf/_lib/strings/split/partition.pyx +++ b/python/cudf/cudf/_lib/strings/split/partition.pyx @@ -1,21 +1,10 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - from cudf.core.buffer import acquire_spill_lock -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.scalar.scalar cimport string_scalar -from pylibcudf.libcudf.strings.split.partition cimport ( - partition as cpp_partition, - rpartition as cpp_rpartition, -) -from pylibcudf.libcudf.table.table cimport table - from cudf._lib.column cimport Column -from cudf._lib.scalar cimport DeviceScalar -from cudf._lib.utils cimport data_from_unique_ptr + +import pylibcudf as plc @acquire_spill_lock() @@ -25,25 +14,11 @@ def partition(Column source_strings, Returns data by splitting the `source_strings` column at the first occurrence of the specified `py_delimiter`. """ - - cdef DeviceScalar delimiter = py_delimiter.device_value - - cdef unique_ptr[table] c_result - cdef column_view source_view = source_strings.view() - cdef const string_scalar* scalar_str = ( - delimiter.get_raw_ptr() - ) - - with nogil: - c_result = move(cpp_partition( - source_view, - scalar_str[0] - )) - - return data_from_unique_ptr( - move(c_result), - column_names=range(0, c_result.get()[0].num_columns()) + plc_table = plc.strings.split.partition.partition( + source_strings.to_pylibcudf(mode="read"), + py_delimiter.device_value.c_value ) + return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns())) @acquire_spill_lock() @@ -53,22 +28,8 @@ def rpartition(Column source_strings, Returns a Column by splitting the `source_strings` column at the last occurrence of the specified `py_delimiter`. 
""" - - cdef DeviceScalar delimiter = py_delimiter.device_value - - cdef unique_ptr[table] c_result - cdef column_view source_view = source_strings.view() - cdef const string_scalar* scalar_str = ( - delimiter.get_raw_ptr() - ) - - with nogil: - c_result = move(cpp_rpartition( - source_view, - scalar_str[0] - )) - - return data_from_unique_ptr( - move(c_result), - column_names=range(0, c_result.get()[0].num_columns()) + plc_table = plc.strings.split.partition.rpartition( + source_strings.to_pylibcudf(mode="read"), + py_delimiter.device_value.c_value ) + return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns())) diff --git a/python/cudf/cudf/_lib/strings/split/split.pyx b/python/cudf/cudf/_lib/strings/split/split.pyx index f481fea4c51..4ec6c7073d8 100644 --- a/python/cudf/cudf/_lib/strings/split/split.pyx +++ b/python/cudf/cudf/_lib/strings/split/split.pyx @@ -1,33 +1,12 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. -from cython.operator cimport dereference -from libcpp.memory cimport unique_ptr -from libcpp.string cimport string -from libcpp.utility cimport move - from cudf.core.buffer import acquire_spill_lock -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.scalar.scalar cimport string_scalar -from pylibcudf.libcudf.strings.regex_flags cimport regex_flags -from pylibcudf.libcudf.strings.regex_program cimport regex_program -from pylibcudf.libcudf.strings.split.split cimport ( - rsplit as cpp_rsplit, - rsplit_re as cpp_rsplit_re, - rsplit_record as cpp_rsplit_record, - rsplit_record_re as cpp_rsplit_record_re, - split as cpp_split, - split_re as cpp_split_re, - split_record as cpp_split_record, - split_record_re as cpp_split_record_re, -) -from pylibcudf.libcudf.table.table cimport table from pylibcudf.libcudf.types cimport size_type from cudf._lib.column cimport Column -from cudf._lib.scalar cimport DeviceScalar -from cudf._lib.utils cimport data_from_unique_ptr + +import pylibcudf as plc @acquire_spill_lock() @@ -39,26 +18,12 @@ def split(Column source_strings, column around the specified `py_delimiter`. The split happens from beginning. """ - - cdef DeviceScalar delimiter = py_delimiter.device_value - - cdef unique_ptr[table] c_result - cdef column_view source_view = source_strings.view() - cdef const string_scalar* scalar_str = ( - delimiter.get_raw_ptr() - ) - - with nogil: - c_result = move(cpp_split( - source_view, - scalar_str[0], - maxsplit - )) - - return data_from_unique_ptr( - move(c_result), - column_names=range(0, c_result.get()[0].num_columns()) + plc_table = plc.strings.split.split.split( + source_strings.to_pylibcudf(mode="read"), + py_delimiter.device_value.c_value, + maxsplit, ) + return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns())) @acquire_spill_lock() @@ -70,25 +35,12 @@ def split_record(Column source_strings, column around the specified `py_delimiter`. The split happens from beginning. 
""" - - cdef DeviceScalar delimiter = py_delimiter.device_value - - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - cdef const string_scalar* scalar_str = ( - delimiter.get_raw_ptr() - ) - - with nogil: - c_result = move(cpp_split_record( - source_view, - scalar_str[0], - maxsplit - )) - - return Column.from_unique_ptr( - move(c_result), + plc_column = plc.strings.split.split.split_record( + source_strings.to_pylibcudf(mode="read"), + py_delimiter.device_value.c_value, + maxsplit, ) + return Column.from_pylibcudf(plc_column) @acquire_spill_lock() @@ -100,26 +52,12 @@ def rsplit(Column source_strings, column around the specified `py_delimiter`. The split happens from the end. """ - - cdef DeviceScalar delimiter = py_delimiter.device_value - - cdef unique_ptr[table] c_result - cdef column_view source_view = source_strings.view() - cdef const string_scalar* scalar_str = ( - delimiter.get_raw_ptr() - ) - - with nogil: - c_result = move(cpp_rsplit( - source_view, - scalar_str[0], - maxsplit - )) - - return data_from_unique_ptr( - move(c_result), - column_names=range(0, c_result.get()[0].num_columns()) + plc_table = plc.strings.split.split.rsplit( + source_strings.to_pylibcudf(mode="read"), + py_delimiter.device_value.c_value, + maxsplit, ) + return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns())) @acquire_spill_lock() @@ -131,25 +69,12 @@ def rsplit_record(Column source_strings, column around the specified `py_delimiter`. The split happens from the end. """ - - cdef DeviceScalar delimiter = py_delimiter.device_value - - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - cdef const string_scalar* scalar_str = ( - delimiter.get_raw_ptr() - ) - - with nogil: - c_result = move(cpp_rsplit_record( - source_view, - scalar_str[0], - maxsplit - )) - - return Column.from_unique_ptr( - move(c_result), + plc_column = plc.strings.split.split.rsplit_record( + source_strings.to_pylibcudf(mode="read"), + py_delimiter.device_value.c_value, + maxsplit, ) + return Column.from_pylibcudf(plc_column) @acquire_spill_lock() @@ -160,24 +85,15 @@ def split_re(Column source_strings, Returns data by splitting the `source_strings` column around the delimiters identified by `pattern`. """ - cdef unique_ptr[table] c_result - cdef column_view source_view = source_strings.view() - cdef string pattern_string = str(pattern).encode() - cdef regex_flags c_flags = regex_flags.DEFAULT - cdef unique_ptr[regex_program] c_prog - - with nogil: - c_prog = move(regex_program.create(pattern_string, c_flags)) - c_result = move(cpp_split_re( - source_view, - dereference(c_prog), - maxsplit - )) - - return data_from_unique_ptr( - move(c_result), - column_names=range(0, c_result.get()[0].num_columns()) + plc_table = plc.strings.split.split.split_re( + source_strings.to_pylibcudf(mode="read"), + plc.strings.regex_program.RegexProgram.create( + str(pattern), + plc.strings.regex_flags.RegexFlags.DEFAULT, + ), + maxsplit, ) + return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns())) @acquire_spill_lock() @@ -189,24 +105,15 @@ def rsplit_re(Column source_strings, column around the delimiters identified by `pattern`. The delimiters are searched starting from the end of each string. 
""" - cdef unique_ptr[table] c_result - cdef column_view source_view = source_strings.view() - cdef string pattern_string = str(pattern).encode() - cdef regex_flags c_flags = regex_flags.DEFAULT - cdef unique_ptr[regex_program] c_prog - - with nogil: - c_prog = move(regex_program.create(pattern_string, c_flags)) - c_result = move(cpp_rsplit_re( - source_view, - dereference(c_prog), - maxsplit - )) - - return data_from_unique_ptr( - move(c_result), - column_names=range(0, c_result.get()[0].num_columns()) + plc_table = plc.strings.split.split.rsplit_re( + source_strings.to_pylibcudf(mode="read"), + plc.strings.regex_program.RegexProgram.create( + str(pattern), + plc.strings.regex_flags.RegexFlags.DEFAULT, + ), + maxsplit, ) + return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns())) @acquire_spill_lock() @@ -217,23 +124,15 @@ def split_record_re(Column source_strings, Returns a Column by splitting the `source_strings` column around the delimiters identified by `pattern`. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - cdef string pattern_string = str(pattern).encode() - cdef regex_flags c_flags = regex_flags.DEFAULT - cdef unique_ptr[regex_program] c_prog - - with nogil: - c_prog = move(regex_program.create(pattern_string, c_flags)) - c_result = move(cpp_split_record_re( - source_view, - dereference(c_prog), - maxsplit - )) - - return Column.from_unique_ptr( - move(c_result), + plc_column = plc.strings.split.split.split_record_re( + source_strings.to_pylibcudf(mode="read"), + plc.strings.regex_program.RegexProgram.create( + str(pattern), + plc.strings.regex_flags.RegexFlags.DEFAULT, + ), + maxsplit, ) + return Column.from_pylibcudf(plc_column) @acquire_spill_lock() @@ -245,20 +144,12 @@ def rsplit_record_re(Column source_strings, column around the delimiters identified by `pattern`. The delimiters are searched starting from the end of each string. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - cdef string pattern_string = str(pattern).encode() - cdef regex_flags c_flags = regex_flags.DEFAULT - cdef unique_ptr[regex_program] c_prog - - with nogil: - c_prog = move(regex_program.create(pattern_string, c_flags)) - c_result = move(cpp_rsplit_record_re( - source_view, - dereference(c_prog), - maxsplit - )) - - return Column.from_unique_ptr( - move(c_result), + plc_column = plc.strings.split.split.rsplit_record_re( + source_strings.to_pylibcudf(mode="read"), + plc.strings.regex_program.RegexProgram.create( + str(pattern), + plc.strings.regex_flags.RegexFlags.DEFAULT, + ), + maxsplit, ) + return Column.from_pylibcudf(plc_column) diff --git a/python/cudf/cudf/_lib/strings/strip.pyx b/python/cudf/cudf/_lib/strings/strip.pyx index 38ecb21a94c..982c5a600e7 100644 --- a/python/cudf/cudf/_lib/strings/strip.pyx +++ b/python/cudf/cudf/_lib/strings/strip.pyx @@ -1,18 +1,8 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. 
-from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - from cudf.core.buffer import acquire_spill_lock -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.scalar.scalar cimport string_scalar -from pylibcudf.libcudf.strings.side_type cimport side_type -from pylibcudf.libcudf.strings.strip cimport strip as cpp_strip - from cudf._lib.column cimport Column -from cudf._lib.scalar cimport DeviceScalar import pylibcudf as plc @@ -24,15 +14,12 @@ def strip(Column source_strings, The set of characters to be stripped from the left and right sides can be specified by `py_repl`. """ - - cdef DeviceScalar repl = py_repl.device_value - return Column.from_pylibcudf( - plc.strings.strip.strip( - source_strings.to_pylibcudf(mode="read"), - plc.strings.SideType.BOTH, - repl.c_value - ) + plc_result = plc.strings.strip.strip( + source_strings.to_pylibcudf(mode="read"), + plc.strings.side_type.SideType.BOTH, + py_repl.device_value.c_value, ) + return Column.from_pylibcudf(plc_result) @acquire_spill_lock() @@ -43,24 +30,12 @@ def lstrip(Column source_strings, The set of characters to be stripped from the left side can be specified by `py_repl`. """ - - cdef DeviceScalar repl = py_repl.device_value - - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - cdef const string_scalar* scalar_str = ( - repl.get_raw_ptr() + plc_result = plc.strings.strip.strip( + source_strings.to_pylibcudf(mode="read"), + plc.strings.side_type.SideType.LEFT, + py_repl.device_value.c_value, ) - - with nogil: - c_result = move(cpp_strip( - source_view, - side_type.LEFT, - scalar_str[0] - )) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf(plc_result) @acquire_spill_lock() @@ -71,21 +46,9 @@ def rstrip(Column source_strings, The set of characters to be stripped from the right side can be specified by `py_repl`. """ - - cdef DeviceScalar repl = py_repl.device_value - - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - cdef const string_scalar* scalar_str = ( - repl.get_raw_ptr() + plc_result = plc.strings.strip.strip( + source_strings.to_pylibcudf(mode="read"), + plc.strings.side_type.SideType.RIGHT, + py_repl.device_value.c_value, ) - - with nogil: - c_result = move(cpp_strip( - source_view, - side_type.RIGHT, - scalar_str[0] - )) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf(plc_result) diff --git a/python/cudf/cudf/_lib/strings/translate.pyx b/python/cudf/cudf/_lib/strings/translate.pyx index 3fad91bbfc0..3ef478532c2 100644 --- a/python/cudf/cudf/_lib/strings/translate.pyx +++ b/python/cudf/cudf/_lib/strings/translate.pyx @@ -1,25 +1,12 @@ # Copyright (c) 2018-2024, NVIDIA CORPORATION.
from libcpp cimport bool -from libcpp.memory cimport unique_ptr -from libcpp.pair cimport pair -from libcpp.utility cimport move -from libcpp.vector cimport vector from cudf.core.buffer import acquire_spill_lock -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.scalar.scalar cimport string_scalar -from pylibcudf.libcudf.strings.translate cimport ( - filter_characters as cpp_filter_characters, - filter_type, - translate as cpp_translate, -) -from pylibcudf.libcudf.types cimport char_utf8 - from cudf._lib.column cimport Column -from cudf._lib.scalar cimport DeviceScalar + +import pylibcudf as plc @acquire_spill_lock() @@ -29,30 +16,11 @@ def translate(Column source_strings, Translates individual characters within each string if present in the mapping_table. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - cdef int table_size - table_size = len(mapping_table) - - cdef vector[pair[char_utf8, char_utf8]] c_mapping_table - c_mapping_table.reserve(table_size) - - for key in mapping_table: - value = mapping_table[key] - if type(value) is int: - value = chr(value) - if type(value) is str: - value = int.from_bytes(value.encode(), byteorder='big') - if type(key) is int: - key = chr(key) - if type(key) is str: - key = int.from_bytes(key.encode(), byteorder='big') - c_mapping_table.push_back((key, value)) - - with nogil: - c_result = move(cpp_translate(source_view, c_mapping_table)) - - return Column.from_unique_ptr(move(c_result)) + plc_result = plc.strings.translate.translate( + source_strings.to_pylibcudf(mode="read"), + mapping_table, + ) + return Column.from_pylibcudf(plc_result) @acquire_spill_lock() @@ -64,44 +32,11 @@ def filter_characters(Column source_strings, Removes or keeps individual characters within each string using the provided mapping_table. """ - - cdef DeviceScalar repl = py_repl.device_value - - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - cdef const string_scalar* scalar_repl = ( - repl.get_raw_ptr() + plc_result = plc.strings.translate.filter_characters( + source_strings.to_pylibcudf(mode="read"), + mapping_table, + plc.strings.translate.FilterType.KEEP + if keep else plc.strings.translate.FilterType.REMOVE, + py_repl.device_value.c_value ) - cdef int table_size - table_size = len(mapping_table) - - cdef vector[pair[char_utf8, char_utf8]] c_mapping_table - c_mapping_table.reserve(table_size) - - for key in mapping_table: - value = mapping_table[key] - if type(value) is int: - value = chr(value) - if type(value) is str: - value = int.from_bytes(value.encode(), byteorder='big') - if type(key) is int: - key = chr(key) - if type(key) is str: - key = int.from_bytes(key.encode(), byteorder='big') - c_mapping_table.push_back((key, value)) - - cdef filter_type c_keep - if keep is True: - c_keep = filter_type.KEEP - else: - c_keep = filter_type.REMOVE - - with nogil: - c_result = move(cpp_filter_characters( - source_view, - c_mapping_table, - c_keep, - scalar_repl[0] - )) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf(plc_result) diff --git a/python/cudf/cudf/_lib/strings/wrap.pyx b/python/cudf/cudf/_lib/strings/wrap.pyx index eed5cf33b10..2b40f01f818 100644 --- a/python/cudf/cudf/_lib/strings/wrap.pyx +++ b/python/cudf/cudf/_lib/strings/wrap.pyx @@ -1,17 +1,13 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. 
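Worth noting in the `translate.pyx` rewrite above: the Python loop that normalized keys and values into `char_utf8` pairs is gone entirely, because `plc.strings.translate` now accepts the mapping table as-is. A hedged sketch of the new call shapes (the inputs are illustrative; the mapping is the `str.maketrans`-style table cudf already builds on the Python side):

```python
import pyarrow as pa
import pylibcudf as plc

col = plc.interop.from_arrow(pa.array(["cafe", "data"]))

# Character translation: the code-point normalization happens inside pylibcudf.
translated = plc.strings.translate.translate(col, str.maketrans("ae", "43"))

# Filtering keeps or removes characters falling in the mapped ranges.
kept = plc.strings.translate.filter_characters(
    col,
    {ord("a"): ord("f")},  # keep characters in the range a..f
    plc.strings.translate.FilterType.KEEP,
    plc.interop.from_arrow(pa.scalar("")),  # replacement for filtered characters
)
```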
-from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - from cudf.core.buffer import acquire_spill_lock -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.strings.wrap cimport wrap as cpp_wrap from pylibcudf.libcudf.types cimport size_type from cudf._lib.column cimport Column +import pylibcudf as plc + @acquire_spill_lock() def wrap(Column source_strings, @@ -21,14 +17,8 @@ def wrap(Column source_strings, in the Column to be formatted in paragraphs with length less than a given `width`. """ - - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_wrap( - source_view, - width - )) - - return Column.from_unique_ptr(move(c_result)) + plc_result = plc.strings.wrap.wrap( + source_strings.to_pylibcudf(mode="read"), + width + ) + return Column.from_pylibcudf(plc_result) diff --git a/python/cudf/cudf/_lib/strings_udf.pyx b/python/cudf/cudf/_lib/strings_udf.pyx index 78fc9f08bd8..dd2fafbe07f 100644 --- a/python/cudf/cudf/_lib/strings_udf.pyx +++ b/python/cudf/cudf/_lib/strings_udf.pyx @@ -23,7 +23,8 @@ from pylibcudf.libcudf.strings_udf cimport ( to_string_view_array as cpp_to_string_view_array, udf_string, ) -from rmm._lib.device_buffer cimport DeviceBuffer, device_buffer +from rmm.librmm.device_buffer cimport device_buffer +from rmm.pylibrmm.device_buffer cimport DeviceBuffer from cudf._lib.column cimport Column diff --git a/python/cudf/cudf/_lib/text.pyx b/python/cudf/cudf/_lib/text.pyx index b2c7232f549..7942d067c2b 100644 --- a/python/cudf/cudf/_lib/text.pyx +++ b/python/cudf/cudf/_lib/text.pyx @@ -1,33 +1,20 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. -from io import TextIOBase +from libcpp cimport bool -from cython.operator cimport dereference -from libc.stdint cimport uint64_t -from libcpp.memory cimport unique_ptr -from libcpp.string cimport string -from libcpp.utility cimport move +from io import TextIOBase -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.io.text cimport ( - byte_range_info, - data_chunk_source, - make_source, - make_source_from_bgzip_file, - make_source_from_file, - multibyte_split, - parse_options, -) +import pylibcudf as plc from cudf._lib.column cimport Column def read_text(object filepaths_or_buffers, - object delimiter=None, - object byte_range=None, - object strip_delimiters=False, - object compression=None, - object compression_offsets=None): + str delimiter, + object byte_range, + bool strip_delimiters, + object compression, + object compression_offsets): """ Cython function to call into libcudf API, see `multibyte_split`. 
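The `text.pyx` changes that continue below move all of the data-chunk-source plumbing into `plc.io.text`. A minimal sketch of the resulting API surface, matching the calls in the rewritten body (the file name is a hypothetical placeholder; `byte_range=None` scans the whole file):

```python
import pylibcudf as plc

# Hypothetical newline-delimited input file.
source = plc.io.text.make_source_from_file("records.txt")

options = plc.io.text.ParseOptions(byte_range=None, strip_delimiters=True)

# One output row per delimiter-separated chunk.
col = plc.io.text.multibyte_split(source, "\n", options)
```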
@@ -35,24 +22,11 @@ def read_text(object filepaths_or_buffers, -------- cudf.io.text.read_text """ - cdef string delim = delimiter.encode() - - cdef unique_ptr[data_chunk_source] datasource - cdef unique_ptr[column] c_col - - cdef size_t c_byte_range_offset - cdef size_t c_byte_range_size - cdef uint64_t c_compression_begin_offset - cdef uint64_t c_compression_end_offset - cdef parse_options c_options - if compression is None: if isinstance(filepaths_or_buffers, TextIOBase): - datasource = move(make_source( - filepaths_or_buffers.read().encode())) + datasource = plc.io.text.make_source(filepaths_or_buffers.read()) else: - datasource = move(make_source_from_file( - filepaths_or_buffers.encode())) + datasource = plc.io.text.make_source_from_file(filepaths_or_buffers) elif compression == "bgzip": if isinstance(filepaths_or_buffers, TextIOBase): raise ValueError("bgzip compression requires a file path") @@ -60,30 +34,20 @@ def read_text(object filepaths_or_buffers, if len(compression_offsets) != 2: raise ValueError( "compression offsets need to consist of two elements") - c_compression_begin_offset = compression_offsets[0] - c_compression_end_offset = compression_offsets[1] - datasource = move(make_source_from_bgzip_file( - filepaths_or_buffers.encode(), - c_compression_begin_offset, - c_compression_end_offset)) + datasource = plc.io.text.make_source_from_bgzip_file( + filepaths_or_buffers, + compression_offsets[0], + compression_offsets[1] + ) else: - datasource = move(make_source_from_bgzip_file( - filepaths_or_buffers.encode())) + datasource = plc.io.text.make_source_from_bgzip_file( + filepaths_or_buffers, + ) else: raise ValueError("Only bgzip compression is supported at the moment") - c_options = parse_options() - if byte_range is not None: - c_byte_range_offset = byte_range[0] - c_byte_range_size = byte_range[1] - c_options.byte_range = byte_range_info( - c_byte_range_offset, - c_byte_range_size) - c_options.strip_delimiters = strip_delimiters - with nogil: - c_col = move(multibyte_split( - dereference(datasource), - delim, - c_options)) - - return Column.from_unique_ptr(move(c_col)) + options = plc.io.text.ParseOptions( + byte_range=byte_range, strip_delimiters=strip_delimiters + ) + plc_column = plc.io.text.multibyte_split(datasource, delimiter, options) + return Column.from_pylibcudf(plc_column) diff --git a/python/cudf/cudf/_lib/transform.pyx b/python/cudf/cudf/_lib/transform.pyx index 40d0c9eac3a..a163bb07888 100644 --- a/python/cudf/cudf/_lib/transform.pyx +++ b/python/cudf/cudf/_lib/transform.pyx @@ -3,24 +3,13 @@ from numba.np import numpy_support import cudf -from cudf.core._internals.expressions import parse_expression from cudf.core.buffer import acquire_spill_lock, as_buffer from cudf.utils import cudautils -from cython.operator cimport dereference -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - -cimport pylibcudf.libcudf.transform as libcudf_transform from pylibcudf cimport transform as plc_transform -from pylibcudf.expressions cimport Expression -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.expressions cimport expression -from pylibcudf.libcudf.table.table_view cimport table_view from pylibcudf.libcudf.types cimport size_type from cudf._lib.column cimport Column -from cudf._lib.utils cimport table_view_from_columns import pylibcudf as plc @@ -102,7 +91,7 @@ def one_hot_encode(Column input_column, Column categories): @acquire_spill_lock() -def compute_column(list columns, tuple column_names, expr: str): +def 
compute_column(list columns, tuple column_names, str expr): """Compute a new column by evaluating an expression on a set of columns. Parameters @@ -117,17 +106,8 @@ def compute_column(list columns, tuple column_names, expr: str): expr : str The expression to evaluate. """ - visitor = parse_expression(expr, column_names) - - # At the end, all the stack contains is the expression to evaluate. - cdef Expression cudf_expr = visitor.expression - cdef table_view tbl = table_view_from_columns(columns) - cdef unique_ptr[column] col - with nogil: - col = move( - libcudf_transform.compute_column( - tbl, - dereference(cudf_expr.c_obj.get()) - ) - ) - return Column.from_unique_ptr(move(col)) + result = plc_transform.compute_column( + plc.Table([col.to_pylibcudf(mode="read") for col in columns]), + plc.expressions.to_expression(expr, column_names), + ) + return Column.from_pylibcudf(result) diff --git a/python/cudf/cudf/_lib/types.pxd b/python/cudf/cudf/_lib/types.pxd index 4fd3d31841e..c2b760490c1 100644 --- a/python/cudf/cudf/_lib/types.pxd +++ b/python/cudf/cudf/_lib/types.pxd @@ -7,12 +7,7 @@ cimport pylibcudf.libcudf.types as libcudf_types from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.lists.lists_column_view cimport lists_column_view -ctypedef bool underlying_type_t_order -ctypedef bool underlying_type_t_null_order -ctypedef bool underlying_type_t_sorted -ctypedef int32_t underlying_type_t_interpolation ctypedef int32_t underlying_type_t_type_id -ctypedef bool underlying_type_t_null_policy cdef dtype_from_column_view(column_view cv) diff --git a/python/cudf/cudf/_lib/types.pyx b/python/cudf/cudf/_lib/types.pyx index 861bb063707..f169ea12b10 100644 --- a/python/cudf/cudf/_lib/types.pyx +++ b/python/cudf/cudf/_lib/types.pyx @@ -11,12 +11,6 @@ cimport pylibcudf.libcudf.types as libcudf_types from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.lists.lists_column_view cimport lists_column_view -from cudf._lib.types cimport ( - underlying_type_t_interpolation, - underlying_type_t_order, - underlying_type_t_sorted, -) - import pylibcudf import cudf @@ -151,44 +145,6 @@ datetime_unit_map = { size_type_dtype = LIBCUDF_TO_SUPPORTED_NUMPY_TYPES[pylibcudf.types.SIZE_TYPE_ID] -class Interpolation(IntEnum): - LINEAR = ( - libcudf_types.interpolation.LINEAR - ) - LOWER = ( - libcudf_types.interpolation.LOWER - ) - HIGHER = ( - libcudf_types.interpolation.HIGHER - ) - MIDPOINT = ( - libcudf_types.interpolation.MIDPOINT - ) - NEAREST = ( - libcudf_types.interpolation.NEAREST - ) - - -class Order(IntEnum): - ASCENDING = libcudf_types.order.ASCENDING - DESCENDING = libcudf_types.order.DESCENDING - - -class Sorted(IntEnum): - YES = libcudf_types.sorted.YES - NO = libcudf_types.sorted.NO - - -class NullOrder(IntEnum): - BEFORE = libcudf_types.null_order.BEFORE - AFTER = libcudf_types.null_order.AFTER - - -class NullHandling(IntEnum): - INCLUDE = libcudf_types.null_policy.INCLUDE - EXCLUDE = libcudf_types.null_policy.EXCLUDE - - cdef dtype_from_lists_column_view(column_view cv): # lists_column_view have no default constructor, so we heap # allocate it to get around Cython's limitation of requiring diff --git a/python/cudf/cudf/_lib/unary.pyx b/python/cudf/cudf/_lib/unary.pyx deleted file mode 100644 index d5602fd5a1c..00000000000 --- a/python/cudf/cudf/_lib/unary.pyx +++ /dev/null @@ -1,60 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
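The `compute_column` rewrite above replaces the hand-rolled AST visitor (the `cudf/core/_internals/expressions.py` module deleted further down) with `plc.expressions.to_expression`, which parses the string once and yields an expression pylibcudf can evaluate directly. A sketch at the pylibcudf level (column contents are illustrative):

```python
import pyarrow as pa
import pylibcudf as plc

a = plc.interop.from_arrow(pa.array([1, 2, 3]))
b = plc.interop.from_arrow(pa.array([10, 20, 30]))

# Parse "a + b" against the column names, then evaluate it over the table.
expr = plc.expressions.to_expression("a + b", ("a", "b"))
result = plc.transform.compute_column(plc.Table([a, b]), expr)
```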
- -from cudf._lib.column cimport Column -from cudf._lib.types cimport dtype_to_pylibcudf_type - -import numpy as np - -import pylibcudf - -from cudf.api.types import is_decimal_dtype -from cudf.core.buffer import acquire_spill_lock - - -@acquire_spill_lock() -def unary_operation(Column input, object op): - return Column.from_pylibcudf( - pylibcudf.unary.unary_operation(input.to_pylibcudf(mode="read"), op) - ) - - -@acquire_spill_lock() -def is_null(Column input): - return Column.from_pylibcudf( - pylibcudf.unary.is_null(input.to_pylibcudf(mode="read")) - ) - - -@acquire_spill_lock() -def is_valid(Column input): - return Column.from_pylibcudf( - pylibcudf.unary.is_valid(input.to_pylibcudf(mode="read")) - ) - - -@acquire_spill_lock() -def cast(Column input, object dtype=np.float64): - result = Column.from_pylibcudf( - pylibcudf.unary.cast( - input.to_pylibcudf(mode="read"), - dtype_to_pylibcudf_type(dtype) - ) - ) - - if is_decimal_dtype(result.dtype): - result.dtype.precision = dtype.precision - return result - - -@acquire_spill_lock() -def is_nan(Column input): - return Column.from_pylibcudf( - pylibcudf.unary.is_nan(input.to_pylibcudf(mode="read")) - ) - - -@acquire_spill_lock() -def is_non_nan(Column input): - return Column.from_pylibcudf( - pylibcudf.unary.is_not_nan(input.to_pylibcudf(mode="read")) - ) diff --git a/python/cudf/cudf/_lib/utils.pxd b/python/cudf/cudf/_lib/utils.pxd index 7254db5c43d..6db3036d514 100644 --- a/python/cudf/cudf/_lib/utils.pxd +++ b/python/cudf/cudf/_lib/utils.pxd @@ -10,13 +10,13 @@ from pylibcudf.libcudf.table.table cimport table, table_view cdef data_from_unique_ptr( unique_ptr[table] c_tbl, column_names, index_names=*) -cdef data_from_pylibcudf_table(tbl, column_names, index_names=*) -cdef data_from_pylibcudf_io(tbl_with_meta, column_names = *, index_names = *) +cpdef data_from_pylibcudf_table(tbl, column_names, index_names=*) +cpdef data_from_pylibcudf_io(tbl_with_meta, column_names = *, index_names = *) cdef data_from_table_view( table_view tv, object owner, object column_names, object index_names=*) cdef table_view table_view_from_columns(columns) except * cdef table_view table_view_from_table(tbl, ignore_index=*) except* cdef columns_from_unique_ptr(unique_ptr[table] c_tbl) cdef columns_from_table_view(table_view tv, object owners) -cdef columns_from_pylibcudf_table(tbl) +cpdef columns_from_pylibcudf_table(tbl) cdef _data_from_columns(columns, column_names, index_names=*) diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx index 9e5b99f64eb..244d7fdc006 100644 --- a/python/cudf/cudf/_lib/utils.pyx +++ b/python/cudf/cudf/_lib/utils.pyx @@ -229,7 +229,7 @@ cdef columns_from_unique_ptr( return columns -cdef columns_from_pylibcudf_table(tbl): +cpdef columns_from_pylibcudf_table(tbl): """Convert a pylibcudf table into list of columns. 
Parameters @@ -309,14 +309,14 @@ cdef data_from_unique_ptr( ) -cdef data_from_pylibcudf_table(tbl, column_names, index_names=None): +cpdef data_from_pylibcudf_table(tbl, column_names, index_names=None): return _data_from_columns( columns_from_pylibcudf_table(tbl), column_names, index_names ) -cdef data_from_pylibcudf_io(tbl_with_meta, column_names=None, index_names=None): +cpdef data_from_pylibcudf_io(tbl_with_meta, column_names=None, index_names=None): """ Unpacks the TableWithMetadata from libcudf I/O into a dict of columns and an Index (cuDF format) diff --git a/python/cudf/cudf/_typing.py b/python/cudf/cudf/_typing.py index 6e8ad556b08..3b13cc258ab 100644 --- a/python/cudf/cudf/_typing.py +++ b/python/cudf/cudf/_typing.py @@ -1,8 +1,8 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION. import sys -from collections.abc import Callable -from typing import TYPE_CHECKING, Any, Dict, Iterable, TypeVar, Union +from collections.abc import Callable, Iterable +from typing import TYPE_CHECKING, Any, TypeVar, Union import numpy as np from pandas import Period, Timedelta, Timestamp @@ -42,7 +42,7 @@ SeriesOrSingleColumnIndex = Union["cudf.Series", "cudf.core.index.Index"] # Groupby aggregation -AggType = Union[str, Callable] -MultiColumnAggType = Union[ - AggType, Iterable[AggType], Dict[Any, Iterable[AggType]] +AggType = Union[str, Callable] # noqa: UP007 +MultiColumnAggType = Union[ # noqa: UP007 + AggType, Iterable[AggType], dict[Any, Iterable[AggType]] ] diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index a6abd63d042..950ce5f1236 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -2,7 +2,6 @@ from __future__ import annotations -import pickle import warnings from functools import cached_property from typing import TYPE_CHECKING, Any, Literal @@ -330,13 +329,6 @@ def get_level_values(self, level): else: raise KeyError(f"Requested level with name {level} " "not found") - @classmethod - def deserialize(cls, header, frames): - # Dispatch deserialization to the appropriate index type in case - # deserialization is ever attempted with the base class directly. - idx_type = pickle.loads(header["type-serialized"]) - return idx_type.deserialize(header, frames) - @property def names(self): """ diff --git a/python/cudf/cudf/core/_compat.py b/python/cudf/cudf/core/_compat.py index e2bdecbe67a..7dba0dc8a70 100644 --- a/python/cudf/cudf/core/_compat.py +++ b/python/cudf/cudf/core/_compat.py @@ -3,10 +3,11 @@ import pandas as pd from packaging import version -PANDAS_CURRENT_SUPPORTED_VERSION = version.parse("2.2.2") +PANDAS_CURRENT_SUPPORTED_VERSION = version.parse("2.2.3") PANDAS_VERSION = version.parse(pd.__version__) PANDAS_GE_210 = PANDAS_VERSION >= version.parse("2.1.0") +PANDAS_GT_214 = PANDAS_VERSION > version.parse("2.1.4") PANDAS_GE_220 = PANDAS_VERSION >= version.parse("2.2.0") PANDAS_LT_300 = PANDAS_VERSION < version.parse("3.0.0") diff --git a/python/cudf/cudf/core/_internals/expressions.py b/python/cudf/cudf/core/_internals/expressions.py deleted file mode 100644 index 90d9118027a..00000000000 --- a/python/cudf/cudf/core/_internals/expressions.py +++ /dev/null @@ -1,229 +0,0 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. 
-from __future__ import annotations - -import ast -import functools - -import pyarrow as pa - -import pylibcudf as plc -from pylibcudf.expressions import ( - ASTOperator, - ColumnReference, - Expression, - Literal, - Operation, -) - -# This dictionary encodes the mapping from Python AST operators to their cudf -# counterparts. -python_cudf_operator_map = { - # Binary operators - ast.Add: ASTOperator.ADD, - ast.Sub: ASTOperator.SUB, - ast.Mult: ASTOperator.MUL, - ast.Div: ASTOperator.DIV, - ast.FloorDiv: ASTOperator.FLOOR_DIV, - ast.Mod: ASTOperator.PYMOD, - ast.Pow: ASTOperator.POW, - ast.Eq: ASTOperator.EQUAL, - ast.NotEq: ASTOperator.NOT_EQUAL, - ast.Lt: ASTOperator.LESS, - ast.Gt: ASTOperator.GREATER, - ast.LtE: ASTOperator.LESS_EQUAL, - ast.GtE: ASTOperator.GREATER_EQUAL, - ast.BitXor: ASTOperator.BITWISE_XOR, - # TODO: The mapping of logical/bitwise operators here is inconsistent with - # pandas. In pandas, Both `BitAnd` and `And` map to - # `ASTOperator.LOGICAL_AND` for booleans, while they map to - # `ASTOperator.BITWISE_AND` for integers. However, there is no good way to - # encode this at present because expressions can be arbitrarily nested so - # we won't know the dtype of the input without inserting a much more - # complex traversal of the expression tree to determine the output types at - # each node. For now, we'll rely on users to use the appropriate operator. - ast.BitAnd: ASTOperator.BITWISE_AND, - ast.BitOr: ASTOperator.BITWISE_OR, - ast.And: ASTOperator.LOGICAL_AND, - ast.Or: ASTOperator.LOGICAL_OR, - # Unary operators - ast.Invert: ASTOperator.BIT_INVERT, - ast.Not: ASTOperator.NOT, - # TODO: Missing USub, possibility other unary ops? -} - - -# Mapping between Python function names encode in an ast.Call node and the -# corresponding libcudf C++ AST operators. -python_cudf_function_map = { - # TODO: Operators listed on - # https://pandas.pydata.org/pandas-docs/stable/user_guide/enhancingperf.html#expression-evaluation-via-eval # noqa: E501 - # that we don't support yet: - # expm1, log1p, arctan2 and log10. - "isnull": ASTOperator.IS_NULL, - "isna": ASTOperator.IS_NULL, - "sin": ASTOperator.SIN, - "cos": ASTOperator.COS, - "tan": ASTOperator.TAN, - "arcsin": ASTOperator.ARCSIN, - "arccos": ASTOperator.ARCCOS, - "arctan": ASTOperator.ARCTAN, - "sinh": ASTOperator.SINH, - "cosh": ASTOperator.COSH, - "tanh": ASTOperator.TANH, - "arcsinh": ASTOperator.ARCSINH, - "arccosh": ASTOperator.ARCCOSH, - "arctanh": ASTOperator.ARCTANH, - "exp": ASTOperator.EXP, - "log": ASTOperator.LOG, - "sqrt": ASTOperator.SQRT, - "abs": ASTOperator.ABS, - "ceil": ASTOperator.CEIL, - "floor": ASTOperator.FLOOR, - # TODO: Operators supported by libcudf with no Python function analog. - # ast.rint: ASTOperator.RINT, - # ast.cbrt: ASTOperator.CBRT, -} - - -class libcudfASTVisitor(ast.NodeVisitor): - """A NodeVisitor specialized for constructing a libcudf expression tree. - - This visitor is designed to handle AST nodes that have libcudf equivalents. - It constructs column references from names and literals from constants, - then builds up operations. The final result can be accessed using the - `expression` property. The visitor must be kept in scope for as long as the - expression is needed because all of the underlying libcudf expressions will - be destroyed when the libcudfASTVisitor is. - - Parameters - ---------- - col_names : Tuple[str] - The column names used to map the names in an expression. 
- """ - - def __init__(self, col_names: tuple[str]): - self.stack: list[Expression] = [] - self.nodes: list[Expression] = [] - self.col_names = col_names - - @property - def expression(self): - """Expression: The result of parsing an AST.""" - assert len(self.stack) == 1 - return self.stack[-1] - - def visit_Name(self, node): - try: - col_id = self.col_names.index(node.id) - except ValueError: - raise ValueError(f"Unknown column name {node.id}") - self.stack.append(ColumnReference(col_id)) - - def visit_Constant(self, node): - if not isinstance(node.value, (float, int, str, complex)): - raise ValueError( - f"Unsupported literal {repr(node.value)} of type " - "{type(node.value).__name__}" - ) - self.stack.append( - Literal(plc.interop.from_arrow(pa.scalar(node.value))) - ) - - def visit_UnaryOp(self, node): - self.visit(node.operand) - self.nodes.append(self.stack.pop()) - if isinstance(node.op, ast.USub): - # TODO: Except for leaf nodes, we won't know the type of the - # operand, so there's no way to know whether this should be a float - # or an int. We should maybe see what Spark does, and this will - # probably require casting. - self.nodes.append(Literal(plc.interop.from_arrow(pa.scalar(-1)))) - op = ASTOperator.MUL - self.stack.append(Operation(op, self.nodes[-1], self.nodes[-2])) - elif isinstance(node.op, ast.UAdd): - self.stack.append(self.nodes[-1]) - else: - op = python_cudf_operator_map[type(node.op)] - self.stack.append(Operation(op, self.nodes[-1])) - - def visit_BinOp(self, node): - self.visit(node.left) - self.visit(node.right) - self.nodes.append(self.stack.pop()) - self.nodes.append(self.stack.pop()) - - op = python_cudf_operator_map[type(node.op)] - self.stack.append(Operation(op, self.nodes[-1], self.nodes[-2])) - - def _visit_BoolOp_Compare(self, operators, operands, has_multiple_ops): - # Helper function handling the common components of parsing BoolOp and - # Compare AST nodes. These two types of nodes both support chaining - # (e.g. `a > b > c` is equivalent to `a > b and b > c`, so this - # function helps standardize that. - - # TODO: Whether And/Or and BitAnd/BitOr actually correspond to - # logical or bitwise operators depends on the data types that they - # are applied to. We'll need to add logic to map to that. - inner_ops = [] - for op, (left, right) in zip(operators, operands): - # Note that this will lead to duplicate nodes, e.g. if - # the comparison is `a < b < c` that will be encoded as - # `a < b and b < c`. We could potentially optimize by caching - # expressions by name so that we only construct them once. - self.visit(left) - self.visit(right) - - self.nodes.append(self.stack.pop()) - self.nodes.append(self.stack.pop()) - - op = python_cudf_operator_map[type(op)] - inner_ops.append(Operation(op, self.nodes[-1], self.nodes[-2])) - - self.nodes.extend(inner_ops) - - # If we have more than one comparator, we need to link them - # together with LOGICAL_AND operators. 
- if has_multiple_ops: - op = ASTOperator.LOGICAL_AND - - def _combine_compare_ops(left, right): - self.nodes.append(Operation(op, left, right)) - return self.nodes[-1] - - functools.reduce(_combine_compare_ops, inner_ops) - - self.stack.append(self.nodes[-1]) - - def visit_BoolOp(self, node): - operators = [node.op] * (len(node.values) - 1) - operands = zip(node.values[:-1], node.values[1:]) - self._visit_BoolOp_Compare(operators, operands, len(node.values) > 2) - - def visit_Compare(self, node): - operands = (node.left, *node.comparators) - has_multiple_ops = len(operands) > 2 - operands = zip(operands[:-1], operands[1:]) - self._visit_BoolOp_Compare(node.ops, operands, has_multiple_ops) - - def visit_Call(self, node): - try: - op = python_cudf_function_map[node.func.id] - except KeyError: - raise ValueError(f"Unsupported function {node.func}.") - # Assuming only unary functions are supported, which is checked above. - if len(node.args) != 1 or node.keywords: - raise ValueError( - f"Function {node.func} only accepts one positional " - "argument." - ) - self.visit(node.args[0]) - - self.nodes.append(self.stack.pop()) - self.stack.append(Operation(op, self.nodes[-1])) - - -@functools.lru_cache(256) -def parse_expression(expr: str, col_names: tuple[str]): - visitor = libcudfASTVisitor(col_names) - visitor.visit(ast.parse(expr)) - return visitor diff --git a/python/cudf/cudf/core/_internals/unary.py b/python/cudf/cudf/core/_internals/unary.py new file mode 100644 index 00000000000..3b8e3db60a7 --- /dev/null +++ b/python/cudf/cudf/core/_internals/unary.py @@ -0,0 +1,64 @@ +# Copyright (c) 2020-2024, NVIDIA CORPORATION. +from __future__ import annotations + +from typing import TYPE_CHECKING + +import pylibcudf as plc + +from cudf._lib.types import dtype_to_pylibcudf_type +from cudf.api.types import is_decimal_dtype +from cudf.core.buffer import acquire_spill_lock + +if TYPE_CHECKING: + from cudf._typing import Dtype + from cudf.core.column import ColumnBase + + +@acquire_spill_lock() +def unary_operation( + col: ColumnBase, op: plc.unary.UnaryOperator +) -> ColumnBase: + return type(col).from_pylibcudf( + plc.unary.unary_operation(col.to_pylibcudf(mode="read"), op) + ) + + +@acquire_spill_lock() +def is_null(col: ColumnBase) -> ColumnBase: + return type(col).from_pylibcudf( + plc.unary.is_null(col.to_pylibcudf(mode="read")) + ) + + +@acquire_spill_lock() +def is_valid(col: ColumnBase) -> ColumnBase: + return type(col).from_pylibcudf( + plc.unary.is_valid(col.to_pylibcudf(mode="read")) + ) + + +@acquire_spill_lock() +def cast(col: ColumnBase, dtype: Dtype) -> ColumnBase: + result = type(col).from_pylibcudf( + plc.unary.cast( + col.to_pylibcudf(mode="read"), dtype_to_pylibcudf_type(dtype) + ) + ) + + if is_decimal_dtype(result.dtype): + result.dtype.precision = dtype.precision # type: ignore[union-attr] + return result + + +@acquire_spill_lock() +def is_nan(col: ColumnBase) -> ColumnBase: + return type(col).from_pylibcudf( + plc.unary.is_nan(col.to_pylibcudf(mode="read")) + ) + + +@acquire_spill_lock() +def is_non_nan(col: ColumnBase) -> ColumnBase: + return type(col).from_pylibcudf( + plc.unary.is_not_nan(col.to_pylibcudf(mode="read")) + ) diff --git a/python/cudf/cudf/core/abc.py b/python/cudf/cudf/core/abc.py index ce6bb83bc77..c8ea03b04fe 100644 --- a/python/cudf/cudf/core/abc.py +++ b/python/cudf/cudf/core/abc.py @@ -1,8 +1,6 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. 
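The new `cudf/core/_internals/unary.py` module above is a set of thin, spill-lock-aware wrappers over `plc.unary`, replacing the deleted `cudf/_lib/unary.pyx`. The underlying pylibcudf calls look like this (inputs illustrative):

```python
import pyarrow as pa
import pylibcudf as plc

col = plc.interop.from_arrow(pa.array([1.0, float("nan"), None]))

plc.unary.is_null(col)   # true only for the null entry
plc.unary.is_nan(col)    # true only for the NaN entry
plc.unary.cast(col, plc.DataType(plc.TypeId.FLOAT32))
plc.unary.unary_operation(col, plc.unary.UnaryOperator.SQRT)
```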
"""Common abstract base classes for cudf.""" -import pickle - import numpy import cudf @@ -22,6 +20,14 @@ class Serializable: latter converts back from that representation into an equivalent object. """ + # A mapping from class names to the classes themselves. This is used to + # reconstruct the correct class when deserializing an object. + _name_type_map: dict = {} + + def __init_subclass__(cls, /, **kwargs): + super().__init_subclass__(**kwargs) + cls._name_type_map[cls.__name__] = cls + def serialize(self): """Generate an equivalent serializable representation of an object. @@ -98,7 +104,7 @@ def device_serialize(self): ) for f in frames ) - header["type-serialized"] = pickle.dumps(type(self)) + header["type-serialized-name"] = type(self).__name__ header["is-cuda"] = [ hasattr(f, "__cuda_array_interface__") for f in frames ] @@ -128,10 +134,10 @@ def device_deserialize(cls, header, frames): :meta private: """ - typ = pickle.loads(header["type-serialized"]) + typ = cls._name_type_map[header["type-serialized-name"]] frames = [ cudf.core.buffer.as_buffer(f) if c else memoryview(f) - for c, f in zip(header["is-cuda"], frames) + for c, f in zip(header["is-cuda"], frames, strict=True) ] return typ.deserialize(header, frames) diff --git a/python/cudf/cudf/core/buffer/buffer.py b/python/cudf/cudf/core/buffer/buffer.py index 32ae8c5ee53..625938ca168 100644 --- a/python/cudf/cudf/core/buffer/buffer.py +++ b/python/cudf/cudf/core/buffer/buffer.py @@ -3,10 +3,9 @@ from __future__ import annotations import math -import pickle import weakref from types import SimpleNamespace -from typing import Any, Literal, Mapping +from typing import TYPE_CHECKING, Any, Literal import numpy from typing_extensions import Self @@ -18,6 +17,9 @@ from cudf.core.abc import Serializable from cudf.utils.string import format_bytes +if TYPE_CHECKING: + from collections.abc import Mapping + def host_memory_allocation(nbytes: int) -> memoryview: """Allocate host memory using NumPy @@ -284,7 +286,7 @@ def memoryview( """Read-only access to the buffer through host memory.""" size = self._size if size is None else size host_buf = host_memory_allocation(size) - rmm._lib.device_buffer.copy_ptr_to_host( + rmm.pylibrmm.device_buffer.copy_ptr_to_host( self.get_ptr(mode="read") + offset, host_buf ) return memoryview(host_buf).toreadonly() @@ -429,8 +431,7 @@ def serialize(self) -> tuple[dict, list]: second element is a list containing single frame. 
""" header: dict[str, Any] = {} - header["type-serialized"] = pickle.dumps(type(self)) - header["owner-type-serialized"] = pickle.dumps(type(self._owner)) + header["owner-type-serialized-name"] = type(self._owner).__name__ header["frame_count"] = 1 frames = [self] return header, frames @@ -457,7 +458,9 @@ def deserialize(cls, header: dict, frames: list) -> Self: if isinstance(frame, cls): return frame # The frame is already deserialized - owner_type: BufferOwner = pickle.loads(header["owner-type-serialized"]) + owner_type: BufferOwner = Serializable._name_type_map[ + header["owner-type-serialized-name"] + ] if hasattr(frame, "__cuda_array_interface__"): owner = owner_type.from_device_memory(frame, exposed=False) else: diff --git a/python/cudf/cudf/core/buffer/exposure_tracked_buffer.py b/python/cudf/cudf/core/buffer/exposure_tracked_buffer.py index 0bd8d6054b3..ecf9807cfc2 100644 --- a/python/cudf/cudf/core/buffer/exposure_tracked_buffer.py +++ b/python/cudf/cudf/core/buffer/exposure_tracked_buffer.py @@ -2,13 +2,16 @@ from __future__ import annotations -from typing import Literal, Mapping +from typing import TYPE_CHECKING, Literal from typing_extensions import Self import cudf from cudf.core.buffer.buffer import Buffer, BufferOwner +if TYPE_CHECKING: + from collections.abc import Mapping + class ExposureTrackedBuffer(Buffer): """An exposure tracked buffer. diff --git a/python/cudf/cudf/core/buffer/spillable_buffer.py b/python/cudf/cudf/core/buffer/spillable_buffer.py index 4c9e524ee05..66f8be4ddc5 100644 --- a/python/cudf/cudf/core/buffer/spillable_buffer.py +++ b/python/cudf/cudf/core/buffer/spillable_buffer.py @@ -3,7 +3,6 @@ from __future__ import annotations import collections.abc -import pickle import time import weakref from threading import RLock @@ -207,7 +206,7 @@ def spill(self, target: str = "cpu") -> None: domain="cudf_python-spill", ): host_mem = host_memory_allocation(self.size) - rmm._lib.device_buffer.copy_ptr_to_host( + rmm.pylibrmm.device_buffer.copy_ptr_to_host( self._ptr, host_mem ) self._ptr_desc["memoryview"] = host_mem @@ -352,7 +351,7 @@ def memoryview( else: assert self._ptr_desc["type"] == "gpu" ret = host_memory_allocation(size) - rmm._lib.device_buffer.copy_ptr_to_host( + rmm.pylibrmm.device_buffer.copy_ptr_to_host( self._ptr + offset, ret ) return ret @@ -415,8 +414,7 @@ def serialize(self) -> tuple[dict, list]: header: dict[str, Any] = {} frames: list[Buffer | memoryview] with self._owner.lock: - header["type-serialized"] = pickle.dumps(self.__class__) - header["owner-type-serialized"] = pickle.dumps(type(self._owner)) + header["owner-type-serialized-name"] = type(self._owner).__name__ header["frame_count"] = 1 if self.is_spilled: frames = [self.memoryview()] diff --git a/python/cudf/cudf/core/byte_pair_encoding.py b/python/cudf/cudf/core/byte_pair_encoding.py index 6ca64a0a2be..8d38a5f2272 100644 --- a/python/cudf/cudf/core/byte_pair_encoding.py +++ b/python/cudf/cudf/core/byte_pair_encoding.py @@ -2,9 +2,10 @@ from __future__ import annotations +import pylibcudf as plc + import cudf from cudf._lib.nvtext.byte_pair_encode import ( - BPEMergePairs as cpp_merge_pairs, byte_pair_encoding as cpp_byte_pair_encoding, ) @@ -25,7 +26,9 @@ class BytePairEncoder: """ def __init__(self, merges_pair: "cudf.Series"): - self.merge_pairs = cpp_merge_pairs(merges_pair._column) + self.merge_pairs = plc.nvtext.byte_pair_encode.BPEMergePairs( + merges_pair._column.to_pylibcudf(mode="read") + ) def __call__(self, text, separator: str = " ") -> cudf.Series: """ diff --git 
a/python/cudf/cudf/core/column/__init__.py b/python/cudf/cudf/core/column/__init__.py index 06791df7dc0..a1e87d04bc9 100644 --- a/python/cudf/cudf/core/column/__init__.py +++ b/python/cudf/cudf/core/column/__init__.py @@ -29,4 +29,3 @@ Decimal128Column, DecimalBaseColumn, ) -from cudf.core.column.interval import IntervalColumn # noqa: F401 diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index de5ed15771d..7354b917f90 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -4,7 +4,7 @@ import warnings from functools import cached_property -from typing import TYPE_CHECKING, Any, Mapping, Sequence, cast +from typing import TYPE_CHECKING, Any, cast import numpy as np import pandas as pd @@ -14,6 +14,7 @@ import cudf from cudf import _lib as libcudf from cudf._lib.transform import bools_to_mask +from cudf.core._internals import unary from cudf.core.column import column from cudf.core.column.methods import ColumnMethods from cudf.core.dtypes import CategoricalDtype, IntervalDtype @@ -26,6 +27,7 @@ if TYPE_CHECKING: from collections import abc + from collections.abc import Mapping, Sequence import numba.cuda @@ -1017,12 +1019,12 @@ def isnull(self) -> ColumnBase: """ Identify missing values in a CategoricalColumn. """ - result = libcudf.unary.is_null(self) + result = unary.is_null(self) if self.categories.dtype.kind == "f": # Need to consider `np.nan` values in case # of an underlying float column - categories = libcudf.unary.is_nan(self.categories) + categories = unary.is_nan(self.categories) if categories.any(): code = self._encode(np.nan) result = result | (self.codes == cudf.Scalar(code)) @@ -1033,12 +1035,12 @@ def notnull(self) -> ColumnBase: """ Identify non-missing values in a CategoricalColumn. """ - result = libcudf.unary.is_valid(self) + result = unary.is_valid(self) if self.categories.dtype.kind == "f": # Need to consider `np.nan` values in case # of an underlying float column - categories = libcudf.unary.is_nan(self.categories) + categories = unary.is_nan(self.categories) if categories.any(): code = self._encode(np.nan) result = result & (self.codes != cudf.Scalar(code)) @@ -1202,9 +1204,7 @@ def _concat( elif newsize == 0: codes_col = column.column_empty(0, head.codes.dtype, masked=True) else: - # Filter out inputs that have 0 length, then concatenate. 
- codes = [o for o in codes if len(o)] - codes_col = libcudf.concat.concat_columns(objs) + codes_col = column.concat_columns(codes) # type: ignore[arg-type] codes_col = as_unsigned_codes( len(cats), @@ -1337,7 +1337,7 @@ def _set_categories( # Ensure new_categories is unique first if not (is_unique or new_cats.is_unique): - new_cats = cudf.Series(new_cats)._column.unique() + new_cats = new_cats.unique() if cur_cats.equals(new_cats, check_dtypes=True): # TODO: Internal usages don't always need a copy; add a copy keyword diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 7674565e2c3..4b1e9c1129e 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -2,12 +2,12 @@ from __future__ import annotations -import pickle from collections import abc +from collections.abc import MutableSequence, Sequence from functools import cached_property from itertools import chain from types import SimpleNamespace -from typing import TYPE_CHECKING, Any, Literal, MutableSequence, Sequence, cast +from typing import TYPE_CHECKING, Any, Literal, cast import cupy import numpy as np @@ -18,6 +18,7 @@ from pandas.core.arrays.arrow.extension_types import ArrowIntervalType from typing_extensions import Self +import pylibcudf as plc import rmm import cudf @@ -46,6 +47,7 @@ is_string_dtype, ) from cudf.core._compat import PANDAS_GE_210 +from cudf.core._internals import unary from cudf.core._internals.timezones import get_compatible_timezone from cudf.core.abc import Serializable from cudf.core.buffer import ( @@ -579,8 +581,8 @@ def _wrap_binop_normalization(self, other): if cudf.utils.utils.is_na_like(other): return cudf.Scalar(other, dtype=self.dtype) if isinstance(other, np.ndarray) and other.ndim == 0: - # Try and maintain the dtype - other = other.dtype.type(other.item()) + # Return numpy scalar + other = other[()] return self.normalize_binop_value(other) def _scatter_by_slice( @@ -712,12 +714,12 @@ def isnull(self) -> ColumnBase: if not self.has_nulls(include_nan=self.dtype.kind == "f"): return as_column(False, length=len(self)) - result = libcudf.unary.is_null(self) + result = unary.is_null(self) if self.dtype.kind == "f": # Need to consider `np.nan` values in case # of a float column - result = result | libcudf.unary.is_nan(self) + result = result | unary.is_nan(self) return result @@ -726,12 +728,12 @@ def notnull(self) -> ColumnBase: if not self.has_nulls(include_nan=self.dtype.kind == "f"): return as_column(True, length=len(self)) - result = libcudf.unary.is_valid(self) + result = unary.is_valid(self) if self.dtype.kind == "f": # Need to consider `np.nan` values in case # of a float column - result = result & libcudf.unary.is_non_nan(self) + result = result & unary.is_non_nan(self) return result @@ -1221,28 +1223,27 @@ def serialize(self) -> tuple[dict, list]: header: dict[Any, Any] = {} frames = [] - header["type-serialized"] = pickle.dumps(type(self)) try: - dtype, dtype_frames = self.dtype.serialize() + dtype, dtype_frames = self.dtype.device_serialize() header["dtype"] = dtype frames.extend(dtype_frames) header["dtype-is-cudf-serialized"] = True except AttributeError: - header["dtype"] = pickle.dumps(self.dtype) + header["dtype"] = self.dtype.str header["dtype-is-cudf-serialized"] = False if self.data is not None: - data_header, data_frames = self.data.serialize() + data_header, data_frames = self.data.device_serialize() header["data"] = data_header frames.extend(data_frames) if self.mask is not None: - mask_header, 
mask_frames = self.mask.serialize() + mask_header, mask_frames = self.mask.device_serialize() header["mask"] = mask_header frames.extend(mask_frames) if self.children: child_headers, child_frames = zip( - *(c.serialize() for c in self.children) + *(c.device_serialize() for c in self.children) ) header["subheaders"] = list(child_headers) frames.extend(chain(*child_frames)) @@ -1254,8 +1255,7 @@ def serialize(self) -> tuple[dict, list]: def deserialize(cls, header: dict, frames: list) -> ColumnBase: def unpack(header, frames) -> tuple[Any, list]: count = header["frame_count"] - klass = pickle.loads(header["type-serialized"]) - obj = klass.deserialize(header, frames[:count]) + obj = cls.device_deserialize(header, frames[:count]) return obj, frames[count:] assert header["frame_count"] == len(frames), ( @@ -1265,7 +1265,7 @@ def unpack(header, frames) -> tuple[Any, list]: if header["dtype-is-cudf-serialized"]: dtype, frames = unpack(header["dtype"], frames) else: - dtype = pickle.loads(header["dtype"]) + dtype = np.dtype(header["dtype"]) if "data" in header: data, frames = unpack(header["data"], frames) else: @@ -2216,7 +2216,9 @@ def serialize_columns(columns: list[ColumnBase]) -> tuple[list[dict], list]: frames = [] if len(columns) > 0: - header_columns = [c.serialize() for c in columns] + header_columns: list[tuple[dict, list]] = [ + c.device_serialize() for c in columns + ] headers, column_frames = zip(*header_columns) for f in column_frames: frames.extend(f) @@ -2233,7 +2235,7 @@ def deserialize_columns(headers: list[dict], frames: list) -> list[ColumnBase]: for meta in headers: col_frame_count = meta["frame_count"] - col_typ = pickle.loads(meta["type-serialized"]) + col_typ = Serializable._name_type_map[meta["type-serialized-name"]] colobj = col_typ.deserialize(meta, frames[:col_frame_count]) columns.append(colobj) # Advance frames @@ -2298,4 +2300,10 @@ def concat_columns(objs: "MutableSequence[ColumnBase]") -> ColumnBase: return column_empty(0, head.dtype, masked=True) # Filter out inputs that have 0 length, then concatenate. 
- return libcudf.concat.concat_columns([o for o in objs if len(o)]) + objs_with_len = [o for o in objs if len(o)] + with acquire_spill_lock(): + return Column.from_pylibcudf( + plc.concatenate.concatenate( + [col.to_pylibcudf(mode="read") for col in objs_with_len] + ) + ) diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index d0ea4612a1b..16124cf0a7d 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -8,29 +8,36 @@ import locale import re from locale import nl_langinfo -from typing import TYPE_CHECKING, Literal, Sequence, cast +from typing import TYPE_CHECKING, Literal, cast import numpy as np import pandas as pd import pyarrow as pa +import pylibcudf as plc + import cudf from cudf import _lib as libcudf -from cudf._lib.labeling import label_bins from cudf._lib.search import search_sorted from cudf.core._compat import PANDAS_GE_220 +from cudf.core._internals import unary from cudf.core._internals.timezones import ( check_ambiguous_and_nonexistent, get_compatible_timezone, get_tz_data, ) -from cudf.core.buffer import Buffer +from cudf.core.buffer import Buffer, acquire_spill_lock from cudf.core.column import ColumnBase, as_column, column, string from cudf.core.column.timedelta import _unit_to_nanoseconds_conversion from cudf.utils.dtypes import _get_base_dtype -from cudf.utils.utils import _all_bools_with_nulls +from cudf.utils.utils import ( + _all_bools_with_nulls, + _datetime_timedelta_find_and_replace, +) if TYPE_CHECKING: + from collections.abc import Sequence + from cudf._typing import ( ColumnBinaryOperand, DatetimeLikeScalar, @@ -480,7 +487,12 @@ def normalize_binop_value(self, other: DatetimeLikeScalar) -> ScalarLike: def as_datetime_column(self, dtype: Dtype) -> DatetimeColumn: if dtype == self.dtype: return self - return libcudf.unary.cast(self, dtype=dtype) + elif isinstance(dtype, pd.DatetimeTZDtype): + raise TypeError( + "Cannot use .astype to convert from timezone-naive dtype to timezone-aware dtype. " + "Use tz_localize instead." 
+ ) + return unary.cast(self, dtype=dtype) # type: ignore[return-value] def as_timedelta_column(self, dtype: Dtype) -> None: # type: ignore[override] raise TypeError( @@ -623,6 +635,22 @@ def quantile( ) return result.astype(self.dtype) + def find_and_replace( + self, + to_replace: ColumnBase, + replacement: ColumnBase, + all_nan: bool = False, + ) -> DatetimeColumn: + return cast( + DatetimeColumn, + _datetime_timedelta_find_and_replace( + original_column=self, + to_replace=to_replace, + replacement=replacement, + all_nan=all_nan, + ), + ) + def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: reflect, op = self._check_reflected_op(op) other = self._wrap_binop_normalization(other) @@ -792,13 +820,16 @@ def _find_ambiguous_and_nonexistent( # The end of an ambiguous time period is what Clock 2 reads at # the moment of transition: ambiguous_end = clock_2.apply_boolean_mask(cond) - ambiguous = label_bins( - self, - left_edges=ambiguous_begin, - left_inclusive=True, - right_edges=ambiguous_end, - right_inclusive=False, - ).notnull() + with acquire_spill_lock(): + plc_column = plc.labeling.label_bins( + self.to_pylibcudf(mode="read"), + ambiguous_begin.to_pylibcudf(mode="read"), + plc.labeling.Inclusive.YES, + ambiguous_end.to_pylibcudf(mode="read"), + plc.labeling.Inclusive.NO, + ) + ambiguous = libcudf.column.Column.from_pylibcudf(plc_column) + ambiguous = ambiguous.notnull() # At the start of a non-existent time period, Clock 2 reads less # than Clock 1 (which has been turned forward): @@ -808,13 +839,16 @@ def _find_ambiguous_and_nonexistent( # The end of the non-existent time period is what Clock 1 reads # at the moment of transition: nonexistent_end = clock_1.apply_boolean_mask(cond) - nonexistent = label_bins( - self, - left_edges=nonexistent_begin, - left_inclusive=True, - right_edges=nonexistent_end, - right_inclusive=False, - ).notnull() + with acquire_spill_lock(): + plc_column = plc.labeling.label_bins( + self.to_pylibcudf(mode="read"), + nonexistent_begin.to_pylibcudf(mode="read"), + plc.labeling.Inclusive.YES, + nonexistent_end.to_pylibcudf(mode="read"), + plc.labeling.Inclusive.NO, + ) + nonexistent = libcudf.column.Column.from_pylibcudf(plc_column) + nonexistent = nonexistent.notnull() return ambiguous, nonexistent @@ -940,6 +974,16 @@ def strftime(self, format: str) -> cudf.core.column.StringColumn: def as_string_column(self) -> cudf.core.column.StringColumn: return self._local_time.as_string_column() + def as_datetime_column(self, dtype: Dtype) -> DatetimeColumn: + if isinstance(dtype, pd.DatetimeTZDtype) and dtype != self.dtype: + if dtype.unit != self.time_unit: + # TODO: Doesn't check that new unit is valid. 
+ casted = self._with_type_metadata(dtype) + else: + casted = self + return casted.tz_convert(str(dtype.tz)) + return super().as_datetime_column(dtype) + def get_dt_field(self, field: str) -> ColumnBase: return libcudf.datetime.extract_datetime_component( self._local_time, field diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py index 8803ebd6791..ce7aa91f775 100644 --- a/python/cudf/cudf/core/column/decimal.py +++ b/python/cudf/cudf/core/column/decimal.py @@ -4,7 +4,7 @@ import warnings from decimal import Decimal -from typing import TYPE_CHECKING, Sequence, cast +from typing import TYPE_CHECKING, cast import cupy as cp import numpy as np @@ -16,6 +16,7 @@ from_decimal as cpp_from_decimal, ) from cudf.api.types import is_scalar +from cudf.core._internals import unary from cudf.core.buffer import as_buffer from cudf.core.column import ColumnBase from cudf.core.dtypes import ( @@ -84,7 +85,7 @@ def as_decimal_column( if dtype == self.dtype: return self - return libcudf.unary.cast(self, dtype) + return unary.cast(self, dtype) # type: ignore[return-value] def as_string_column(self) -> cudf.core.column.StringColumn: if len(self) > 0: @@ -215,23 +216,10 @@ def normalize_binop_value(self, other): ) return NotImplemented - def _decimal_quantile( - self, q: float | Sequence[float], interpolation: str, exact: bool - ) -> ColumnBase: - quant = [float(q)] if not isinstance(q, (Sequence, np.ndarray)) else q - # get sorted indices and exclude nulls - indices = libcudf.sort.order_by( - [self], [True], "first", stable=True - ).slice(self.null_count, len(self)) - result = libcudf.quantiles.quantile( - self, quant, interpolation, indices, exact - ) - return result._with_type_metadata(self.dtype) - def as_numerical_column( self, dtype: Dtype ) -> "cudf.core.column.NumericalColumn": - return libcudf.unary.cast(self, dtype) + return unary.cast(self, dtype) # type: ignore[return-value] class Decimal32Column(DecimalBaseColumn): diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index c6a39199e3b..6b25e568f00 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -3,7 +3,7 @@ from __future__ import annotations from functools import cached_property -from typing import TYPE_CHECKING, Sequence, cast +from typing import TYPE_CHECKING, cast import numpy as np import pandas as pd @@ -11,7 +11,6 @@ from typing_extensions import Self import cudf -from cudf._lib.copying import segmented_gather from cudf._lib.lists import ( concatenate_list_elements, concatenate_rows, @@ -22,6 +21,7 @@ extract_element_scalar, index_of_column, index_of_scalar, + segmented_gather, sort_lists, ) from cudf._lib.strings.convert.convert_lists import format_list_column @@ -34,6 +34,8 @@ from cudf.core.missing import NA if TYPE_CHECKING: + from collections.abc import Sequence + from cudf._typing import ColumnBinaryOperand, ColumnLike, Dtype, ScalarLike from cudf.core.buffer import Buffer diff --git a/python/cudf/cudf/core/column/methods.py b/python/cudf/cudf/core/column/methods.py index 05a0ab2e09a..a91c080fe21 100644 --- a/python/cudf/cudf/core/column/methods.py +++ b/python/cudf/cudf/core/column/methods.py @@ -2,9 +2,7 @@ from __future__ import annotations -from typing import Union, overload - -from typing_extensions import Literal +from typing import Literal, Union, overload import cudf import cudf.core.column diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 
78d2814ed26..36d1bdb45b6 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -3,7 +3,7 @@ from __future__ import annotations import functools -from typing import TYPE_CHECKING, Any, Sequence, cast +from typing import TYPE_CHECKING, Any, cast import numpy as np import pandas as pd @@ -14,6 +14,7 @@ import cudf from cudf import _lib as libcudf from cudf.api.types import is_integer, is_scalar +from cudf.core._internals import unary from cudf.core.column import ColumnBase, as_column, column, string from cudf.core.dtypes import CategoricalDtype from cudf.core.mixins import BinaryOperand @@ -28,7 +29,7 @@ from .numerical_base import NumericalBaseColumn if TYPE_CHECKING: - from collections.abc import Callable + from collections.abc import Callable, Sequence from cudf._typing import ( ColumnBinaryOperand, @@ -125,7 +126,7 @@ def indices_of(self, value: ScalarLike) -> NumericalColumn: and self.dtype.kind in {"c", "f"} and np.isnan(value) ): - nan_col = libcudf.unary.is_nan(self) + nan_col = unary.is_nan(self) return nan_col.indices_of(True) else: return super().indices_of(value) @@ -184,7 +185,7 @@ def unary_operator(self, unaryop: str | Callable) -> ColumnBase: unaryop = unaryop.upper() unaryop = _unaryop_map.get(unaryop, unaryop) unaryop = pylibcudf.unary.UnaryOperator[unaryop] - return libcudf.unary.unary_operation(self, unaryop) + return unary.unary_operation(self, unaryop) def __invert__(self): if self.dtype.kind in "ui": @@ -388,13 +389,13 @@ def as_timedelta_column( def as_decimal_column( self, dtype: Dtype ) -> "cudf.core.column.DecimalBaseColumn": - return libcudf.unary.cast(self, dtype) + return unary.cast(self, dtype) # type: ignore[return-value] def as_numerical_column(self, dtype: Dtype) -> NumericalColumn: dtype = cudf.dtype(dtype) if dtype == self.dtype: return self - return libcudf.unary.cast(self, dtype) + return unary.cast(self, dtype) # type: ignore[return-value] def all(self, skipna: bool = True) -> bool: # If all entries are null the result is True, including when the column @@ -421,7 +422,7 @@ def any(self, skipna: bool = True) -> bool: def nan_count(self) -> int: if self.dtype.kind != "f": return 0 - nan_col = libcudf.unary.is_nan(self) + nan_col = unary.is_nan(self) return nan_col.sum() def _process_values_for_isin( @@ -511,24 +512,41 @@ def find_and_replace( ): return self.copy() - to_replace_col = _normalize_find_and_replace_input( - self.dtype, to_replace - ) + try: + to_replace_col = _normalize_find_and_replace_input( + self.dtype, to_replace + ) + except TypeError: + # If `to_replace` cannot be normalized to the current dtype, + # then no value of `to_replace` is present in self, + # so there is no point in proceeding further. + return self.copy() + if all_nan: replacement_col = column.as_column(replacement, dtype=self.dtype) else: - replacement_col = _normalize_find_and_replace_input( - self.dtype, replacement - ) + try: + replacement_col = _normalize_find_and_replace_input( + self.dtype, replacement + ) + except TypeError: + # Some floating-point values can never be converted into signed + # or unsigned integers; in those cases we just need a column of + # `replacement` constructed with its own dtype for the final type + # determination below at the `find_common_type` + # call.
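+ # When `replacement` is empty, keep self.dtype so that the
+ # `find_common_type` resolution below falls back to this
+ # column's own dtype.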
+ replacement_col = column.as_column( + replacement, + dtype=self.dtype if len(replacement) <= 0 else None, + ) + common_type = find_common_type( + (to_replace_col.dtype, replacement_col.dtype, self.dtype) + ) if len(replacement_col) == 1 and len(to_replace_col) > 1: replacement_col = column.as_column( - replacement[0], length=len(to_replace_col), dtype=self.dtype + replacement[0], length=len(to_replace_col), dtype=common_type ) elif len(replacement_col) == 1 and len(to_replace_col) == 0: return self.copy() - common_type = find_common_type( - (to_replace_col.dtype, replacement_col.dtype, self.dtype) - ) replaced = self.astype(common_type) df = cudf.DataFrame._from_data( { @@ -718,6 +736,8 @@ def _normalize_find_and_replace_input( if isinstance(col_to_normalize, list): if normalized_column.null_count == len(normalized_column): normalized_column = normalized_column.astype(input_column_dtype) + if normalized_column.can_cast_safely(input_column_dtype): + return normalized_column.astype(input_column_dtype) col_to_normalize_dtype = min_column_type( normalized_column, input_column_dtype ) @@ -728,7 +748,7 @@ def _normalize_find_and_replace_input( if np.isinf(col_to_normalize[0]): return normalized_column col_to_normalize_casted = np.array(col_to_normalize[0]).astype( - input_column_dtype + col_to_normalize_dtype ) if not np.isnan(col_to_normalize_casted) and ( @@ -739,8 +759,8 @@ def _normalize_find_and_replace_input( f"{col_to_normalize[0]} " f"to {input_column_dtype.name}" ) - else: - col_to_normalize_dtype = input_column_dtype + if normalized_column.can_cast_safely(col_to_normalize_dtype): + return normalized_column.astype(col_to_normalize_dtype) elif hasattr(col_to_normalize, "dtype"): col_to_normalize_dtype = col_to_normalize.dtype else: @@ -755,6 +775,8 @@ def _normalize_find_and_replace_input( f"{col_to_normalize_dtype.name} " f"to {input_column_dtype.name}" ) + if not normalized_column.can_cast_safely(input_column_dtype): + return normalized_column return normalized_column.astype(input_column_dtype) diff --git a/python/cudf/cudf/core/column/numerical_base.py b/python/cudf/cudf/core/column/numerical_base.py index 3b8dd05c13a..6d639337401 100644 --- a/python/cudf/cudf/core/column/numerical_base.py +++ b/python/cudf/cudf/core/column/numerical_base.py @@ -7,9 +7,11 @@ import numpy as np +import pylibcudf as plc + import cudf from cudf import _lib as libcudf -from cudf.core.buffer import Buffer +from cudf.core.buffer import Buffer, acquire_spill_lock from cudf.core.column import ColumnBase from cudf.core.missing import NA from cudf.core.mixins import Scannable @@ -145,9 +147,15 @@ def quantile( indices = libcudf.sort.order_by( [self], [True], "first", stable=True ).slice(self.null_count, len(self)) - result = libcudf.quantiles.quantile( - self, q, interpolation, indices, exact - ) + with acquire_spill_lock(): + plc_column = plc.quantiles.quantile( + self.to_pylibcudf(mode="read"), + q, + plc.types.Interpolation[interpolation.upper()], + indices.to_pylibcudf(mode="read"), + exact, + ) + result = type(self).from_pylibcudf(plc_column) # type: ignore[assignment] if return_scalar: scalar_result = result.element_indexing(0) if interpolation in {"lower", "higher", "nearest"}: @@ -180,9 +188,12 @@ def var( min_count: int = 0, ddof=1, ): - return self._reduce( + result = self._reduce( "var", skipna=skipna, min_count=min_count, ddof=ddof ) + if result is NA: + return cudf.utils.dtypes._get_nan_for_dtype(self.dtype) + return result def std( self, @@ -190,9 +201,12 @@ def std( min_count: int = 0, ddof=1, 
): - return self._reduce( + result = self._reduce( "std", skipna=skipna, min_count=min_count, ddof=ddof ) + if result is NA: + return cudf.utils.dtypes._get_nan_for_dtype(self.dtype) + return result def median(self, skipna: bool | None = None) -> NumericalBaseColumn: skipna = True if skipna is None else skipna diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 4463e3280df..3d70b01b7e4 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -5,12 +5,14 @@ import re import warnings from functools import cached_property -from typing import TYPE_CHECKING, Sequence, cast, overload +from typing import TYPE_CHECKING, cast, overload import numpy as np import pandas as pd import pyarrow as pa +import pylibcudf as plc + import cudf import cudf.api.types from cudf import _lib as libcudf @@ -33,6 +35,8 @@ def str_to_boolean(column: StringColumn): if TYPE_CHECKING: + from collections.abc import Sequence + import cupy import numba.cuda @@ -996,7 +1000,7 @@ def replace( return self._return_or_inplace( libstrings.replace_multi_re( self._column, - pat, + list(pat), column.as_column(repl, dtype="str"), ) if regex @@ -2383,8 +2387,7 @@ def get_json_object( 0 [\n { "category": "reference",\n ... dtype: object """ - - options = libstrings.GetJsonObjectOptions( + options = plc.json.GetJsonObjectOptions( allow_single_quotes=allow_single_quotes, strip_quotes_from_single_strings=( strip_quotes_from_single_strings @@ -2546,9 +2549,9 @@ def split( result_table = {0: self._column.copy()} else: if regex is True: - data, _ = libstrings.split_re(self._column, pat, n) + data = libstrings.split_re(self._column, pat, n) else: - data, _ = libstrings.split( + data = libstrings.split( self._column, cudf.Scalar(pat, "str"), n ) if len(data) == 1 and data[0].null_count == len(self._column): @@ -2719,9 +2722,9 @@ def rsplit( result_table = {0: self._column.copy()} else: if regex is True: - data, _ = libstrings.rsplit_re(self._column, pat, n) + data = libstrings.rsplit_re(self._column, pat, n) else: - data, _ = libstrings.rsplit( + data = libstrings.rsplit( self._column, cudf.Scalar(pat, "str"), n ) if len(data) == 1 and data[0].null_count == len(self._column): @@ -2820,7 +2823,7 @@ def partition(self, sep: str = " ", expand: bool = True) -> SeriesOrIndex: sep = " " return self._return_or_inplace( - libstrings.partition(self._column, cudf.Scalar(sep, "str"))[0], + libstrings.partition(self._column, cudf.Scalar(sep, "str")), expand=expand, ) @@ -2885,7 +2888,7 @@ def rpartition(self, sep: str = " ", expand: bool = True) -> SeriesOrIndex: sep = " " return self._return_or_inplace( - libstrings.rpartition(self._column, cudf.Scalar(sep, "str"))[0], + libstrings.rpartition(self._column, cudf.Scalar(sep, "str")), expand=expand, ) @@ -2966,7 +2969,7 @@ def pad( raise TypeError(msg) try: - side = libstrings.SideType[side.upper()] + side = plc.strings.side_type.SideType[side.upper()] except KeyError: raise ValueError( "side has to be either one of {'left', 'right', 'both'}" @@ -3624,6 +3627,46 @@ def findall(self, pat: str, flags: int = 0) -> SeriesOrIndex: data = libstrings.findall(self._column, pat, flags) return self._return_or_inplace(data) + def find_re(self, pat: str, flags: int = 0) -> SeriesOrIndex: + """ + Find first occurrence of pattern or regular expression in the + Series/Index. + + Parameters + ---------- + pat : str + Pattern or regular expression. + flags : int, default 0 (no flags) + Flags to pass through to the regex engine (e.g. 
re.MULTILINE) + + Returns + ------- + Series + A Series of position values where the pattern first matches + each string. + + Examples + -------- + >>> import cudf + >>> s = cudf.Series(['Lion', 'Monkey', 'Rabbit', 'Cat']) + >>> s.str.find_re('[ti]') + 0 1 + 1 -1 + 2 4 + 3 2 + dtype: int32 + """ + if isinstance(pat, re.Pattern): + flags = pat.flags & ~re.U + pat = pat.pattern + if not _is_supported_regex_flags(flags): + raise NotImplementedError( + "Unsupported value for `flags` parameter" + ) + + data = libstrings.find_re(self._column, pat, flags) + return self._return_or_inplace(data) + def find_multiple(self, patterns: SeriesOrIndex) -> cudf.Series: """ Find all first occurrences of patterns in the Series/Index. @@ -5307,11 +5350,65 @@ def minhash( libstrings.minhash(self._column, seeds_column, width) ) + def minhash_permuted( + self, seed: np.uint32, a: ColumnLike, b: ColumnLike, width: int + ) -> SeriesOrIndex: + """ + Compute the minhash of a strings column. + + This uses the MurmurHash3_x86_32 algorithm for the hash function. + + Calculation uses the formula (hv * a + b) % mersenne_prime + where hv is the hash of a substring of width characters, + a and b are provided values and mersenne_prime is 2^61-1. + + Parameters + ---------- + seed : uint32 + The seed used for the hash algorithm. + a : ColumnLike + Values for minhash calculation. + Must be of type uint32. + b : ColumnLike + Values for minhash calculation. + Must be of type uint32. + width : int + The width of the substring to hash. + + Examples + -------- + >>> import cudf + >>> import numpy as np + >>> s = cudf.Series(['this is my', 'favorite book']) + >>> a = cudf.Series([1, 2, 3], dtype=np.uint32) + >>> b = cudf.Series([4, 5, 6], dtype=np.uint32) + >>> s.str.minhash_permuted(0, a=a, b=b, width=5) + 0 [1305480171, 462824409, 74608232] + 1 [32665388, 65330773, 97996158] + dtype: list + """ + a_column = column.as_column(a) + if a_column.dtype != np.uint32: + raise ValueError( + f"Expecting a Series with dtype uint32, got {type(a)}" + ) + b_column = column.as_column(b) + if b_column.dtype != np.uint32: + raise ValueError( + f"Expecting a Series with dtype uint32, got {type(b)}" + ) + return self._return_or_inplace( + libstrings.minhash_permuted( + self._column, seed, a_column, b_column, width + ) + ) + def minhash64( self, seeds: ColumnLike | None = None, width: int = 4 ) -> SeriesOrIndex: """ Compute the minhash of a strings column. + This uses the MurmurHash3_x64_128 algorithm for the hash function. This function generates 2 uint64 values but only the first uint64 value is used. @@ -5347,6 +5444,59 @@ def minhash64( libstrings.minhash64(self._column, seeds_column, width) ) + def minhash64_permuted( + self, seed: np.uint64, a: ColumnLike, b: ColumnLike, width: int + ) -> SeriesOrIndex: + """ + Compute the minhash of a strings column. + This uses the MurmurHash3_x64_128 algorithm for the hash function. + + Calculation uses the formula (hv * a + b) % mersenne_prime + where hv is the hash of a substring of width characters, + a and b are provided values and mersenne_prime is 2^61-1. + + Parameters + ---------- + seed : uint64 + The seed used for the hash algorithm. + a : ColumnLike + Values for minhash calculation. + Must be of type uint64. + b : ColumnLike + Values for minhash calculation. + Must be of type uint64. + width : int + The width of the substring to hash. 
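+ + Returns + ------- + Series or Index + A list column of minhash values, with one value per provided + (a, b) pair for each input string.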
+ + Examples + -------- + >>> import cudf + >>> import numpy as np + >>> s = cudf.Series(['this is my', 'favorite book', 'to read']) + >>> a = cudf.Series([2, 3], dtype=np.uint64) + >>> b = cudf.Series([5, 6], dtype=np.uint64) + >>> s.str.minhash64_permuted(0, a=a, b=b, width=5) + 0 [172452388517576012, 316595762085180527] + 1 [71427536958126239, 58787297728258215] + 2 [423885828176437114, 1140588505926961370] + dtype: list + """ + a_column = column.as_column(a) + if a_column.dtype != np.uint64: + raise ValueError( + f"Expecting a Series with dtype uint64, got {type(a)}" + ) + b_column = column.as_column(b) + if b_column.dtype != np.uint64: + raise ValueError( + f"Expecting a Series with dtype uint64, got {type(b)}" + ) + return self._return_or_inplace( + libstrings.minhash64_permuted( + self._column, seed, a_column, b_column, width + ) + ) + def word_minhash(self, seeds: ColumnLike | None = None) -> SeriesOrIndex: """ Compute the minhash of a list column of strings. diff --git a/python/cudf/cudf/core/column/struct.py b/python/cudf/cudf/core/column/struct.py index 2fda3b2c434..8f16ba4e15b 100644 --- a/python/cudf/cudf/core/column/struct.py +++ b/python/cudf/cudf/core/column/struct.py @@ -68,12 +68,7 @@ def base_size(self): return self.size + self.offset def to_arrow(self) -> pa.Array: - children = [ - pa.nulls(len(child)) - if len(child) == child.null_count - else child.to_arrow() - for child in self.children - ] + children = [child.to_arrow() for child in self.children] pa_type = pa.struct( { diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 6b6f3e517a8..620fe31c30f 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -4,7 +4,7 @@ import datetime import functools -from typing import TYPE_CHECKING, Sequence, cast +from typing import TYPE_CHECKING, cast import numpy as np import pandas as pd @@ -13,12 +13,18 @@ import cudf from cudf import _lib as libcudf from cudf.api.types import is_scalar +from cudf.core._internals import unary from cudf.core.buffer import Buffer, acquire_spill_lock from cudf.core.column import ColumnBase, column, string from cudf.utils.dtypes import np_to_pa_dtype -from cudf.utils.utils import _all_bools_with_nulls +from cudf.utils.utils import ( + _all_bools_with_nulls, + _datetime_timedelta_find_and_replace, +) if TYPE_CHECKING: + from collections.abc import Sequence + from cudf._typing import ColumnBinaryOperand, DatetimeLikeScalar, Dtype _unit_to_nanoseconds_conversion = { @@ -93,7 +99,7 @@ def __init__( size = data.size // dtype.itemsize size = size - offset if len(children) != 0: - raise ValueError("TimedeltaColumn must have no children.") + raise ValueError("TimeDeltaColumn must have no children.") super().__init__( data=data, size=size, @@ -302,7 +308,53 @@ def as_string_column(self) -> cudf.core.column.StringColumn: def as_timedelta_column(self, dtype: Dtype) -> TimeDeltaColumn: if dtype == self.dtype: return self - return libcudf.unary.cast(self, dtype=dtype) + return unary.cast(self, dtype=dtype) # type: ignore[return-value] + + def find_and_replace( + self, + to_replace: ColumnBase, + replacement: ColumnBase, + all_nan: bool = False, + ) -> TimeDeltaColumn: + return cast( + TimeDeltaColumn, + _datetime_timedelta_find_and_replace( + original_column=self, + to_replace=to_replace, + replacement=replacement, + all_nan=all_nan, + ), + ) + + def can_cast_safely(self, to_dtype: Dtype) -> bool: + if to_dtype.kind == "m": # type: ignore[union-attr] + to_res, _ 
= np.datetime_data(to_dtype) + self_res, _ = np.datetime_data(self.dtype) + + max_int = np.iinfo(np.int64).max + + max_dist = np.timedelta64( + self.max().astype(np.int64, copy=False), self_res + ) + min_dist = np.timedelta64( + self.min().astype(np.int64, copy=False), self_res + ) + + self_delta_dtype = np.timedelta64(0, self_res).dtype + + if max_dist <= np.timedelta64(max_int, to_res).astype( + self_delta_dtype + ) and min_dist <= np.timedelta64(max_int, to_res).astype( + self_delta_dtype + ): + return True + else: + return False + elif to_dtype == cudf.dtype("int64") or to_dtype == cudf.dtype("O"): + # can safely cast to representation, or string + return True + else: + return False def mean(self, skipna=None) -> pd.Timedelta: return pd.Timedelta( diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index bc093fdaa9a..496e86ed709 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -5,8 +5,9 @@ import itertools import sys from collections import abc +from collections.abc import Mapping from functools import cached_property, reduce -from typing import TYPE_CHECKING, Any, Mapping, cast +from typing import TYPE_CHECKING, Any, cast import numpy as np import pandas as pd diff --git a/python/cudf/cudf/core/cut.py b/python/cudf/cudf/core/cut.py index c9b1fa2669c..a4d12cfc7f0 100644 --- a/python/cudf/cudf/core/cut.py +++ b/python/cudf/cudf/core/cut.py @@ -6,8 +6,12 @@ import numpy as np import pandas as pd +import pylibcudf as plc + import cudf +from cudf._lib.column import Column from cudf.api.types import is_list_like +from cudf.core.buffer import acquire_spill_lock from cudf.core.column import as_column from cudf.core.column.categorical import CategoricalColumn, as_unsigned_codes from cudf.core.index import IntervalIndex, interval_range @@ -256,9 +260,19 @@ def cut( # the input arr must be changed to the same type as the edges input_arr = input_arr.astype(left_edges.dtype) # get the indexes for the appropriate number - index_labels = cudf._lib.labeling.label_bins( - input_arr, left_edges, left_inclusive, right_edges, right_inclusive - ) + with acquire_spill_lock(): + plc_column = plc.labeling.label_bins( + input_arr.to_pylibcudf(mode="read"), + left_edges.to_pylibcudf(mode="read"), + plc.labeling.Inclusive.YES + if left_inclusive + else plc.labeling.Inclusive.NO, + right_edges.to_pylibcudf(mode="read"), + plc.labeling.Inclusive.YES + if right_inclusive + else plc.labeling.Inclusive.NO, + ) + index_labels = Column.from_pylibcudf(plc_column) if labels is False: # if labels is false we return the index labels, we return them @@ -283,7 +297,7 @@ def cut( # should allow duplicate categories. 
return interval_labels[index_labels] - index_labels = as_unsigned_codes(len(interval_labels), index_labels) + index_labels = as_unsigned_codes(len(interval_labels), index_labels) # type: ignore[arg-type] col = CategoricalColumn( data=None, diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 16b0aa95c35..fd68a40324e 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -7,14 +7,13 @@ import itertools import numbers import os -import pickle import re import sys import textwrap import warnings from collections import abc, defaultdict -from collections.abc import Callable, Iterator -from typing import TYPE_CHECKING, Any, Literal, MutableMapping, cast +from collections.abc import Callable, Iterator, MutableMapping +from typing import TYPE_CHECKING, Any, Literal, cast import cupy import numba @@ -26,6 +25,8 @@ from pandas.io.formats.printing import pprint_thing from typing_extensions import Self, assert_never +import pylibcudf as plc + import cudf import cudf.core.common from cudf import _lib as libcudf @@ -42,7 +43,7 @@ ) from cudf.core import column, df_protocol, indexing_utils, reshape from cudf.core._compat import PANDAS_LT_300 -from cudf.core.abc import Serializable +from cudf.core.buffer import acquire_spill_lock from cudf.core.column import ( CategoricalColumn, ColumnBase, @@ -579,7 +580,7 @@ class _DataFrameiAtIndexer(_DataFrameIlocIndexer): pass -class DataFrame(IndexedFrame, Serializable, GetAttrGetItemMixin): +class DataFrame(IndexedFrame, GetAttrGetItemMixin): """ A GPU Dataframe object. @@ -781,9 +782,15 @@ def __init__( ) elif isinstance(data, ColumnAccessor): raise TypeError( - "Use cudf.Series._from_data for constructing a Series from " + "Use cudf.DataFrame._from_data for constructing a DataFrame from " "ColumnAccessor" ) + elif isinstance(data, ColumnBase): + raise TypeError( + "Use cudf.DataFrame._from_arrays for constructing a DataFrame from " + "ColumnBase, or use cudf.DataFrame._from_data by passing a dict " + "that maps column names to columns." + ) elif hasattr(data, "__cuda_array_interface__"): arr_interface = data.__cuda_array_interface__ # descr is an optional field of the _cuda_ary_iface_ @@ -1175,7 +1182,7 @@ def _constructor_expanddim(self): def serialize(self): header, frames = super().serialize() - header["index"], index_frames = self.index.serialize() + header["index"], index_frames = self.index.device_serialize() header["index_frame_count"] = len(index_frames) # For backwards compatibility with older versions of cuDF, index # columns are placed before data columns.
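A minimal round-trip sketch of the contract this serialize hunk and the deserialize hunk below preserve (assuming a GPU-backed cudf install; the frame contents here are arbitrary):

import cudf

df = cudf.DataFrame({"a": [1, 2, 3]})
# The index header and frames are packed before the data columns,
# as the comment above notes.
header, frames = df.serialize()
restored = cudf.DataFrame.deserialize(header, frames)
assert restored.equals(df)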
@@ -1190,8 +1197,7 @@ def deserialize(cls, header, frames): header, frames[header["index_frame_count"] :] ) - idx_typ = pickle.loads(header["index"]["type-serialized"]) - index = idx_typ.deserialize(header["index"], frames[:index_nframes]) + index = cls.device_deserialize(header["index"], frames[:index_nframes]) obj.index = index return obj @@ -1778,11 +1784,32 @@ def _concat( ) # Concatenate the Tables - out = cls._from_data( - *libcudf.concat.concat_tables( - tables, ignore_index=ignore_index or are_all_range_index + ignore = ignore_index or are_all_range_index + index_names = None if ignore else tables[0]._index_names + column_names = tables[0]._column_names + with acquire_spill_lock(): + plc_tables = [ + plc.Table( + [ + c.to_pylibcudf(mode="read") + for c in ( + table._columns + if ignore + else itertools.chain( + table._index._columns, table._columns + ) + ) + ] + ) + for table in tables + ] + + concatted = libcudf.utils.data_from_pylibcudf_table( + plc.concatenate.concatenate(plc_tables), + column_names=column_names, + index_names=index_names, ) - ) + out = cls._from_data(*concatted) # If ignore_index is True, all input frames are empty, and at # least one input frame has an index, assign a new RangeIndex @@ -4956,7 +4983,9 @@ def apply_chunks( ) @_performance_tracking - def partition_by_hash(self, columns, nparts, keep_index=True): + def partition_by_hash( + self, columns, nparts: int, keep_index: bool = True + ) -> list[DataFrame]: """Partition the dataframe by the hashed value of data in *columns*. Parameters @@ -4980,13 +5009,21 @@ def partition_by_hash(self, columns, nparts, keep_index=True): else: cols = [*self._columns] - output_columns, offsets = libcudf.hash.hash_partition( - cols, key_indices, nparts - ) + with acquire_spill_lock(): + plc_table, offsets = plc.partitioning.hash_partition( + plc.Table([col.to_pylibcudf(mode="read") for col in cols]), + key_indices, + nparts, + ) + output_columns = [ + libcudf.column.Column.from_pylibcudf(col) + for col in plc_table.columns() + ] + outdf = self._from_columns_like_self( output_columns, self._column_names, - self._index_names if keep_index else None, + self._index_names if keep_index else None, # type: ignore[arg-type] ) # Slice into partitions. Notice, `hash_partition` returns the start # offset of each partition thus we skip the first offset @@ -5118,11 +5155,12 @@ def info( useful for big DataFrames and fine-tune memory optimization: >>> import numpy as np - >>> random_strings_array = np.random.choice(['a', 'b', 'c'], 10 ** 6) + >>> rng = np.random.default_rng(seed=0) + >>> random_strings_array = rng.choice(['a', 'b', 'c'], 10 ** 6) >>> df = cudf.DataFrame({ - ... 'column_1': np.random.choice(['a', 'b', 'c'], 10 ** 6), - ... 'column_2': np.random.choice(['a', 'b', 'c'], 10 ** 6), - ... 'column_3': np.random.choice(['a', 'b', 'c'], 10 ** 6) + ... 'column_1': rng.choice(['a', 'b', 'c'], 10 ** 6), + ... 'column_2': rng.choice(['a', 'b', 'c'], 10 ** 6), + ... 'column_3': rng.choice(['a', 'b', 'c'], 10 ** 6) ... 
}) >>> df.info(memory_usage='deep') @@ -5883,7 +5921,7 @@ def _from_arrays( f"records dimension expected 1 or 2 but found: {array_data.ndim}" ) - if data.ndim == 2: + if array_data.ndim == 2: num_cols = array_data.shape[1] else: # Since we validate ndim to be either 1 or 2 above, @@ -6287,14 +6325,17 @@ def _prepare_for_rowwise_op(self, method, skipna, numeric_only): ) if not skipna and any(col.nullable for col in filtered._columns): - mask = DataFrame( + length = filtered._data.nrows + ca = ColumnAccessor( { - name: filtered._data[name]._get_mask_as_column() - if filtered._data[name].nullable - else as_column(True, length=len(filtered._data[name])) - for name in filtered._column_names - } + name: col._get_mask_as_column() + if col.nullable + else as_column(True, length=length) + for name, col in filtered._data.items() + }, + verify=False, ) + mask = DataFrame._from_data(ca) mask = mask.all(axis=1) else: mask = None @@ -6679,19 +6720,10 @@ def _apply_cupy_method_axis_1(self, method, *args, **kwargs): ) return Series._from_column(result, index=self.index) else: - result_df = DataFrame(result).set_index(self.index) + result_df = DataFrame(result, index=self.index) result_df._set_columns_like(prepared._data) return result_df - @_performance_tracking - def _columns_view(self, columns): - """ - Return a subset of the DataFrame's columns as a view. - """ - return DataFrame( - {col: self._data[col] for col in columns}, index=self.index - ) - @_performance_tracking def select_dtypes(self, include=None, exclude=None): """Return a subset of the DataFrame's columns based on the column dtypes. @@ -6763,8 +6795,6 @@ def select_dtypes(self, include=None, exclude=None): if not isinstance(exclude, (list, tuple)): exclude = (exclude,) if exclude is not None else () - df = DataFrame(index=self.index) - # cudf_dtype_from_pydata_dtype can distinguish between # np.float and np.number selection = tuple(map(frozenset, (include, exclude))) @@ -6820,12 +6850,12 @@ def select_dtypes(self, include=None, exclude=None): # remove all exclude types inclusion = inclusion - exclude_subtypes - for k, col in self._column_labels_and_values: - infered_type = cudf_dtype_from_pydata_dtype(col.dtype) - if infered_type in inclusion: - df._insert(len(df._data), k, col) - - return df + to_select = [ + label + for label, dtype in self._dtypes + if cudf_dtype_from_pydata_dtype(dtype) in inclusion + ] + return self.loc[:, to_select] @ioutils.doc_to_parquet() def to_parquet( @@ -7331,7 +7361,7 @@ def cov(self, min_periods=None, ddof: int = 1, numeric_only: bool = False): cov = cupy.cov(self.values, ddof=ddof, rowvar=False) cols = self._data.to_pandas_index() - df = DataFrame(cupy.asfortranarray(cov)).set_index(cols) + df = DataFrame(cupy.asfortranarray(cov), index=cols) df._set_columns_like(self._data) return df @@ -7374,7 +7404,7 @@ def corr( corr = cupy.corrcoef(values, rowvar=False) cols = self._data.to_pandas_index() - df = DataFrame(cupy.asfortranarray(corr)).set_index(cols) + df = DataFrame(cupy.asfortranarray(corr), index=cols) df._set_columns_like(self._data) return df diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py index 5250a741d3d..aa601a2b322 100644 --- a/python/cudf/cudf/core/df_protocol.py +++ b/python/cudf/cudf/core/df_protocol.py @@ -3,7 +3,7 @@ import enum from collections import abc -from typing import Any, Iterable, Mapping, Sequence, Tuple, cast +from typing import TYPE_CHECKING, Any, cast import cupy as cp import numpy as np @@ -20,6 +20,9 @@ build_column, ) +if 
TYPE_CHECKING: + from collections.abc import Iterable, Mapping, Sequence + # Implementation of interchange protocol classes # ---------------------------------------------- @@ -61,7 +64,7 @@ class _MaskKind(enum.IntEnum): _DtypeKind.BOOL, _DtypeKind.STRING, } -ProtoDtype = Tuple[_DtypeKind, int, str, str] +ProtoDtype = tuple[_DtypeKind, int, str, str] class _CuDFBuffer: diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index 2110e610c37..8765a27a165 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -3,7 +3,6 @@ import decimal import operator -import pickle import textwrap import warnings from functools import cached_property @@ -91,13 +90,13 @@ def dtype(arbitrary): raise TypeError(f"Cannot interpret {arbitrary} as a valid cuDF dtype") -def _decode_type( +def _check_type( cls: type, header: dict, frames: list, is_valid_class: Callable[[type, type], bool] = operator.is_, -) -> tuple[dict, list, type]: - """Decode metadata-encoded type and check validity +) -> None: + """Check that the metadata-encoded type is valid Parameters ---------- @@ -112,12 +111,6 @@ class performing deserialization serialization by `cls` (default is to check type equality), called as `is_valid_class(decoded_class, cls)`. - Returns - ------- - tuple - Tuple of validated headers, frames, and the decoded class - constructor. - Raises ------ AssertionError @@ -128,11 +121,10 @@ class performing deserialization f"Deserialization expected {header['frame_count']} frames, " f"but received {len(frames)}." ) - klass = pickle.loads(header["type-serialized"]) assert is_valid_class( - klass, cls + klass := Serializable._name_type_map[header["type-serialized-name"]], + cls, ), f"Header-encoded {klass=} does not match decoding {cls=}."
- return header, frames, klass class _BaseDtype(ExtensionDtype, Serializable): @@ -305,13 +297,14 @@ def construct_from_string(self): def serialize(self): header = {} - header["type-serialized"] = pickle.dumps(type(self)) header["ordered"] = self.ordered frames = [] if self.categories is not None: - categories_header, categories_frames = self.categories.serialize() + categories_header, categories_frames = ( + self.categories.device_serialize() + ) header["categories"] = categories_header frames.extend(categories_frames) header["frame_count"] = len(frames) @@ -319,15 +312,14 @@ def serialize(self): @classmethod def deserialize(cls, header, frames): - header, frames, klass = _decode_type(cls, header, frames) + _check_type(cls, header, frames) ordered = header["ordered"] categories_header = header["categories"] categories_frames = frames - categories_type = pickle.loads(categories_header["type-serialized"]) - categories = categories_type.deserialize( + categories = Serializable.device_deserialize( categories_header, categories_frames ) - return klass(categories=categories, ordered=ordered) + return cls(categories=categories, ordered=ordered) def __repr__(self): return self.to_pandas().__repr__() @@ -495,12 +487,13 @@ def __hash__(self): def serialize(self) -> tuple[dict, list]: header: dict[str, Dtype] = {} - header["type-serialized"] = pickle.dumps(type(self)) frames = [] if isinstance(self.element_type, _BaseDtype): - header["element-type"], frames = self.element_type.serialize() + header["element-type"], frames = ( + self.element_type.device_serialize() + ) else: header["element-type"] = getattr( self.element_type, "name", self.element_type @@ -510,14 +503,14 @@ def serialize(self) -> tuple[dict, list]: @classmethod def deserialize(cls, header: dict, frames: list): - header, frames, klass = _decode_type(cls, header, frames) + _check_type(cls, header, frames) if isinstance(header["element-type"], dict): - element_type = pickle.loads( - header["element-type"]["type-serialized"] - ).deserialize(header["element-type"], frames) + element_type = Serializable.device_deserialize( + header["element-type"], frames + ) else: element_type = header["element-type"] - return klass(element_type=element_type) + return cls(element_type=element_type) @cached_property def itemsize(self): @@ -641,7 +634,6 @@ def __hash__(self): def serialize(self) -> tuple[dict, list]: header: dict[str, Any] = {} - header["type-serialized"] = pickle.dumps(type(self)) frames: list[Buffer] = [] @@ -649,33 +641,31 @@ def serialize(self) -> tuple[dict, list]: for k, dtype in self.fields.items(): if isinstance(dtype, _BaseDtype): - dtype_header, dtype_frames = dtype.serialize() + dtype_header, dtype_frames = dtype.device_serialize() fields[k] = ( dtype_header, (len(frames), len(frames) + len(dtype_frames)), ) frames.extend(dtype_frames) else: - fields[k] = pickle.dumps(dtype) + fields[k] = dtype.str header["fields"] = fields header["frame_count"] = len(frames) return header, frames @classmethod def deserialize(cls, header: dict, frames: list): - header, frames, klass = _decode_type(cls, header, frames) + _check_type(cls, header, frames) fields = {} for k, dtype in header["fields"].items(): if isinstance(dtype, tuple): dtype_header, (start, stop) = dtype - fields[k] = pickle.loads( - dtype_header["type-serialized"] - ).deserialize( + fields[k] = Serializable.device_deserialize( dtype_header, frames[start:stop], ) else: - fields[k] = pickle.loads(dtype) + fields[k] = np.dtype(dtype) return cls(fields) @cached_property @@ -838,7 
+828,6 @@ def _from_decimal(cls, decimal): def serialize(self) -> tuple[dict, list]: return ( { - "type-serialized": pickle.dumps(type(self)), "precision": self.precision, "scale": self.scale, "frame_count": 0, @@ -848,11 +837,8 @@ def serialize(self) -> tuple[dict, list]: @classmethod def deserialize(cls, header: dict, frames: list): - header, frames, klass = _decode_type( - cls, header, frames, is_valid_class=issubclass - ) - klass = pickle.loads(header["type-serialized"]) - return klass(header["precision"], header["scale"]) + _check_type(cls, header, frames, is_valid_class=issubclass) + return cls(header["precision"], header["scale"]) def __eq__(self, other: Dtype) -> bool: if other is self: @@ -960,18 +946,17 @@ def __hash__(self): def serialize(self) -> tuple[dict, list]: header = { - "type-serialized": pickle.dumps(type(self)), - "fields": pickle.dumps((self.subtype, self.closed)), + "fields": (self.subtype.str, self.closed), "frame_count": 0, } return header, [] @classmethod def deserialize(cls, header: dict, frames: list): - header, frames, klass = _decode_type(cls, header, frames) - klass = pickle.loads(header["type-serialized"]) - subtype, closed = pickle.loads(header["fields"]) - return klass(subtype, closed=closed) + _check_type(cls, header, frames) + subtype, closed = header["fields"] + subtype = np.dtype(subtype) + return cls(subtype, closed=closed) def _is_categorical_dtype(obj): diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 37ad6b8fabb..f7af374ca8d 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -3,10 +3,9 @@ from __future__ import annotations import operator -import pickle import warnings from collections import abc -from typing import TYPE_CHECKING, Any, Literal, MutableMapping +from typing import TYPE_CHECKING, Any, Literal # TODO: The `numpy` import is needed for typing purposes during doc builds # only, need to figure out why the `np` alias is insufficient then remove. @@ -16,10 +15,13 @@ import pyarrow as pa from typing_extensions import Self +import pylibcudf as plc + import cudf from cudf import _lib as libcudf from cudf.api.types import is_dtype_equal, is_scalar from cudf.core._compat import PANDAS_LT_300 +from cudf.core.abc import Serializable from cudf.core.buffer import acquire_spill_lock from cudf.core.column import ( ColumnBase, @@ -36,13 +38,14 @@ from cudf.utils.utils import _array_ufunc, _warn_no_dask_cudf if TYPE_CHECKING: + from collections.abc import MutableMapping from types import ModuleType from cudf._typing import Dtype, ScalarLike # TODO: It looks like Frame is missing a declaration of `copy`, need to add -class Frame(BinaryOperand, Scannable): +class Frame(BinaryOperand, Scannable, Serializable): """A collection of Column objects with an optional index. 
Parameters @@ -92,37 +95,80 @@ def ndim(self) -> int: @_performance_tracking def serialize(self): # TODO: See if self._data can be serialized outright + frames = [] header = { - "type-serialized": pickle.dumps(type(self)), - "column_names": pickle.dumps(self._column_names), - "column_rangeindex": pickle.dumps(self._data.rangeindex), - "column_multiindex": pickle.dumps(self._data.multiindex), - "column_label_dtype": pickle.dumps(self._data.label_dtype), - "column_level_names": pickle.dumps(self._data._level_names), + "column_label_dtype": None, + "dtype-is-cudf-serialized": False, + } + if (label_dtype := self._data.label_dtype) is not None: + try: + header["column_label_dtype"], frames = ( + label_dtype.device_serialize() + ) + header["dtype-is-cudf-serialized"] = True + except AttributeError: + header["column_label_dtype"] = label_dtype.str + + header["columns"], column_frames = serialize_columns(self._columns) + column_names, column_names_numpy_type = ( + zip( + *[ + (cname.item(), type(cname).__name__) + if isinstance(cname, np.generic) + else (cname, "") + for cname in self._column_names + ] + ) + if self._column_names + else ((), ()) + ) + header |= { + "column_names": column_names, + "column_names_numpy_type": column_names_numpy_type, + "column_rangeindex": self._data.rangeindex, + "column_multiindex": self._data.multiindex, + "column_level_names": self._data._level_names, } - header["columns"], frames = serialize_columns(self._columns) + frames.extend(column_frames) + return header, frames @classmethod @_performance_tracking def deserialize(cls, header, frames): - cls_deserialize = pickle.loads(header["type-serialized"]) - column_names = pickle.loads(header["column_names"]) - columns = deserialize_columns(header["columns"], frames) kwargs = {} + dtype_header = header["column_label_dtype"] + if header["dtype-is-cudf-serialized"]: + count = dtype_header["frame_count"] + kwargs["label_dtype"] = cls.device_deserialize( + dtype_header, frames[:count] + ) + frames = frames[count:] + else: + kwargs["label_dtype"] = ( + np.dtype(dtype_header) if dtype_header is not None else None + ) + + columns = deserialize_columns(header["columns"], frames) for metadata in [ "rangeindex", "multiindex", - "label_dtype", "level_names", ]: key = f"column_{metadata}" if key in header: - kwargs[metadata] = pickle.loads(header[key]) + kwargs[metadata] = header[key] + + column_names = [ + getattr(np, cntype)(cname) if cntype != "" else cname + for cname, cntype in zip( + header["column_names"], header["column_names_numpy_type"] + ) + ] col_accessor = ColumnAccessor( data=dict(zip(column_names, columns)), **kwargs ) - return cls_deserialize._from_data(col_accessor) + return cls._from_data(col_accessor) @classmethod @_performance_tracking @@ -788,25 +834,28 @@ def _quantile_table( column_order=(), null_precedence=(), ): - interpolation = libcudf.types.Interpolation[interpolation] + interpolation = plc.types.Interpolation[interpolation] - is_sorted = libcudf.types.Sorted["YES" if is_sorted else "NO"] + is_sorted = plc.types.Sorted["YES" if is_sorted else "NO"] - column_order = [libcudf.types.Order[key] for key in column_order] + column_order = [plc.types.Order[key] for key in column_order] - null_precedence = [ - libcudf.types.NullOrder[key] for key in null_precedence - ] + null_precedence = [plc.types.NullOrder[key] for key in null_precedence] - return self._from_columns_like_self( - libcudf.quantiles.quantile_table( - [*self._columns], + with acquire_spill_lock(): + plc_table = plc.quantiles.quantiles( + plc.Table( 
[c.to_pylibcudf(mode="read") for c in self._columns] + ), q, interpolation, is_sorted, column_order, null_precedence, - ), + ) + columns = libcudf.utils.columns_from_pylibcudf_table(plc_table) + return self._from_columns_like_self( + columns, column_names=self._column_names, ) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index cb8cd0cd28b..a7ced1b833a 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -3,12 +3,11 @@ import copy import itertools -import pickle import textwrap import warnings from collections import abc from functools import cached_property -from typing import TYPE_CHECKING, Any, Iterable, Literal +from typing import TYPE_CHECKING, Any, Literal import cupy as cp import numpy as np @@ -27,6 +26,7 @@ from cudf.core.abc import Serializable from cudf.core.column.column import ColumnBase, StructDtype, as_column from cudf.core.column_accessor import ColumnAccessor +from cudf.core.copy_types import GatherMap from cudf.core.join._join_helpers import _match_join_keys from cudf.core.mixins import Reducible, Scannable from cudf.core.multiindex import MultiIndex @@ -35,6 +35,8 @@ from cudf.utils.utils import GetAttrGetItemMixin if TYPE_CHECKING: + from collections.abc import Iterable + from cudf._typing import ( AggType, DataFrameOrSeries, @@ -478,6 +480,11 @@ def get_group(self, name, obj=None): "instead of ``gb.get_group(name, obj=df)``.", FutureWarning, ) + if is_list_like(self._by): + if isinstance(name, tuple) and len(name) == 1: + name = name[0] + else: + raise KeyError(name) return obj.iloc[self.indices[name]] @_performance_tracking @@ -754,17 +761,33 @@ def agg(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs): left_cols = list(self.grouping.keys.drop_duplicates()._columns) right_cols = list(result_index._columns) join_keys = [ - _match_join_keys(lcol, rcol, "left") + _match_join_keys(lcol, rcol, "inner") for lcol, rcol in zip(left_cols, right_cols) ] # TODO: In future, see if we can centralize # logic else where that has similar patterns. join_keys = map(list, zip(*join_keys)) - _, indices = libcudf.join.join( - *join_keys, - how="left", + # By construction, left and right keys are related by + # a permutation, so we can use an inner join. + left_order, right_order = libcudf.join.join( + *join_keys, how="inner" + ) + # left order is some permutation of the ordering we + # want, and right order is a matching gather map for + # the result table. Get the correct order by sorting + # the right gather map. 
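+ # A stable sort is unnecessary here: `left_order` is a
+ # permutation, so every sort key is unique.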
+ (right_order,) = libcudf.sort.sort_by_key( + [right_order], + [left_order], + [True], + ["first"], + stable=False, + ) + result = result._gather( + GatherMap.from_column_unchecked( + right_order, len(result), nullify=False + ) ) - result = result.take(indices) if not self._as_index: result = result.reset_index() @@ -1241,7 +1264,7 @@ def serialize(self): obj_header, obj_frames = self.obj.serialize() header["obj"] = obj_header - header["obj_type"] = pickle.dumps(type(self.obj)) + header["obj_type_name"] = type(self.obj).__name__ header["num_obj_frames"] = len(obj_frames) frames.extend(obj_frames) @@ -1256,7 +1279,7 @@ def serialize(self): def deserialize(cls, header, frames): kwargs = header["kwargs"] - obj_type = pickle.loads(header["obj_type"]) + obj_type = Serializable._name_type_map[header["obj_type_name"]] obj = obj_type.deserialize( header["obj"], frames[: header["num_obj_frames"]] ) @@ -2232,6 +2255,22 @@ def func(x): return self.agg(func) + @_performance_tracking + def nunique(self, dropna: bool = True): + """ + Return number of unique elements in the group. + + Parameters + ---------- + dropna : bool, default True + Don't include NaN in the counts. + """ + + def func(x): + return getattr(x, "nunique")(dropna=dropna) + + return self.agg(func) + @_performance_tracking def std( self, @@ -3264,8 +3303,8 @@ def _handle_misc(self, by): def serialize(self): header = {} frames = [] - header["names"] = pickle.dumps(self.names) - header["_named_columns"] = pickle.dumps(self._named_columns) + header["names"] = self.names + header["_named_columns"] = self._named_columns column_header, column_frames = cudf.core.column.serialize_columns( self._key_columns ) @@ -3275,8 +3314,8 @@ def serialize(self): @classmethod def deserialize(cls, header, frames): - names = pickle.loads(header["names"]) - _named_columns = pickle.loads(header["_named_columns"]) + names = header["names"] + _named_columns = header["_named_columns"] key_columns = cudf.core.column.deserialize_columns( header["columns"], frames ) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index cd07c58c5d9..244bd877c1a 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -3,12 +3,11 @@ from __future__ import annotations import operator -import pickle import warnings -from collections.abc import Hashable +from collections.abc import Hashable, MutableMapping from functools import cache, cached_property from numbers import Number -from typing import TYPE_CHECKING, Any, Literal, MutableMapping, cast +from typing import TYPE_CHECKING, Any, Literal, cast import cupy import numpy as np @@ -495,9 +494,8 @@ def serialize(self): header["index_column"]["step"] = self.step frames = [] - header["name"] = pickle.dumps(self.name) - header["dtype"] = pickle.dumps(self.dtype) - header["type-serialized"] = pickle.dumps(type(self)) + header["name"] = self.name + header["dtype"] = self.dtype.str header["frame_count"] = 0 return header, frames @@ -505,11 +503,14 @@ def serialize(self): @_performance_tracking def deserialize(cls, header, frames): h = header["index_column"] - name = pickle.loads(header["name"]) + name = header["name"] start = h["start"] stop = h["stop"] step = h.get("step", 1) - return RangeIndex(start=start, stop=stop, step=step, name=name) + dtype = np.dtype(header["dtype"]) + return RangeIndex( + start=start, stop=stop, step=step, dtype=dtype, name=name + ) @property # type: ignore @_performance_tracking diff --git a/python/cudf/cudf/core/indexed_frame.py 
b/python/cudf/cudf/core/indexed_frame.py index 5952815deef..9130779c3e9 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -10,9 +10,7 @@ from typing import ( TYPE_CHECKING, Any, - Callable, Literal, - MutableMapping, TypeVar, cast, ) @@ -23,7 +21,7 @@ import pandas as pd from typing_extensions import Self -import pylibcudf +import pylibcudf as plc import cudf import cudf._lib as libcudf @@ -63,6 +61,8 @@ from cudf.utils.utils import _warn_no_dask_cudf if TYPE_CHECKING: + from collections.abc import Callable, MutableMapping + from cudf._typing import ( ColumnLike, DataFrameOrSeries, @@ -2817,7 +2817,20 @@ def memory_usage(self, index=True, deep=False): """ raise NotImplementedError - def hash_values(self, method="murmur3", seed=None): + def hash_values( + self, + method: Literal[ + "murmur3", + "xxhash64", + "md5", + "sha1", + "sha224", + "sha256", + "sha384", + "sha512", + ] = "murmur3", + seed: int | None = None, + ) -> cudf.Series: """Compute the hash of values in this column. Parameters @@ -2894,11 +2907,31 @@ def hash_values(self, method="murmur3", seed=None): "Provided seed value has no effect for the hash method " f"`{method}`. Only {seed_hash_methods} support seeds." ) - # Note that both Series and DataFrame return Series objects from this - # calculation, necessitating the unfortunate circular reference to the - # child class here. + with acquire_spill_lock(): + plc_table = plc.Table( + [c.to_pylibcudf(mode="read") for c in self._columns] + ) + if method == "murmur3": + plc_column = plc.hashing.murmurhash3_x86_32(plc_table, seed) + elif method == "xxhash64": + plc_column = plc.hashing.xxhash_64(plc_table, seed) + elif method == "md5": + plc_column = plc.hashing.md5(plc_table) + elif method == "sha1": + plc_column = plc.hashing.sha1(plc_table) + elif method == "sha224": + plc_column = plc.hashing.sha224(plc_table) + elif method == "sha256": + plc_column = plc.hashing.sha256(plc_table) + elif method == "sha384": + plc_column = plc.hashing.sha384(plc_table) + elif method == "sha512": + plc_column = plc.hashing.sha512(plc_table) + else: + raise ValueError(f"Unsupported hashing algorithm {method}.") + result = libcudf.column.Column.from_pylibcudf(plc_column) return cudf.Series._from_column( - libcudf.hash.hash([*self._columns], method, seed), + result, index=self.index, ) @@ -6270,7 +6303,7 @@ def rank( if method not in {"average", "min", "max", "first", "dense"}: raise KeyError(method) - method_enum = pylibcudf.aggregation.RankMethod[method.upper()] + method_enum = plc.aggregation.RankMethod[method.upper()] if na_option not in {"keep", "top", "bottom"}: raise ValueError( "na_option must be one of 'keep', 'top', or 'bottom'" diff --git a/python/cudf/cudf/core/indexing_utils.py b/python/cudf/cudf/core/indexing_utils.py index 8182e5cede2..ce6a5c960dd 100644 --- a/python/cudf/cudf/core/indexing_utils.py +++ b/python/cudf/cudf/core/indexing_utils.py @@ -3,9 +3,7 @@ from __future__ import annotations from dataclasses import dataclass -from typing import Any, List, Union - -from typing_extensions import TypeAlias +from typing import Any, TypeAlias import cudf from cudf.api.types import _is_scalar_or_zero_d_array, is_integer @@ -46,11 +44,11 @@ class ScalarIndexer: key: GatherMap -IndexingSpec: TypeAlias = Union[ - EmptyIndexer, MapIndexer, MaskIndexer, ScalarIndexer, SliceIndexer -] +IndexingSpec: TypeAlias = ( + EmptyIndexer | MapIndexer | MaskIndexer | ScalarIndexer | SliceIndexer +) -ColumnLabels: TypeAlias = List[str] +ColumnLabels: 
TypeAlias = list[str] def destructure_iloc_key( diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 6de3981ba66..a878b072860 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -5,10 +5,9 @@ import itertools import numbers import operator -import pickle import warnings from functools import cached_property -from typing import TYPE_CHECKING, Any, MutableMapping +from typing import TYPE_CHECKING, Any import cupy as cp import numpy as np @@ -36,7 +35,7 @@ from cudf.utils.utils import NotIterable, _external_only_api, _is_same_name if TYPE_CHECKING: - from collections.abc import Generator, Hashable + from collections.abc import Generator, Hashable, MutableMapping from typing_extensions import Self @@ -700,7 +699,10 @@ def _compute_validity_mask(self, index, row_tuple, max_length): lookup_dict[i] = row lookup = cudf.DataFrame(lookup_dict) frame = cudf.DataFrame._from_data( - ColumnAccessor(dict(enumerate(index._columns)), verify=False) + ColumnAccessor( + dict(enumerate(index._columns)), + verify=False, + ) ) with warnings.catch_warnings(): warnings.simplefilter("ignore", FutureWarning) @@ -780,18 +782,12 @@ def _index_and_downcast(self, result, index, index_key): index_key = index_key[0] slice_access = isinstance(index_key, slice) - out_index = cudf.DataFrame() - # Select the last n-k columns where n is the number of columns and k is + # Count the last n-k columns where n is the number of columns and k is # the length of the indexing tuple size = 0 if not isinstance(index_key, (numbers.Number, slice)): size = len(index_key) - for k in range(size, len(index._data)): - out_index.insert( - out_index._num_columns, - k, - cudf.Series._from_column(index._columns[k]), - ) + num_selected = max(0, index.nlevels - size) # determine if we should downcast from a DataFrame to a Series need_downcast = ( @@ -814,16 +810,13 @@ def _index_and_downcast(self, result, index, index_key): result = cudf.Series._from_data( {}, name=tuple(col[0] for col in index._columns) ) - elif out_index._num_columns == 1: + elif num_selected == 1: # If there's only one column remaining in the output index, convert # it into an Index and name the final index values according # to that column's name. - last_column = index._columns[-1] - out_index = cudf.Index._from_column( - last_column, name=index.names[-1] - ) - index = out_index - elif out_index._num_columns > 1: + *_, last_column = index._data.columns + index = cudf.Index._from_column(last_column, name=index.names[-1]) + elif num_selected > 1: # Otherwise pop the leftmost levels, names, and codes from the # source index until it has the correct number of columns (n-k) result.reset_index(drop=True) @@ -924,15 +917,15 @@ def take(self, indices) -> Self: def serialize(self): header, frames = super().serialize() # Overwrite the names in _data with the true names. - header["column_names"] = pickle.dumps(self.names) + header["column_names"] = self.names return header, frames @classmethod @_performance_tracking def deserialize(cls, header, frames): # Spoof the column names to construct the frame, then set manually. 
- column_names = pickle.loads(header["column_names"]) - header["column_names"] = pickle.dumps(range(0, len(column_names))) + column_names = header["column_names"] + header["column_names"] = range(0, len(column_names)) obj = super().deserialize(header, frames) return obj._set_names(column_names) diff --git a/python/cudf/cudf/core/resample.py b/python/cudf/cudf/core/resample.py index e0aee28bfeb..391ee31f125 100644 --- a/python/cudf/cudf/core/resample.py +++ b/python/cudf/cudf/core/resample.py @@ -15,16 +15,18 @@ # limitations under the License. from __future__ import annotations -import pickle import warnings from typing import TYPE_CHECKING import numpy as np import pandas as pd +import pylibcudf as plc + import cudf -import cudf._lib.labeling -import cudf.core.index +from cudf._lib.column import Column +from cudf.core.abc import Serializable +from cudf.core.buffer import acquire_spill_lock from cudf.core.groupby.groupby import ( DataFrameGroupBy, GroupBy, @@ -48,7 +50,7 @@ def agg(self, func, *args, engine=None, engine_kwargs=None, **kwargs): func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs ) if len(self.grouping.bin_labels) != len(result): - index = cudf.core.index.Index( + index = cudf.Index( self.grouping.bin_labels, name=self.grouping.names[0] ) return result._align_to_index( @@ -95,21 +97,21 @@ def serialize(self): header, frames = super().serialize() grouping_head, grouping_frames = self.grouping.serialize() header["grouping"] = grouping_head - header["resampler_type"] = pickle.dumps(type(self)) + header["resampler_type"] = type(self).__name__ header["grouping_frames_count"] = len(grouping_frames) frames.extend(grouping_frames) return header, frames @classmethod def deserialize(cls, header, frames): - obj_type = pickle.loads(header["obj_type"]) + obj_type = Serializable._name_type_map[header["obj_type_name"]] obj = obj_type.deserialize( header["obj"], frames[: header["num_obj_frames"]] ) grouping = _ResampleGrouping.deserialize( header["grouping"], frames[header["num_obj_frames"] :] ) - resampler_cls = pickle.loads(header["resampler_type"]) + resampler_cls = Serializable._name_type_map[header["resampler_type"]] out = resampler_cls.__new__(resampler_cls) out.grouping = grouping super().__init__(out, obj, by=grouping) @@ -125,7 +127,7 @@ class SeriesResampler(_Resampler, SeriesGroupBy): class _ResampleGrouping(_Grouping): - bin_labels: cudf.core.index.Index + bin_labels: cudf.Index def __init__(self, obj, by=None, level=None): self._freq = getattr(by, "freq", None) @@ -161,8 +163,8 @@ def serialize(self): @classmethod def deserialize(cls, header, frames): - names = pickle.loads(header["names"]) - _named_columns = pickle.loads(header["_named_columns"]) + names = header["names"] + _named_columns = header["_named_columns"] key_columns = cudf.core.column.deserialize_columns( header["columns"], frames[: -header["__bin_labels_count"]] ) @@ -170,7 +172,7 @@ def deserialize(cls, header, frames): out.names = names out._named_columns = _named_columns out._key_columns = key_columns - out.bin_labels = cudf.core.index.Index.deserialize( + out.bin_labels = cudf.Index.deserialize( header["__bin_labels"], frames[-header["__bin_labels_count"] :] ) out._freq = header["_freq"] @@ -268,13 +270,19 @@ def _handle_frequency_grouper(self, by): cast_bin_labels = bin_labels.astype(result_type) # bin the key column: - bin_numbers = cudf._lib.labeling.label_bins( - cast_key_column, - left_edges=cast_bin_labels[:-1]._column, - left_inclusive=(closed == "left"), - 
right_edges=cast_bin_labels[1:]._column, - right_inclusive=(closed == "right"), - ) + with acquire_spill_lock(): + plc_column = plc.labeling.label_bins( + cast_key_column.to_pylibcudf(mode="read"), + cast_bin_labels[:-1]._column.to_pylibcudf(mode="read"), + plc.labeling.Inclusive.YES + if closed == "left" + else plc.labeling.Inclusive.NO, + cast_bin_labels[1:]._column.to_pylibcudf(mode="read"), + plc.labeling.Inclusive.YES + if closed == "right" + else plc.labeling.Inclusive.NO, + ) + bin_numbers = Column.from_pylibcudf(plc_column) if label == "right": cast_bin_labels = cast_bin_labels[1:] diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index 401fef67ee6..016bd1225cd 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -681,7 +681,7 @@ def _tile(A, reps): nval = len(value_vars) dtype = min_unsigned_type(nval) - if not var_name: + if var_name is None: var_name = "variable" if not value_vars: @@ -961,14 +961,18 @@ def _merge_sorted( ) -def _pivot(df, index, columns): +def _pivot( + col_accessor: ColumnAccessor, + index: cudf.Index | cudf.MultiIndex, + columns: cudf.Index | cudf.MultiIndex, +) -> cudf.DataFrame: """ Reorganize the values of the DataFrame according to the given index and columns. Parameters ---------- - df : DataFrame + col_accessor : ColumnAccessor index : cudf.Index Index labels of the result columns : cudf.Index @@ -985,7 +989,7 @@ def as_tuple(x): return x if isinstance(x, tuple) else (x,) nrows = len(index_labels) - for col_label, col in df._column_labels_and_values: + for col_label, col in col_accessor.items(): names = [ as_tuple(col_label) + as_tuple(name) for name in column_labels ] @@ -1012,12 +1016,12 @@ def as_tuple(x): level_names=(None,) + columns._column_names, verify=False, ) - return cudf.DataFrame._from_data( - ca, index=cudf.Index(index_labels, name=index.name) - ) + return cudf.DataFrame._from_data(ca, index=index_labels) -def pivot(data, columns=None, index=no_default, values=no_default): +def pivot( + data: cudf.DataFrame, columns=None, index=no_default, values=no_default +) -> cudf.DataFrame: """ Return reshaped DataFrame organized by the given index and column values. @@ -1027,10 +1031,10 @@ def pivot(data, columns=None, index=no_default, values=no_default): Parameters ---------- - columns : column name, optional - Column used to construct the columns of the result. - index : column name, optional - Column used to construct the index of the result. + columns : scalar or list of scalars, optional + Column label(s) used to construct the columns of the result. + index : scalar or list of scalars, optional + Column label(s) used to construct the index of the result. values : column name or list of column names, optional Column(s) whose values are rearranged to produce the result.
If not specified, all remaining columns of the DataFrame @@ -1067,27 +1071,48 @@ def pivot(data, columns=None, index=no_default, values=no_default): 2 three """ - df = data values_is_list = True if values is no_default: - values = df._columns_view( - col for col in df._column_names if col not in (index, columns) + already_selected = set( + itertools.chain( + [index] if is_scalar(index) else index, + [columns] if is_scalar(columns) else columns, + ) ) + cols_to_select = [ + col for col in data._column_names if col not in already_selected + ] + elif not isinstance(values, (list, tuple)): + cols_to_select = [values] + values_is_list = False else: - if not isinstance(values, (list, tuple)): - values = [values] - values_is_list = False - values = df._columns_view(values) + cols_to_select = values # type: ignore[assignment] if index is no_default: - index = df.index + index_data = data.index else: - index = cudf.Index(df.loc[:, index]) - columns = cudf.Index(df.loc[:, columns]) + index_data = data.loc[:, index] + if index_data.ndim == 2: + index_data = cudf.MultiIndex.from_frame(index_data) + if not is_scalar(index) and len(index) == 1: + # pandas converts single level MultiIndex to Index + index_data = index_data.get_level_values(0) + else: + index_data = cudf.Index(index_data) + + column_data = data.loc[:, columns] + if column_data.ndim == 2: + column_data = cudf.MultiIndex.from_frame(column_data) + else: + column_data = cudf.Index(column_data) # Create a DataFrame composed of columns from both # columns and index ca = ColumnAccessor( - dict(enumerate(itertools.chain(index._columns, columns._columns))), + dict( + enumerate( + itertools.chain(index_data._columns, column_data._columns) + ) + ), verify=False, ) columns_index = cudf.DataFrame._from_data(ca) @@ -1096,7 +1121,9 @@ def pivot(data, columns=None, index=no_default, values=no_default): if len(columns_index) != len(columns_index.drop_duplicates()): raise ValueError("Duplicate index-column pairs found. Cannot reshape.") - result = _pivot(values, index, columns) + result = _pivot( + data._data.select_by_label(cols_to_select), index_data, column_data + ) # MultiIndex to Index if not values_is_list: diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index acd97c2047c..778db5973bf 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -4,12 +4,11 @@ import functools import inspect -import pickle import textwrap import warnings from collections import abc from shutil import get_terminal_size -from typing import TYPE_CHECKING, Any, Literal, MutableMapping +from typing import TYPE_CHECKING, Any, Literal import cupy import numpy as np @@ -28,7 +27,6 @@ ) from cudf.core import indexing_utils from cudf.core._compat import PANDAS_LT_300 -from cudf.core.abc import Serializable from cudf.core.buffer import acquire_spill_lock from cudf.core.column import ( ColumnBase, @@ -71,6 +69,8 @@ from cudf.utils.performance_tracking import _performance_tracking if TYPE_CHECKING: + from collections.abc import MutableMapping + import pyarrow as pa from cudf._typing import ( @@ -413,7 +413,7 @@ def _loc_to_iloc(self, arg): return indices -class Series(SingleColumnFrame, IndexedFrame, Serializable): +class Series(SingleColumnFrame, IndexedFrame): """ One-dimensional GPU array (including time series). 
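The reworked pivot above now accepts lists of labels for index and columns, building a MultiIndex from multiple key columns the way pandas does. A hypothetical illustration of that list support (GPU environment assumed; behavior is expected to mirror pandas):

```python
import cudf

df = cudf.DataFrame(
    {
        "y1": [0, 0, 1, 1],
        "y2": ["a", "b", "a", "b"],
        "c": ["x", "x", "y", "y"],
        "v": [1, 2, 3, 4],
    }
)
# Two index labels produce a true two-level MultiIndex in the result.
out = df.pivot(index=["y1", "y2"], columns="c", values="v")
print(out.index.nlevels)  # 2
```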
@@ -637,10 +637,15 @@ def __init__( column = as_column(data, nan_as_null=nan_as_null, dtype=dtype) if isinstance(data, (pd.Series, Series)): index_from_data = ensure_index(data.index) - elif isinstance(data, (ColumnAccessor, ColumnBase)): + elif isinstance(data, ColumnAccessor): raise TypeError( "Use cudf.Series._from_data for constructing a Series from " - "ColumnAccessor or a ColumnBase" + "ColumnAccessor" + ) + elif isinstance(data, ColumnBase): + raise TypeError( + "Use cudf.Series._from_column for constructing a Series from " + "a ColumnBase" ) elif isinstance(data, dict): if not data: @@ -893,7 +898,7 @@ def hasnans(self): def serialize(self): header, frames = super().serialize() - header["index"], index_frames = self.index.serialize() + header["index"], index_frames = self.index.device_serialize() header["index_frame_count"] = len(index_frames) # For backwards compatibility with older versions of cuDF, index # columns are placed before data columns. @@ -909,8 +914,7 @@ def deserialize(cls, header, frames): header, frames[header["index_frame_count"] :] ) - idx_typ = pickle.loads(header["index"]["type-serialized"]) - index = idx_typ.deserialize(header["index"], frames[:index_nframes]) + index = cls.device_deserialize(header["index"], frames[:index_nframes]) obj.index = index return obj @@ -2943,7 +2947,7 @@ def corr(self, other, method="pearson", min_periods=None): >>> ser1 = cudf.Series([0.9, 0.13, 0.62]) >>> ser2 = cudf.Series([0.12, 0.26, 0.51]) >>> ser1.corr(ser2, method="pearson") - -0.20454263717316112 + -0.20454263717316126 >>> ser1.corr(ser2, method="spearman") -0.5 """ diff --git a/python/cudf/cudf/core/subword_tokenizer.py b/python/cudf/cudf/core/subword_tokenizer.py index 9e59b134b73..dda1f199078 100644 --- a/python/cudf/cudf/core/subword_tokenizer.py +++ b/python/cudf/cudf/core/subword_tokenizer.py @@ -6,8 +6,9 @@ import cupy as cp +import pylibcudf as plc + from cudf._lib.nvtext.subword_tokenize import ( - Hashed_Vocabulary as cpp_hashed_vocabulary, subword_tokenize_inmem_hash as cpp_subword_tokenize, ) @@ -50,7 +51,9 @@ class SubwordTokenizer: def __init__(self, hash_file: str, do_lower_case: bool = True): self.do_lower_case = do_lower_case - self.vocab_file = cpp_hashed_vocabulary(hash_file) + self.vocab_file = plc.nvtext.subword_tokenize.HashedVocabulary( + hash_file + ) def __call__( self, diff --git a/python/cudf/cudf/core/tokenize_vocabulary.py b/python/cudf/cudf/core/tokenize_vocabulary.py index 99d85c0c5c0..1e31376cce8 100644 --- a/python/cudf/cudf/core/tokenize_vocabulary.py +++ b/python/cudf/cudf/core/tokenize_vocabulary.py @@ -2,9 +2,10 @@ from __future__ import annotations +import pylibcudf as plc + import cudf from cudf._lib.nvtext.tokenize import ( - TokenizeVocabulary as cpp_tokenize_vocabulary, tokenize_with_vocabulary as cpp_tokenize_with_vocabulary, ) @@ -20,7 +21,9 @@ class TokenizeVocabulary: """ def __init__(self, vocabulary: "cudf.Series"): - self.vocabulary = cpp_tokenize_vocabulary(vocabulary._column) + self.vocabulary = plc.nvtext.tokenize.TokenizeVocabulary( + vocabulary._column.to_pylibcudf(mode="read") + ) def tokenize( self, text, delimiter: str = "", default_id: int = -1 diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index 68f34fa28ff..885e7b16644 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -4,7 +4,7 @@ import math import re import warnings -from typing import Literal, Sequence +from typing import TYPE_CHECKING, Literal import numpy 
as np import pandas as pd @@ -20,6 +20,9 @@ from cudf.core import column from cudf.core.index import ensure_index +if TYPE_CHECKING: + from collections.abc import Sequence + # https://github.com/pandas-dev/pandas/blob/2.2.x/pandas/core/tools/datetimes.py#L1112 _unit_map = { "year": "year", diff --git a/python/cudf/cudf/core/tools/numeric.py b/python/cudf/cudf/core/tools/numeric.py index 6cecf3fa170..9a22045ff78 100644 --- a/python/cudf/cudf/core/tools/numeric.py +++ b/python/cudf/cudf/core/tools/numeric.py @@ -11,6 +11,7 @@ from cudf import _lib as libcudf from cudf._lib import strings as libstrings from cudf.api.types import _is_non_decimal_numeric_dtype, is_string_dtype +from cudf.core._internals import unary from cudf.core.column import as_column from cudf.core.dtypes import CategoricalDtype from cudf.core.index import ensure_index @@ -171,7 +172,7 @@ def to_numeric(arg, errors="raise", downcast=None, dtype_backend=None): downcast_dtype = cudf.dtype(t) if downcast_dtype.itemsize <= col.dtype.itemsize: if col.can_cast_safely(downcast_dtype): - col = libcudf.unary.cast(col, downcast_dtype) + col = unary.cast(col, downcast_dtype) break if isinstance(arg, (cudf.Series, pd.Series)): diff --git a/python/cudf/cudf/core/udf/strings_typing.py b/python/cudf/cudf/core/udf/strings_typing.py index 43604ab21a7..a0cbe7ada19 100644 --- a/python/cudf/cudf/core/udf/strings_typing.py +++ b/python/cudf/cudf/core/udf/strings_typing.py @@ -99,7 +99,7 @@ def prepare_args(self, ty, val, **kwargs): ty.dtype, (StringView, UDFString) ): return types.uint64, val.ptr if isinstance( - val, rmm._lib.device_buffer.DeviceBuffer + val, rmm.pylibrmm.device_buffer.DeviceBuffer ) else val.get_ptr(mode="read") else: return ty, val diff --git a/python/cudf/cudf/core/window/ewm.py b/python/cudf/cudf/core/window/ewm.py index ef0f6958aeb..094df955273 100644 --- a/python/cudf/cudf/core/window/ewm.py +++ b/python/cudf/cudf/core/window/ewm.py @@ -2,7 +2,7 @@ from __future__ import annotations import warnings -from typing import Literal +from typing import TYPE_CHECKING, Literal import numpy as np @@ -10,6 +10,9 @@ from cudf.api.types import is_numeric_dtype from cudf.core.window.rolling import _RollingBase +if TYPE_CHECKING: + from cudf.core.column.column import ColumnBase + class ExponentialMovingWindow(_RollingBase): r""" @@ -179,8 +182,10 @@ def cov( ): raise NotImplementedError("cov not yet supported.") - def _apply_agg_series(self, sr, agg_name): - if not is_numeric_dtype(sr.dtype): + def _apply_agg_column( + self, source_column: ColumnBase, agg_name: str + ) -> ColumnBase: + if not is_numeric_dtype(source_column.dtype): raise TypeError("No numeric types to aggregate") # libcudf ewm has special casing for nulls only @@ -188,20 +193,14 @@ def _apply_agg_series(self, sr, agg_name): # pandas does nans in the same positions mathematically. # as such we need to convert the nans to nulls before # passing them in. 
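That comment is the crux of the ewm refactor above: libcudf's ewm only special-cases nulls, while pandas treats NaNs as missing, so NaNs are converted to nulls before the scan. A small sketch of the conversion (GPU environment assumed):

```python
import cudf

s = cudf.Series([1.0, float("nan"), 3.0], nan_as_null=False)
print(s.isnull().sum())                  # 0: NaN is a value, not a null
print(s.nans_to_nulls().isnull().sum())  # 1: nulls now match pandas' NaNs
```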
- to_libcudf_column = sr._column.astype("float64").nans_to_nulls() - - return self.obj._from_data_like_self( - self.obj._data._from_columns_like_self( - [ - scan( - agg_name, - to_libcudf_column, - True, - com=self.com, - adjust=self.adjust, - ) - ] - ) + to_libcudf_column = source_column.astype("float64").nans_to_nulls() + + return scan( + agg_name, + to_libcudf_column, + True, + com=self.com, + adjust=self.adjust, ) diff --git a/python/cudf/cudf/core/window/rolling.py b/python/cudf/cudf/core/window/rolling.py index 043a41145e5..967edc2ab15 100644 --- a/python/cudf/cudf/core/window/rolling.py +++ b/python/cudf/cudf/core/window/rolling.py @@ -2,6 +2,7 @@ from __future__ import annotations import warnings +from typing import TYPE_CHECKING import numba import pandas as pd @@ -16,25 +17,29 @@ from cudf.utils import cudautils from cudf.utils.utils import GetAttrGetItemMixin +if TYPE_CHECKING: + from cudf.core.column.column import ColumnBase + class _RollingBase: """ - Contains methods common to all kinds of rolling + Contains routines to apply a window aggregation to a column. """ - def _apply_agg_dataframe(self, df, agg_name): - result_df = cudf.DataFrame({}) - for i, col_name in enumerate(df.columns): - result_col = self._apply_agg_series(df[col_name], agg_name) - result_df.insert(i, col_name, result_col) - result_df.index = df.index - return result_df + obj: cudf.DataFrame | cudf.Series - def _apply_agg(self, agg_name): - if isinstance(self.obj, cudf.Series): - return self._apply_agg_series(self.obj, agg_name) - else: - return self._apply_agg_dataframe(self.obj, agg_name) + def _apply_agg_column( + self, source_column: ColumnBase, agg_name: str + ) -> ColumnBase: + raise NotImplementedError + + def _apply_agg(self, agg_name: str) -> cudf.DataFrame | cudf.Series: + applied = ( + self._apply_agg_column(col, agg_name) for col in self.obj._columns + ) + return self.obj._from_data_like_self( + self.obj._data._from_columns_like_self(applied) + ) class Rolling(GetAttrGetItemMixin, _RollingBase, Reducible): @@ -290,14 +295,6 @@ def _apply_agg_column(self, source_column, agg_name): agg_params=self.agg_params, ) - def _apply_agg(self, agg_name): - applied = ( - self._apply_agg_column(col, agg_name) for col in self.obj._columns - ) - return self.obj._from_data_like_self( - self.obj._data._from_columns_like_self(applied) - ) - def _reduce( self, op: str, diff --git a/python/cudf/cudf/io/avro.py b/python/cudf/cudf/io/avro.py index 964bd02b03e..11730e98c95 100644 --- a/python/cudf/cudf/io/avro.py +++ b/python/cudf/cudf/io/avro.py @@ -1,7 +1,9 @@ # Copyright (c) 2019-2024, NVIDIA CORPORATION. 
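The rolling.py hunks just above hoist the per-column dispatch into _RollingBase: one generic _apply_agg feeds every column through the subclass's _apply_agg_column and reassembles a frame shaped like the input. Nothing changes at the public API; a quick sanity sketch (GPU environment assumed):

```python
import cudf

df = cudf.DataFrame({"a": [1, 2, 3, 4], "b": [4.0, 3.0, 2.0, 1.0]})
# Each column is aggregated independently via _apply_agg_column, then the
# results are stitched back into a DataFrame with the original index.
print(df.rolling(window=2).sum())
```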
+import pylibcudf as plc + import cudf -from cudf import _lib as libcudf +from cudf._lib.utils import data_from_pylibcudf_io from cudf.utils import ioutils @@ -23,8 +25,19 @@ def read_avro( filepath_or_buffer, "read_avro" ) - return cudf.DataFrame._from_data( - *libcudf.avro.read_avro( - filepath_or_buffer, columns, skiprows, num_rows - ) + num_rows = -1 if num_rows is None else num_rows + skip_rows = 0 if skiprows is None else skiprows + + if not isinstance(num_rows, int) or num_rows < -1: + raise TypeError("num_rows must be an int >= -1") + if not isinstance(skip_rows, int) or skip_rows < 0: + raise TypeError("skip_rows must be an int >= 0") + + plc_result = plc.io.avro.read_avro( + plc.io.types.SourceInfo([filepath_or_buffer]), + columns, + skip_rows, + num_rows, ) + + return cudf.DataFrame._from_data(*data_from_pylibcudf_io(plc_result)) diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index ce99f98b559..750c6cec180 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -368,6 +368,14 @@ def _process_dataset( file_list = paths if len(paths) == 1 and ioutils.is_directory(paths[0]): paths = ioutils.stringify_pathlike(paths[0]) + elif ( + filters is None + and isinstance(dataset_kwargs, dict) + and dataset_kwargs.get("partitioning") is None + ): + # Skip dataset processing if we have no filters + # or hive/directory partitioning to deal with. + return paths, row_groups, [], {} # Convert filters to ds.Expression if filters is not None: diff --git a/python/cudf/cudf/options.py b/python/cudf/cudf/options.py index df7bbe22a61..e206c8bca08 100644 --- a/python/cudf/cudf/options.py +++ b/python/cudf/cudf/options.py @@ -351,6 +351,22 @@ def _integer_and_none_validator(val): _make_contains_validator([False, True]), ) +_register_option( + "kvikio_remote_io", + _env_get_bool("CUDF_KVIKIO_REMOTE_IO", False), + textwrap.dedent( + """ + Whether to use KvikIO's remote IO backend or not. + \tWARN: this is experimental and may be removed at any time + \twithout warning or deprecation period. + \tSet KVIKIO_NTHREADS (default is 8) to change the number of + \tconcurrent TCP connections, which is important for good performance. + \tValid values are True or False. Default is False. + """ + ), + _make_contains_validator([False, True]), +) + class option_context(ContextDecorator): """ diff --git a/python/cudf/cudf/pandas/_wrappers/common.py b/python/cudf/cudf/pandas/_wrappers/common.py index 66a51a83896..b801654068e 100644 --- a/python/cudf/cudf/pandas/_wrappers/common.py +++ b/python/cudf/cudf/pandas/_wrappers/common.py @@ -52,4 +52,13 @@ def array_interface(self: _FastSlowProxy): def custom_iter(self: _FastSlowProxy): - return iter(self._fsproxy_slow) + """ + Custom iter method to handle the case where only the slow + object's iter method is used. + """ + # NOTE: Do not remove this method. It is required so that iteration + # always uses the slow object's iterator instead of attempting the + # fast (GPU) path first.
+ return _maybe_wrap_result( + iter(self._fsproxy_slow), + None, # type: ignore + ) diff --git a/python/cudf/cudf/pandas/_wrappers/pandas.py b/python/cudf/cudf/pandas/_wrappers/pandas.py index 6d03063fa27..05e7d159c63 100644 --- a/python/cudf/cudf/pandas/_wrappers/pandas.py +++ b/python/cudf/cudf/pandas/_wrappers/pandas.py @@ -75,13 +75,27 @@ def _pandas_util_dir(): # In pandas 2.0, pandas.util contains public APIs under # __getattr__ but no __dir__ to find them # https://github.com/pandas-dev/pandas/blob/2.2.x/pandas/util/__init__.py - return list(importlib.import_module("pandas.util").__dict__.keys()) + [ - "hash_array", - "hash_pandas_object", - "Appender", - "Substitution", - "cache_readonly", - ] + res = list( + set( + list(importlib.import_module("pandas.util").__dict__.keys()) + + [ + "Appender", + "Substitution", + "_exceptions", + "_print_versions", + "cache_readonly", + "hash_array", + "hash_pandas_object", + "version", + "_tester", + "_validators", + "_decorators", + ] + ) + ) + if cudf.core._compat.PANDAS_GE_220: + res.append("capitalize_first_letter") + return res pd.util.__dir__ = _pandas_util_dir diff --git a/python/cudf/cudf/pandas/fast_slow_proxy.py b/python/cudf/cudf/pandas/fast_slow_proxy.py index 0c1cda8810b..40893ee2614 100644 --- a/python/cudf/cudf/pandas/fast_slow_proxy.py +++ b/python/cudf/cudf/pandas/fast_slow_proxy.py @@ -10,12 +10,14 @@ import pickle import types import warnings -from collections.abc import Callable, Iterator +from collections.abc import Callable, Iterator, Mapping from enum import IntEnum -from typing import Any, Literal, Mapping +from typing import Any, Literal import numpy as np +from rmm import RMMError + from ..options import _env_get_bool from ..testing import assert_eq from .annotation import nvtx @@ -33,6 +35,20 @@ def call_operator(fn, args, kwargs): "EXECUTE_SLOW": 0x0571B0, } +# This is a dict of functions that are known to have arguments that +# need to be transformed from fast to slow only, i.e., cudf functions +# that error on a device object but accept the equivalent host object. +# For example: DataFrame.__setitem__(arg, value) errors on passing a +# cudf.Index object but doesn't error on passing a pd.Index object. +# Hence we transform only that arg from fast to slow, using a +# dictionary like the one defined just below: +# {"DataFrame.__setitem__": {0}} +# where the keys are function names and the values are the 0-based +# indices of the arguments that need to be transformed.
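A hypothetical, self-contained walk-through of the transformation that comment describes, using the same map shape; the `i - 1` offset in the real code below accounts for `self` being the first recorded argument:

```python
special = {"DataFrame.__setitem__": {0}}  # 0-based, excluding self
indices = special["DataFrame.__setitem__"]

args = ("<self>", "<key>", "<value>")
transformed = tuple(
    f"slow({a})" if i - 1 in indices else f"default({a})"
    for i, a in enumerate(args)
)
print(transformed)  # ('default(<self>)', 'slow(<key>)', 'default(<value>)')
```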
+ +_SPECIAL_FUNCTIONS_ARGS_MAP = { + "DataFrame.__setitem__": {0}, +} _WRAPPER_ASSIGNMENTS = tuple( attr @@ -875,18 +891,62 @@ def __name__(self, value): pass setattr(self._fsproxy_slow, "__name__", value) + @property + def _customqualname(self): + return self._fsproxy_slow.__qualname__ + def _assert_fast_slow_eq(left, right): if _is_final_type(type(left)) or type(left) in NUMPY_TYPES: assert_eq(left, right) -class ProxyFallbackError(Exception): - """Raised when fallback occurs""" +class FallbackError(Exception): + """Raised when a fallback to the slow path occurs""" + + pass + + +class OOMFallbackError(FallbackError): + """Raised when cuDF produces a MemoryError or an rmm.RMMError""" + + pass + + +class NotImplementedFallbackError(FallbackError): + """Raised when cuDF produces a NotImplementedError""" + + pass + + +class AttributeFallbackError(FallbackError): + """Raised when cuDF produces an AttributeError""" + + pass + + +class TypeFallbackError(FallbackError): + """Raised when cuDF produces a TypeError""" pass +def _raise_fallback_error(err, name): + """Raise the fallback error matching the original exception type.""" + err_message = ( + f"Falling back to the slow path. The exception was {err}. " + f"The function called was {name}." + ) + exception_map = { + (RMMError, MemoryError): OOMFallbackError, + NotImplementedError: NotImplementedFallbackError, + AttributeError: AttributeFallbackError, + TypeError: TypeFallbackError, + } + for err_type, fallback_err_type in exception_map.items(): + if isinstance(err, err_type): + raise fallback_err_type(err_message) from err + raise FallbackError(err_message) from err + + def _fast_function_call(): """ Placeholder fast function for pytest profiling purposes. @@ -963,16 +1023,14 @@ def _fast_slow_function_call( f"The exception was {e}." ) except Exception as err: - if _env_get_bool("CUDF_PANDAS_FAIL_ON_FALLBACK", False): - raise ProxyFallbackError( - f"The operation failed with cuDF, the reason was {type(err)}: {err}." - ) from err with nvtx.annotate( "EXECUTE_SLOW", color=_CUDF_PANDAS_NVTX_COLORS["EXECUTE_SLOW"], domain="cudf_pandas", ): slow_args, slow_kwargs = _slow_arg(args), _slow_arg(kwargs) + if _env_get_bool("CUDF_PANDAS_FAIL_ON_FALLBACK", False): + _raise_fallback_error(err, slow_args[0].__name__) if _env_get_bool("LOG_FAST_FALLBACK", False): from ._logger import log_fallback @@ -1011,7 +1069,36 @@ def _transform_arg( # use __reduce_ex__ instead... if type(arg) is tuple: # Must come first to avoid infinite recursion - return tuple(_transform_arg(a, attribute_name, seen) for a in arg) + if ( + len(arg) > 0 + and isinstance(arg[0], _MethodProxy) + and arg[0]._customqualname in _SPECIAL_FUNCTIONS_ARGS_MAP + ): + indices_map = _SPECIAL_FUNCTIONS_ARGS_MAP[ + arg[0]._customqualname + ] + method_proxy, original_args, original_kwargs = arg + + original_args = tuple( + _transform_arg(a, "_fsproxy_slow", seen) + if i - 1 in indices_map + else _transform_arg(a, attribute_name, seen) + for i, a in enumerate(original_args) + ) + original_kwargs = _transform_arg( + original_kwargs, attribute_name, seen + ) + return tuple( + ( + _transform_arg(method_proxy, attribute_name, seen), + original_args, + original_kwargs, + ) + ) + else: + return tuple( + _transform_arg(a, attribute_name, seen) for a in arg + ) elif hasattr(arg, "__getnewargs_ex__"): # Partial implementation of the pickle protocol to reconstruct # the object with transformed pieces @@ -1099,7 +1186,9 @@ def _maybe_wrap_result(result: Any, func: Callable, /, *args, **kwargs) -> Any: """ Wraps "result" in a fast-slow proxy if it is a "proxiable" object.
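A hypothetical sketch of how the new error hierarchy can be caught; when CUDF_PANDAS_FAIL_ON_FALLBACK=1 is set, _raise_fallback_error maps the original exception onto one of these classes at real fallback points (module path taken from this diff):

```python
from cudf.pandas.fast_slow_proxy import (
    FallbackError,
    NotImplementedFallbackError,
    OOMFallbackError,
)

# Every concrete error is a FallbackError, so callers can catch broadly
# or narrowly depending on how much they care about the cause.
assert issubclass(OOMFallbackError, FallbackError)
try:
    raise NotImplementedFallbackError("simulated blocked fallback")
except FallbackError as err:
    print(type(err).__name__)  # NotImplementedFallbackError
```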
""" - if _is_final_type(result): + if isinstance(result, (int, str, float, bool, type(None))): + return result + elif _is_final_type(result): typ = get_final_type_map()[type(result)] return typ._fsproxy_wrap(result, func) elif _is_intermediate_type(result): diff --git a/python/cudf/cudf/pandas/module_accelerator.py b/python/cudf/cudf/pandas/module_accelerator.py index f82e300e83d..38103a71908 100644 --- a/python/cudf/cudf/pandas/module_accelerator.py +++ b/python/cudf/cudf/pandas/module_accelerator.py @@ -17,7 +17,7 @@ from abc import abstractmethod from importlib._bootstrap import _ImportLockContext as ImportLock from types import ModuleType -from typing import Any, ContextManager, NamedTuple +from typing import Any, ContextManager, NamedTuple # noqa: UP035 from typing_extensions import Self diff --git a/python/cudf/cudf/pandas/scripts/analyze-test-failures.py b/python/cudf/cudf/pandas/scripts/analyze-test-failures.py index 8870fbc5c28..bb2fc00d9fc 100644 --- a/python/cudf/cudf/pandas/scripts/analyze-test-failures.py +++ b/python/cudf/cudf/pandas/scripts/analyze-test-failures.py @@ -9,6 +9,7 @@ python analyze-test-failures.py Example: +------- python analyze-test-failures.py log.json frame/* """ diff --git a/python/cudf/cudf/pandas/scripts/conftest-patch.py b/python/cudf/cudf/pandas/scripts/conftest-patch.py index d12d2697729..59966a5ff0c 100644 --- a/python/cudf/cudf/pandas/scripts/conftest-patch.py +++ b/python/cudf/cudf/pandas/scripts/conftest-patch.py @@ -35,7 +35,7 @@ def null_assert_warnings(*args, **kwargs): @pytest.fixture(scope="session", autouse=True) # type: ignore def patch_testing_functions(): - tm.assert_produces_warning = null_assert_warnings + tm.assert_produces_warning = null_assert_warnings # noqa: F821 pytest.raises = replace_kwargs({"match": None})(pytest.raises) diff --git a/python/cudf/cudf/pandas/scripts/summarize-test-results.py b/python/cudf/cudf/pandas/scripts/summarize-test-results.py index 4ea0b3b4413..a0ad872e4c7 100644 --- a/python/cudf/cudf/pandas/scripts/summarize-test-results.py +++ b/python/cudf/cudf/pandas/scripts/summarize-test-results.py @@ -5,7 +5,8 @@ """ Summarizes the test results per module. 
-Examples: +Examples +-------- python summarize-test-results.py log.json python summarize-test-results.py log.json --output json python summarize-test-results.py log.json --output table diff --git a/python/cudf/cudf/testing/_utils.py b/python/cudf/cudf/testing/_utils.py index 8cb9efa873c..a5dc8a5498c 100644 --- a/python/cudf/cudf/testing/_utils.py +++ b/python/cudf/cudf/testing/_utils.py @@ -92,7 +92,8 @@ def random_bitmask(size): number of bits """ sz = bitmask_allocation_size_bytes(size) - data = np.random.randint(0, 255, dtype="u1", size=sz) + rng = np.random.default_rng(seed=0) + data = rng.integers(0, 255, dtype="u1", size=sz) return data.view("i1") @@ -209,9 +210,10 @@ def _get_args_kwars_for_assert_exceptions(func_args_and_kwargs): def gen_rand(dtype, size, **kwargs): + rng = np.random.default_rng(seed=kwargs.get("seed", 0)) dtype = cudf.dtype(dtype) if dtype.kind == "f": - res = np.random.random(size=size).astype(dtype) + res = rng.random(size=size).astype(dtype) if kwargs.get("positive_only", False): return res else: @@ -219,25 +221,23 @@ def gen_rand(dtype, size, **kwargs): elif dtype == np.int8 or dtype == np.int16: low = kwargs.get("low", -32) high = kwargs.get("high", 32) - return np.random.randint(low=low, high=high, size=size).astype(dtype) + return rng.integers(low=low, high=high, size=size).astype(dtype) elif dtype.kind == "i": low = kwargs.get("low", -10000) high = kwargs.get("high", 10000) - return np.random.randint(low=low, high=high, size=size).astype(dtype) + return rng.integers(low=low, high=high, size=size).astype(dtype) elif dtype == np.uint8 or dtype == np.uint16: low = kwargs.get("low", 0) high = kwargs.get("high", 32) - return np.random.randint(low=low, high=high, size=size).astype(dtype) + return rng.integers(low=low, high=high, size=size).astype(dtype) elif dtype.kind == "u": low = kwargs.get("low", 0) high = kwargs.get("high", 128) - return np.random.randint(low=low, high=high, size=size).astype(dtype) + return rng.integers(low=low, high=high, size=size).astype(dtype) elif dtype.kind == "b": low = kwargs.get("low", 0) high = kwargs.get("high", 2) - return np.random.randint(low=low, high=high, size=size).astype( - np.bool_ - ) + return rng.integers(low=low, high=high, size=size).astype(np.bool_) elif dtype.kind == "M": low = kwargs.get("low", 0) time_unit, _ = np.datetime_data(dtype) @@ -246,14 +246,14 @@ def gen_rand(dtype, size, **kwargs): int(1e18) / _unit_to_nanoseconds_conversion[time_unit], ) return pd.to_datetime( - np.random.randint(low=low, high=high, size=size), unit=time_unit + rng.integers(low=low, high=high, size=size), unit=time_unit ) elif dtype.kind in ("O", "U"): low = kwargs.get("low", 10) high = kwargs.get("high", 11) - nchars = np.random.randint(low=low, high=high, size=1)[0] + nchars = rng.integers(low=low, high=high, size=1)[0] char_options = np.array(list(string.ascii_letters + string.digits)) - all_chars = "".join(np.random.choice(char_options, nchars * size)) + all_chars = "".join(rng.choice(char_options, nchars * size)) return np.array( [all_chars[nchars * i : nchars * (i + 1)] for i in range(size)] ) diff --git a/python/cudf/cudf/testing/dataset_generator.py b/python/cudf/cudf/testing/dataset_generator.py index 13c194d6be0..99b686406fb 100644 --- a/python/cudf/cudf/testing/dataset_generator.py +++ b/python/cudf/cudf/testing/dataset_generator.py @@ -48,16 +48,22 @@ def __init__( self, cardinality=100, null_frequency=0.1, - generator=lambda: [ - _generate_string(string.ascii_letters, random.randint(4, 8)) - for _ in range(100) - ], + 
generator=None, is_sorted=True, dtype=None, ): self.cardinality = cardinality self.null_frequency = null_frequency - self.generator = generator + if generator is None: + rng = np.random.default_rng(seed=0) + self.generator = lambda: [ + _generate_string( + string.ascii_letters, rng, rng.integers(4, 8).item() + ) + for _ in range(100) + ] + else: + self.generator = generator self.is_sorted = is_sorted self.dtype = dtype @@ -96,7 +102,7 @@ def _write(tbl, path, format): tbl.to_parquet(path, row_group_size=format["row_group_size"]) -def _generate_column(column_params, num_rows): +def _generate_column(column_params, num_rows, rng): # If cardinality is specified, we create a set to sample from. # Otherwise, we simply use the given generator to generate each value. @@ -115,10 +121,8 @@ def _generate_column(column_params, num_rows): ) return pa.DictionaryArray.from_arrays( dictionary=vals, - indices=np.random.randint( - low=0, high=len(vals), size=num_rows - ), - mask=np.random.choice( + indices=rng.integers(low=0, high=len(vals), size=num_rows), + mask=rng.choice( [True, False], size=num_rows, p=[ @@ -142,7 +146,7 @@ def _generate_column(column_params, num_rows): column_params.generator, names=column_params.dtype.fields.keys(), mask=pa.array( - np.random.choice( + rng.choice( [True, False], size=num_rows, p=[ @@ -163,10 +167,10 @@ def _generate_column(column_params, num_rows): type=arrow_type, ) vals = pa.array( - np.random.choice(column_params.generator, size=num_rows) + rng.choice(column_params.generator, size=num_rows) if isinstance(arrow_type, pa.lib.Decimal128Type) - else np.random.choice(vals, size=num_rows), - mask=np.random.choice( + else rng.choice(vals, size=num_rows), + mask=rng.choice( [True, False], size=num_rows, p=[ @@ -189,7 +193,7 @@ def _generate_column(column_params, num_rows): # Generate data for current column return pa.array( column_params.generator, - mask=np.random.choice( + mask=rng.choice( [True, False], size=num_rows, p=[ @@ -233,7 +237,9 @@ def generate( def get_dataframe(parameters, use_threads): # Initialize seeds if parameters.seed is not None: - np.random.seed(parameters.seed) + rng = np.random.default_rng(seed=parameters.seed) # noqa: F841 + else: + rng = np.random.default_rng(seed=0) # noqa: F841 # For each column, invoke the data generator for column_params in parameters.column_parameters: @@ -281,14 +287,16 @@ def get_dataframe(parameters, use_threads): if not use_threads: for i, column_params in enumerate(parameters.column_parameters): column_data[i] = _generate_column( - column_params, parameters.num_rows + column_params, + parameters.num_rows, + rng, ) else: pool = Pool(pa.cpu_count()) column_data = pool.starmap( _generate_column, [ - (column_params, parameters.num_rows) + (column_params, parameters.num_rows, rng) for i, column_params in enumerate(parameters.column_parameters) ], ) @@ -336,7 +344,7 @@ def rand_dataframe( """ # Apply seed random.seed(seed) - np.random.seed(seed) + rng = np.random.default_rng(seed=seed) column_params = [] for meta in dtypes_meta: @@ -348,7 +356,7 @@ def rand_dataframe( lists_max_length = meta["lists_max_length"] nesting_max_depth = meta["nesting_max_depth"] value_type = meta["value_type"] - nesting_depth = np.random.randint(1, nesting_max_depth) + nesting_depth = rng.integers(1, nesting_max_depth) dtype = cudf.core.dtypes.ListDtype(value_type) @@ -368,6 +376,7 @@ def rand_dataframe( size=cardinality, nesting_depth=nesting_depth, lists_max_length=lists_max_length, + rng=rng, ), is_sorted=False, dtype=dtype, @@ -377,10 +386,11 
@@ def rand_dataframe( nesting_max_depth = meta["nesting_max_depth"] max_types_at_each_level = meta["max_types_at_each_level"] max_null_frequency = meta["max_null_frequency"] - nesting_depth = np.random.randint(1, nesting_max_depth) + nesting_depth = rng.integers(1, nesting_max_depth) structDtype = create_nested_struct_type( max_types_at_each_level=max_types_at_each_level, nesting_level=nesting_depth, + rng=rng, ) column_params.append( @@ -392,6 +402,7 @@ def rand_dataframe( cardinality=cardinality, size=rows, max_null_frequency=max_null_frequency, + rng=rng, ), is_sorted=False, dtype=structDtype, @@ -401,14 +412,16 @@ def rand_dataframe( max_precision = meta.get( "max_precision", cudf.Decimal64Dtype.MAX_PRECISION ) - precision = np.random.randint(1, max_precision) - scale = np.random.randint(0, precision) + precision = rng.integers(1, max_precision) + scale = rng.integers(0, precision) dtype = cudf.Decimal64Dtype(precision=precision, scale=scale) column_params.append( ColumnParameters( cardinality=cardinality, null_frequency=null_frequency, - generator=decimal_generator(dtype=dtype, size=cardinality), + generator=decimal_generator( + dtype=dtype, size=cardinality, rng=rng + ), is_sorted=False, dtype=dtype, ) @@ -417,14 +430,16 @@ def rand_dataframe( max_precision = meta.get( "max_precision", cudf.Decimal32Dtype.MAX_PRECISION ) - precision = np.random.randint(1, max_precision) - scale = np.random.randint(0, precision) + precision = rng.integers(1, max_precision) + scale = rng.integers(0, precision) dtype = cudf.Decimal32Dtype(precision=precision, scale=scale) column_params.append( ColumnParameters( cardinality=cardinality, null_frequency=null_frequency, - generator=decimal_generator(dtype=dtype, size=cardinality), + generator=decimal_generator( + dtype=dtype, size=cardinality, rng=rng + ), is_sorted=False, dtype=dtype, ) @@ -433,14 +448,16 @@ def rand_dataframe( max_precision = meta.get( "max_precision", cudf.Decimal128Dtype.MAX_PRECISION ) - precision = np.random.randint(1, max_precision) - scale = np.random.randint(0, precision) + precision = rng.integers(1, max_precision) + scale = rng.integers(0, precision) dtype = cudf.Decimal128Dtype(precision=precision, scale=scale) column_params.append( ColumnParameters( cardinality=cardinality, null_frequency=null_frequency, - generator=decimal_generator(dtype=dtype, size=cardinality), + generator=decimal_generator( + dtype=dtype, size=cardinality, rng=rng + ), is_sorted=False, dtype=dtype, ) @@ -469,6 +486,7 @@ def rand_dataframe( size=cardinality, min_bound=meta.get("min_bound", None), max_bound=meta.get("max_bound", None), + rng=rng, ), is_sorted=False, dtype=dtype, @@ -484,6 +502,7 @@ def rand_dataframe( size=cardinality, min_bound=meta.get("min_bound", None), max_bound=meta.get("max_bound", None), + rng=rng, ), is_sorted=False, dtype=dtype, @@ -497,7 +516,8 @@ def rand_dataframe( generator=lambda cardinality=cardinality: [ _generate_string( string.printable, - np.random.randint( + rng, + rng.integers( low=0, high=meta.get("max_string_length", 1000), size=1, @@ -519,6 +539,7 @@ def rand_dataframe( size=cardinality, min_bound=meta.get("min_bound", None), max_bound=meta.get("max_bound", None), + rng=rng, ), is_sorted=False, dtype=cudf.dtype(dtype), @@ -534,6 +555,7 @@ def rand_dataframe( size=cardinality, min_bound=meta.get("min_bound", None), max_bound=meta.get("max_bound", None), + rng=rng, ), is_sorted=False, dtype=cudf.dtype(dtype), @@ -544,7 +566,7 @@ def rand_dataframe( ColumnParameters( cardinality=cardinality, 
null_frequency=null_frequency, - generator=boolean_generator(cardinality), + generator=boolean_generator(cardinality, rng), is_sorted=False, dtype=cudf.dtype(dtype), ) @@ -567,7 +589,7 @@ def rand_dataframe( return df -def int_generator(dtype, size, min_bound=None, max_bound=None): +def int_generator(dtype, size, rng, min_bound=None, max_bound=None): """ Generator for int data """ @@ -577,7 +599,7 @@ def int_generator(dtype, size, min_bound=None, max_bound=None): iinfo = np.iinfo(dtype) low, high = iinfo.min, iinfo.max - return lambda: np.random.randint( + return lambda: rng.integers( low=low, high=high, size=size, @@ -585,13 +607,13 @@ def int_generator(dtype, size, min_bound=None, max_bound=None): ) -def float_generator(dtype, size, min_bound=None, max_bound=None): +def float_generator(dtype, size, rng, min_bound=None, max_bound=None): """ Generator for float data """ if min_bound is not None and max_bound is not None: low, high = min_bound, max_bound - return lambda: np.random.uniform( + return lambda: rng.uniform( low=low, high=high, size=size, @@ -599,7 +621,7 @@ def float_generator(dtype, size, min_bound=None, max_bound=None): else: finfo = np.finfo(dtype) return ( - lambda: np.random.uniform( + lambda: rng.uniform( low=finfo.min / 2, high=finfo.max / 2, size=size, @@ -608,7 +630,7 @@ def float_generator(dtype, size, min_bound=None, max_bound=None): ) -def datetime_generator(dtype, size, min_bound=None, max_bound=None): +def datetime_generator(dtype, size, rng, min_bound=None, max_bound=None): """ Generator for datetime data """ @@ -618,14 +640,14 @@ def datetime_generator(dtype, size, min_bound=None, max_bound=None): iinfo = np.iinfo("int64") low, high = iinfo.min + 1, iinfo.max - return lambda: np.random.randint( + return lambda: rng.integers( low=np.datetime64(low, "ns").astype(dtype).astype("int"), high=np.datetime64(high, "ns").astype(dtype).astype("int"), size=size, ) -def timedelta_generator(dtype, size, min_bound=None, max_bound=None): +def timedelta_generator(dtype, size, rng, min_bound=None, max_bound=None): """ Generator for timedelta data """ @@ -635,25 +657,25 @@ def timedelta_generator(dtype, size, min_bound=None, max_bound=None): iinfo = np.iinfo("int64") low, high = iinfo.min + 1, iinfo.max - return lambda: np.random.randint( + return lambda: rng.integers( low=np.timedelta64(low, "ns").astype(dtype).astype("int"), high=np.timedelta64(high, "ns").astype(dtype).astype("int"), size=size, ) -def boolean_generator(size): +def boolean_generator(size, rng): """ Generator for bool data """ - return lambda: np.random.choice(a=[False, True], size=size) + return lambda: rng.choice(a=[False, True], size=size) -def decimal_generator(dtype, size): +def decimal_generator(dtype, size, rng): max_integral = 10 ** (dtype.precision - dtype.scale) - 1 max_float = (10**dtype.scale - 1) if dtype.scale != 0 else 0 return lambda: ( - np.random.uniform( + rng.uniform( low=-max_integral, high=max_integral + (max_float / 10**dtype.scale), size=size, @@ -661,32 +683,33 @@ def decimal_generator(dtype, size): ) -def get_values_for_nested_data(dtype, lists_max_length=None, size=None): +def get_values_for_nested_data(dtype, rng, lists_max_length=None, size=None): """ Returns list of values based on dtype. 
""" if size is None: - cardinality = np.random.randint(0, lists_max_length) + cardinality = rng.integers(0, lists_max_length) else: cardinality = size dtype = cudf.dtype(dtype) if dtype.kind in ("i", "u"): - values = int_generator(dtype=dtype, size=cardinality)() + values = int_generator(dtype=dtype, size=cardinality, rng=rng)() elif dtype.kind == "f": - values = float_generator(dtype=dtype, size=cardinality)() + values = float_generator(dtype=dtype, size=cardinality, rng=rng)() elif dtype.kind in ("U", "O"): values = [ _generate_string( string.printable, + rng, 100, ) for _ in range(cardinality) ] elif dtype.kind == "M": - values = datetime_generator(dtype=dtype, size=cardinality)().astype( - dtype - ) + values = datetime_generator( + dtype=dtype, size=cardinality, rng=rng + )().astype(dtype) elif dtype.kind == "m": values = timedelta_generator(dtype=dtype, size=cardinality)().astype( dtype @@ -699,14 +722,14 @@ def get_values_for_nested_data(dtype, lists_max_length=None, size=None): return values -def make_lists(dtype, lists_max_length, nesting_depth, top_level_list): +def make_lists(dtype, lists_max_length, nesting_depth, top_level_list, rng): """ Helper to create random list of lists with `nesting_depth` and specified value type `dtype`. """ nesting_depth -= 1 if nesting_depth >= 0: - L = np.random.randint(1, lists_max_length) + L = rng.integers(1, lists_max_length) for i in range(L): top_level_list.append( make_lists( @@ -714,11 +737,14 @@ def make_lists(dtype, lists_max_length, nesting_depth, top_level_list): lists_max_length=lists_max_length, nesting_depth=nesting_depth, top_level_list=[], + rng=rng, ) ) else: top_level_list = get_values_for_nested_data( - dtype=dtype, lists_max_length=lists_max_length + dtype=dtype, + lists_max_length=lists_max_length, + rng=rng, ) # To ensure numpy arrays are not passed as input to # list constructor, returning a python list object here. @@ -728,22 +754,22 @@ def make_lists(dtype, lists_max_length, nesting_depth, top_level_list): return top_level_list -def make_array_for_struct(dtype, cardinality, size, max_null_frequency): +def make_array_for_struct(dtype, cardinality, size, max_null_frequency, rng): """ Helper to create a pa.array with `size` and `dtype` for a `StructArray`. """ - null_frequency = np.random.uniform(low=0, high=max_null_frequency) - local_cardinality = max(np.random.randint(low=0, high=cardinality), 1) + null_frequency = rng.uniform(low=0, high=max_null_frequency) + local_cardinality = max(rng.integers(low=0, high=cardinality), 1) data = get_values_for_nested_data( - dtype=dtype.type.to_pandas_dtype(), size=local_cardinality + dtype=dtype.type.to_pandas_dtype(), size=local_cardinality, rng=rng ) - vals = np.random.choice(data, size=size) + vals = rng.choice(data, size=size) return pa.array( vals, - mask=np.random.choice( + mask=rng.choice( [True, False], size=size, p=[null_frequency, 1 - null_frequency], @@ -756,7 +782,7 @@ def make_array_for_struct(dtype, cardinality, size, max_null_frequency): ) -def get_nested_lists(dtype, size, nesting_depth, lists_max_length): +def get_nested_lists(dtype, size, nesting_depth, lists_max_length, rng): """ Returns a list of nested lists with random nesting depth and random nested lists length. 
@@ -770,13 +796,14 @@ def get_nested_lists(dtype, size, nesting_depth, lists_max_length): lists_max_length=lists_max_length, nesting_depth=nesting_depth, top_level_list=[], + rng=rng, ) ) return list_of_lists -def get_nested_structs(dtype, cardinality, size, max_null_frequency): +def get_nested_structs(dtype, cardinality, size, max_null_frequency, rng): """ Returns a list of arrays with random data corresponding to the dtype provided. @@ -787,7 +814,7 @@ def get_nested_structs(dtype, cardinality, size, max_null_frequency): for name, col_dtype in dtype.fields.items(): if isinstance(col_dtype, cudf.StructDtype): result_arrays = get_nested_structs( - col_dtype, cardinality, size, max_null_frequency + col_dtype, cardinality, size, max_null_frequency, rng ) result_arrays = pa.StructArray.from_arrays( result_arrays, names=col_dtype.fields.keys() @@ -798,13 +825,14 @@ def get_nested_structs(dtype, cardinality, size, max_null_frequency): cardinality=cardinality, size=size, max_null_frequency=max_null_frequency, + rng=rng, ) list_of_arrays.append(result_arrays) return list_of_arrays -def list_generator(dtype, size, nesting_depth, lists_max_length): +def list_generator(dtype, size, nesting_depth, lists_max_length, rng): """ Generator for list data """ @@ -813,10 +841,11 @@ def list_generator(dtype, size, nesting_depth, lists_max_length): size=size, nesting_depth=nesting_depth, lists_max_length=lists_max_length, + rng=rng, ) -def struct_generator(dtype, cardinality, size, max_null_frequency): +def struct_generator(dtype, cardinality, size, max_null_frequency, rng): """ Generator for struct data """ @@ -825,25 +854,26 @@ def struct_generator(dtype, cardinality, size, max_null_frequency): cardinality=cardinality, size=size, max_null_frequency=max_null_frequency, + rng=rng, ) -def create_nested_struct_type(max_types_at_each_level, nesting_level): +def create_nested_struct_type(max_types_at_each_level, nesting_level, rng): dtypes_list = cudf.utils.dtypes.ALL_TYPES - picked_types = np.random.choice(list(dtypes_list), max_types_at_each_level) + picked_types = rng.choice(list(dtypes_list), max_types_at_each_level) type_dict = {} for name, type_ in enumerate(picked_types): if type_ == "struct": type_dict[str(name)] = create_nested_struct_type( - max_types_at_each_level, nesting_level - 1 + max_types_at_each_level, nesting_level - 1, rng ) else: type_dict[str(name)] = cudf.dtype(type_) return cudf.StructDtype(type_dict) -def _generate_string(str_seq: str, length: int = 10) -> str: - return "".join(random.choices(str_seq, k=length)) +def _generate_string(str_seq: str, rng, length: int = 10) -> str: + return "".join(rng.choice(list(str_seq), size=length)) def _unique_string() -> str: diff --git a/python/cudf/cudf/testing/testing.py b/python/cudf/cudf/testing/testing.py index 668e7a77454..8d342f8e6c6 100644 --- a/python/cudf/cudf/testing/testing.py +++ b/python/cudf/cudf/testing/testing.py @@ -10,8 +10,8 @@ from pandas import testing as tm import cudf -from cudf._lib.unary import is_nan from cudf.api.types import is_numeric_dtype, is_string_dtype +from cudf.core._internals.unary import is_nan from cudf.core.missing import NA, NaT diff --git a/python/cudf/cudf/tests/data/parquet/replace_multiple_rows.parquet b/python/cudf/cudf/tests/data/parquet/replace_multiple_rows.parquet new file mode 100644 index 00000000000..fc3ea55b021 Binary files /dev/null and b/python/cudf/cudf/tests/data/parquet/replace_multiple_rows.parquet differ diff --git a/python/cudf/cudf/tests/data/pkl/stringColumnWithRangeIndex_cudf_23.12.pkl 
b/python/cudf/cudf/tests/data/pkl/stringColumnWithRangeIndex_cudf_23.12.pkl index 1ec077d10f7..64e06f0631d 100644 Binary files a/python/cudf/cudf/tests/data/pkl/stringColumnWithRangeIndex_cudf_23.12.pkl and b/python/cudf/cudf/tests/data/pkl/stringColumnWithRangeIndex_cudf_23.12.pkl differ diff --git a/python/cudf/cudf/tests/data/subword_tokenizer_data/bert_base_cased_sampled/vocab-hash.txt b/python/cudf/cudf/tests/data/subword_tokenizer_data/bert_base_cased_sampled/vocab-hash.txt index 84b13c9d946..566ac2c337d 100644 --- a/python/cudf/cudf/tests/data/subword_tokenizer_data/bert_base_cased_sampled/vocab-hash.txt +++ b/python/cudf/cudf/tests/data/subword_tokenizer_data/bert_base_cased_sampled/vocab-hash.txt @@ -1,4382 +1,4382 @@ [all 4,382 "<hash> <offset>" rows of this regenerated hash-table test fixture change; the raw numeric rows are omitted here as they carry no reviewable content]
-6151418962808616963 2086 -16786361788616914434 2089 -9522044737514147334 2091 -15360350686533802498 2097 -4398995179394704386 2099 -4163122903470647302 2101 -18110267126768664070 2107 -17811600627481865731 2113 -11988559903619469315 2116 -5893679902922151940 2119 -3302430115655037445 2123 -2756050317441962502 2128 -7373324598575981572 2134 -15626353672087051269 2138 -9026268416534243843 2143 -5857105831257628164 2146 -11246462751297413124 2150 -7459631049065515526 2154 -2175352842263141379 2160 -9748465532031254533 2163 -12060676108130005507 2168 -8160425232164846593 2171 -1665947540125783558 2172 -10758171140537368580 2178 -5744770555727548418 2182 -15867521551313803780 2184 -11178209498970826244 2188 -2663862265833334277 2192 -646145646253570050 2197 -6886825228888300036 2199 -5219187155516171272 2203 -16142200027647465989 2211 -8727938199665870852 2216 -1200328579526163971 2220 -12449385538114001417 2223 -14632283715533800450 2232 -5295800027246062086 2234 -8827019094633400323 2240 -14543826221768176641 2243 -12388128316821831686 2244 -3087048392675298821 2250 -17669786912563615747 2255 -3879520399747123716 2258 -15648071975541157893 2262 -5580473107362200071 2267 -6895786389712974853 2274 -17709709086906012676 2279 -9627483233657542665 2283 -9602326803985618949 2292 -6748599026443758086 2297 -11488364339401397254 2303 -6716511183525677573 2309 -16003763240189186563 2314 -6003803301075291138 2317 -15800367754014516746 2319 -2817341800198731782 2329 -2110085916033252869 2335 -10353852055773781511 2340 -8745468498457416193 2347 -15197463976907486213 2348 -11844773108515011075 2353 -10745169896165544965 2356 -9502565595236673539 2361 -18340734722524717062 2364 -0 2370 -4877506240735029250 2370 -6632868101528461318 2372 -1094192348264738308 2378 -15930308455756352518 2382 -7517061312773919237 2388 -11537382714050522116 2393 -15343851421525887493 2397 -15685583084244037124 2402 -11443729733346354693 2406 -18096845502703148037 2411 -13060060807344890377 2416 -8226818503915081731 2425 -5171144332412330499 2428 -5367144440061049859 2431 -4687503341676126209 2434 -8115677569098133507 2435 -8753274682505368066 2438 -6767268893840927749 2440 -10747160183142327300 2445 -5318831768157948930 2449 -16744837601970291208 2451 -3968740997769839108 2459 -1041860322726726147 2463 -13185494599343868419 2466 -3781663100474830852 2469 -8664347289501861378 2473 -7145447006642560001 2475 -977858689003972101 2476 -188865761021926916 2481 -14781205616979726850 2485 -7514076159997088261 2487 -15227633270557658627 2492 -7486357174119883778 2495 -7899052859637422087 2497 -4312982947448530435 2504 -2484418012864310785 2507 -8450324929602980870 2508 -11374778755239228418 2514 -10780034123560756745 2516 -10313953391808102916 2525 -13836623279669341188 2529 -16297706918062760459 2533 -6404560275247226885 2544 -8323769790774729734 2549 -10061687257419431941 2555 -6724033317759518212 2560 -12265972209834273288 2564 -4748706107567735299 2572 -17588235414846031363 2575 -16029681841978911746 2578 -333014962274056196 2580 -2819861156000228870 2584 -17301319418358929926 2590 -14323022738651812355 2596 -17758251407482208260 2599 -9992216596142364674 2603 -5541911712511293955 2605 -1880849355295036931 2608 -15421034026101803523 2611 -2288503501826235907 2614 -2336333131728265731 2617 -15127408664422292997 2620 -6756061181968708102 2625 -2316367058427453443 2631 -13786932856453332482 2634 -17564157627292750852 2636 -5809790665868502019 2640 -9389430036410766853 2643 -15157257604368261123 2648 -523412383725034497 2651 
-5270886391729814021 2652 -8987256414287503365 2657 -2751897370690544643 2662 -47819066577966599 2665 -9543124453318907909 2672 -15186331456703232514 2677 -9731347057535958023 2679 -6234700495105510914 2686 -17720066604242729989 2688 -611878128332703234 2693 -6029104170087404549 2695 -14612606995632327172 2700 -7357792311987945475 2704 -6074856230289873410 2707 -13368808999886628358 2709 -5918378978107988995 2715 -15624776793824203778 2718 -4241055509726121476 2720 -12687432015779367427 2724 -4003272975122620932 2727 -17483676776191982087 2731 -2701605488646040584 2738 -7387630099939362308 2746 -16331822462747681798 2750 -2197183442359868933 2756 -17624623361194542087 2761 -1749450990014992388 2768 -2888206094896619010 2772 -12985412669390948353 2774 -9843120678422464515 2775 -15590458610270713859 2778 -5950622975418741251 2781 -17607672802725530117 2784 -1225097419526011394 2789 -3758572251524375044 2791 -5891371767718009858 2795 -6843754938996156419 2797 -13418347525088883204 2800 -2887280155684756490 2804 -7867196614872225796 2814 -10992396837241625094 2818 -15526482250456426497 2824 -7582254907030848515 2825 -14309589056601523716 2828 -2843794758628944386 2832 -10106627892829635078 2834 -11117505412117820418 2840 -17559521087909430786 2842 -18410508844162253834 2844 -7796754440171003912 2854 -1826091018065355268 2862 -5568124937607335426 2866 -9164033835486570503 2868 -7917102923116225537 2875 -10708221634884163076 2876 -966446973350329348 2880 -1882776320247897092 2884 -18137433528115911172 2888 -7577505208556149252 2892 -3902521102041700356 2896 -11942362790107158020 2900 -2328713611561709573 2904 -8376513561567004165 2909 -18415012889800110091 2914 -7983446382889179652 2925 -2304166271864391689 2929 -708759182721729026 2938 -10774631175750681603 2940 -2608247964063907842 2943 -7317603117343176707 2945 -12615180422705001477 2948 -17995452459822326275 2953 -12439250137675515394 2956 -9947610136498965509 2958 -10340600516380348420 2963 -10073894039732477444 2967 -15954561361998232578 2971 -6039226287079734788 2973 -12684813664097613833 2977 -8337524429261820932 2986 -0 2990 -5738139389410570757 2990 -0 2995 -163262518049440773 2995 -11390362112332120070 3000 -7666496378417453571 3006 -17188351170280199170 3009 -14157925477049500677 3011 -16535316221715341826 3016 -701193705161007105 3018 -15417977144980853763 3019 -9623949443365348357 3022 -16537640731048440324 3027 -9880057250380779521 3031 -10507448958568448514 3032 -9901540867816521219 3034 -10882434502571251716 3037 -15939490563935542790 3041 -3818155241101528578 3047 -10810785028031231493 3049 -17268925026504538113 3054 -6000103580025957894 3055 -14492044616225970179 3061 -8964295197943843335 3064 -13244227239481936387 3071 -2072267724499101186 3074 -735562179013069826 3076 -3271477415853879302 3078 -1150251700717751812 3084 -11835839830005115393 3088 -17028480913889055238 3089 -16864969398419772420 3095 -9646252156141336066 3099 -5589333819644110342 3101 -14729039479109188098 3107 -2256025994407046148 3109 -5630416426912279555 3113 -23611161351524356 3116 -16061932977440933889 3120 -7560058124185071106 3121 -8943767870065516551 3123 -17388385529962317834 3130 -11686727589179028995 3140 -2993671307613155843 3143 -7451626547139373061 3146 -12726375988952098305 3151 -0 3152 -1735273330892205060 3152 -2746028049042776065 3156 -17093562035495421445 3157 -7598703106262353411 3162 -17526920923827930631 3165 -0 3172 -18087597149122765317 3172 -11336730259137625602 3177 -9704022087244797957 3179 -14531181144788964866 3184 
-5103530438547424773 3186 -7049971328222257156 3191 -2593832991454060548 3195 -2549992206172832771 3199 -2656864556911864322 3202 -3094347590740453380 3204 -0 3208 -10556974365044028932 3208 -12597146506913681926 3212 -18243354473097630721 3218 -4168646291002030084 3219 -8893226051755120644 3223 -7904367695210051587 3227 -17247367703075879942 3230 -1338287165638264836 3236 -6734394253777139715 3240 -14645087877274778627 3243 -1841749727013933062 3246 -0 3252 -9793622484838288388 3252 -15384076833580083718 3256 -14678310837729104389 3262 -8947895455599830021 3267 -12421729442783160325 3272 -14382812703434878978 3277 -3484468606955360259 3279 -2411175954345499653 3282 -18322361710054416389 3287 -8989744845956541448 3292 -9637438279185886726 3300 -8282725403817063939 3306 -10727259769060221446 3309 -280860399088910340 3315 -3074647116268871172 3319 -9311932047626983431 3323 -2990333995786696707 3330 -11415454184475025922 3333 -8194042667332418565 3335 -11269986522125913093 3340 -10773634478079810565 3345 -0 3350 -4302235270674672643 3350 -4579270605621971460 3353 -3687011949425630213 3357 -9678333478858482691 3362 -14661606109051090440 3365 -9504123850532876291 3373 -14299233528797568008 3376 -10370491504729965060 3384 -286239823911254530 3388 -7969121812144744451 3390 -16606218867148559880 3393 -11756345184017143302 3401 -8204961944753809412 3407 -12456910480062157316 3411 -7569786299014196739 3415 -3372309516929818119 3418 -16631131943564946948 3425 -4436969913528429575 3429 -14467771002258720772 3436 -15278270405312088583 3440 -6638334178561090565 3447 -8154814430089498114 3452 -17289464348431017987 3454 -13185969354886446085 3457 -4725380864147687429 3462 -14933071000620043778 3467 -12471883028204926466 3469 -13286302152236950530 3471 -12020003522260348419 3473 -11784545509165047810 3476 -10311182359550097412 3478 -2262872037167824902 3482 -15672162207595698690 3488 -8479660175647360516 3490 -543122224331105283 3494 -8738610060644560897 3497 -15969479020845567490 3498 +0 0 +1196190418526572547 0 +3117251964976502276 3 +0 7 +3266452963994632202 7 +6701451810090115586 17 +10156473964989528067 19 +6270220596053033473 22 +8689732391113957377 23 +345423933508452359 24 +9048486634542125058 31 +13000119181766437380 33 +1008808785591799299 37 +12586249368236978177 40 +11161089178393358857 41 +0 50 +6900865085865625094 50 +2615908179610132483 56 +1617129254806601731 59 +1607892326666533378 62 +123501381755693059 64 +17180234710792039941 67 +17345742025318016002 72 +7933590365928361474 74 +16187522989672200717 76 +14893593683284454915 89 +6001767212789422083 92 +1805417936920808451 95 +8589625060174958594 98 +13148488988905702416 100 +6759231203841442819 116 +798806762886474754 119 +13949836854106156034 121 +4277844318153606661 123 +18162360468357982216 128 +17429735113921325570 136 +10428297564837543938 138 +10174389176493224450 140 +4782734429389924866 142 +16828613770926935558 144 +16924367891356487169 150 +15473269356473895940 151 +10277883249583756290 155 +7398921953351034881 157 +15672774546004063755 158 +7032338026028942337 169 +12638648541163088900 170 +11956890857542837252 174 +10813991647348979717 178 +698603259209416204 183 +104155371596876289 195 +8849883347580968451 196 +13523964487472320004 199 +12948374094552270339 203 +16624700721113753096 206 +0 214 +630014773304871940 214 +14669827911540386306 218 +16593543947487157254 220 +16189120489289924617 226 +5936869209199720450 235 +6504800368776816645 237 +17628010111075734529 242 +16073662248530872322 243 +15997624981342335497 245 
+13519486007586370049 254 +469623719382726661 255 +10478598590185625089 260 +5239294057556035586 261 +17274642882001730567 263 +7924882265216651266 270 +13138720901108912133 272 +13741737182438464004 277 +14608811194009491970 281 +2489742908982890509 283 +14952279757728973318 296 +13432486964055121926 302 +15397241996877524995 308 +7400937882698838020 311 +13309132794101168654 315 +8519404085542453250 329 +2551722931538879493 331 +4492819152473235971 336 +9634175483270757380 339 +5023439465649179147 343 +2912624940235659267 354 +15615524075652075524 357 +15131856319265032196 361 +7560465986110364673 365 +16393161300057821706 366 +6737538541011470849 376 +6394493716971627523 377 +0 380 +6957953643235488257 380 +7533365794097524234 381 +11551517784611555841 391 +0 392 +14017003685401013761 392 +13868858036311946245 393 +609890416048967688 398 +15853752823436186626 406 +13008887538399190534 408 +275598997711474690 414 +612244017304434692 416 +265561555991638021 420 +0 425 +4771730300985403909 425 +14595656195986303489 430 +13010615142623560194 431 +3520044222049365512 433 +4843556531627173889 441 +9544321596489038851 442 +18097338319835691009 445 +17588488217883868161 446 +4553739803879796748 447 +12247953831639953411 459 +1685939678565356546 462 +2454121115370725890 464 +7699707784321416706 466 +2322428462912444939 468 +4251948422489921028 479 +8009626371771665409 483 +15830912148611917313 484 +15530208627603713027 485 +14550069280077337095 488 +3074860258671426050 495 +9819565310679728648 497 +0 505 +239920763215632386 505 +4479084686100589069 507 +7541436040510714881 520 +0 521 +18361828565940659201 521 +13943609537766478850 522 +1644071836581560844 524 +3325147442114083333 536 +9121949682662027269 541 +5375060563545179653 546 +11461944020052039682 551 +10205876604940857353 553 +17856338086929782276 562 +3964733248608209412 566 +15252617693956101123 570 +5198588053258159617 573 +7294352613378259976 574 +14274593384918848004 582 +12443356879762990084 586 +15967601366558600195 590 +0 593 +1596502676746638348 593 +3447763432008799745 605 +2154246728958848517 606 +1249748142575979010 611 +12802117032328183298 613 +14720455521613154825 615 +14431397366571454983 624 +8968154969419252739 631 +61922506310515202 634 +17332184019644205571 636 +1580044533016865796 639 +0 643 +16037339623732295172 643 +0 647 +6451385579602643969 647 +2249232807147062791 648 +15969372656029624833 655 +9184080755936318981 656 +10444965622910510594 661 +976846907109217284 663 +15036566770534162954 667 +2852219209756952581 677 +14428186506827194885 682 +0 687 +9583345567128655877 687 +8154021185610424842 692 +7639653587249864197 702 +284400846134645765 707 +5822594207495943172 712 +4666916656146452484 716 +10837424823999667726 720 +7662230599689246212 734 +16769958284715374596 738 +14214321919518354947 742 +7700892892210644993 745 +5647486165416790024 746 +12807160877623480835 754 +17202327424132939777 757 +5849043248643779075 758 +18232796011600235523 761 +4957062118189902859 764 +6105730765254667266 775 +8753292226633308675 777 +14066686889142136835 780 +1047708050925830148 783 +5555751253338228747 787 +8205438979066793987 798 +10100035083082646017 801 +3037731532850264067 802 +16470238215781450756 805 +15841867742103541257 809 +8087512074161331714 818 +15493250668750321668 820 +3797087601271950854 824 +2623502875154101252 830 +15159098560356506121 834 +343051006899596292 843 +16668194639613285891 847 +0 850 +9601059867653113858 850 +1570493927206813191 852 +9118300038493915138 859 +9563382677447647747 861 
+5285530497249013763 864 +14598000812816350721 867 +15243372398425255435 868 +9815541045508240385 879 +408899826773384197 880 +7463961818871554565 885 +12980371725716597249 890 +15376403281856848903 891 +0 898 +5841652391326774789 898 +6476912065420260354 903 +3963854010828661252 905 +5784218172655345161 909 +15327721657175197701 918 +13180549833166182403 923 +15904501101973266436 926 +0 930 +14206180323061139974 930 +1106786522797875713 936 +17058832169116321282 937 +721828206256696835 939 +0 942 +8561789411832569355 942 +13374043249168898050 953 +15922789491870388229 955 +0 960 +16131878595889026564 960 +5509499768642979336 964 +12415614990376579585 972 +11304605070154481157 973 +7663245502729528834 978 +2692663086158549507 980 +14133757573751133701 983 +6813598296480126979 988 +13616528755765764611 991 +16303994430841145861 994 +12880492472155407874 999 +14023778603465187338 1001 +1658551813664662018 1011 +8148008758896362498 1013 +10688946549204321795 1015 +13274653424094307841 1018 +10847911221158770190 1019 +0 1033 +4643539771717744131 1033 +4169507947260962821 1036 +3126526255358650372 1041 +13449815687571241992 1045 +9421207081901200898 1053 +6898163624184020997 1055 +7290174431607841794 1060 +2741902156609523715 1062 +15499057183587255302 1065 +16461426401301993476 1071 +11278211202787295747 1075 +0 1078 +9413985875830324739 1078 +4646548733144616463 1081 +7078801759685020673 1096 +5376123263925219331 1097 +14227335667134915589 1100 +0 1105 +7295351152600562699 1105 +0 1116 +1397641409882635269 1116 +2364632016557825025 1121 +7290779788839345158 1122 +223977268476071945 1128 +13026660262516529667 1137 +17998435953459809796 1140 +8522469059272339460 1144 +16293947433309880833 1148 +4576500186674335749 1149 +0 1154 +4042247147937702403 1154 +3034443556411821057 1157 +13667368622259281923 1158 +15202537810082257934 1161 +15337640185400698372 1175 +8308041085868251649 1179 +8832030889396702722 1180 +10436989792260434949 1182 +14898581533124037641 1187 +9317528159836099585 1196 +1612938252083390982 1197 +6278485319310800898 1203 +10612805446261845508 1205 +13787162434835940874 1209 +12133705386992745478 1219 +5227473436681376774 1225 +5656787771058157057 1231 +4433258109319585794 1232 +6704526927800668169 1234 +17440456789764264962 1243 +6979104089888754689 1245 +10768049747876580866 1246 +15707303682313568257 1248 +15148244407999994380 1249 +2841265161354426373 1261 +5252307512862989316 1266 +13331565891980378113 1270 +18159416118263116290 1271 +501516395825858060 1273 +3867012501081805829 1285 +8267472486312505860 1290 +12872828689431491073 1294 +727773195231890946 1295 +7322382021491738631 1297 +5402024496579473921 1304 +6959655625064837122 1305 +10187142685062514177 1307 +3029360479097259523 1308 +3524388403479357447 1311 +5803404108302127107 1318 +3322880653425492483 1321 +14014789072627667972 1324 +0 1328 +17779075582177396743 1328 +11597164340541700097 1335 +18164718194518923266 1336 +0 1338 +3688441162538457604 1338 +12763684824056344584 1342 +6555198237040291843 1350 +8999497138912988675 1353 +9277828726380557826 1356 +1652226711750231042 1358 +6386464493042135559 1360 +11832103051565904386 1367 +7889400420599073793 1369 +5173699340624307713 1370 +9839391635984425985 1371 +9179189546563518985 1372 +8987610858276033026 1381 +14211262843725043205 1383 +9924217736728436740 1388 +4401850895204555779 1392 +5541709837691148811 1395 +10214740045672277507 1406 +14656675767246138369 1409 +5518164076312088578 1410 +8819194535554354691 1412 +1202694809888231436 1415 +9937648736864647683 
1427 +4776509399304216066 1430 +3828150896429232641 1432 +9726415758235178498 1433 +15478358790166008844 1435 +0 1447 +447632828248568324 1447 +10254625284015096321 1451 +9602208154038649858 1452 +7918490636759656966 1454 +4464032935723660291 1460 +517803065456797188 1463 +11296051306811729413 1467 +9559870439106258948 1472 +18140734313948729864 1476 +5761393475703308289 1484 +5817187969532432391 1485 +7214411138154648580 1492 +8556555308704695297 1496 +5517275039512219661 1497 +155198283803470849 1510 +12028807386786979841 1511 +9402878779861331461 1512 +7529466829850301953 1517 +3700043109242268166 1518 +7889220073888590849 1524 +9698905706548099588 1525 +950350740255051780 1529 +16659267722661032455 1533 +11934825441675277832 1540 +1840952787151591937 1548 +3181706929772123141 1549 +13084360636440561667 1554 +7392348362663288323 1557 +11299566685738323463 1560 +11865504406956790788 1567 +470806909387516931 1571 +11392390055026286594 1574 +0 1576 +15250035972710306824 1576 +1841748561073501700 1584 +13959366503388518404 1588 +16383575845586120707 1592 +5993903773214649347 1595 +12927537188954086928 1598 +6310676060569643522 1614 +6823572598110530053 1616 +0 1621 +10355215107753852930 1621 +12991560131813107723 1623 +6463225875312731650 1634 +444925180768886788 1636 +8287375501749122564 1640 +8102699978355624961 1644 +3217121844483982342 1645 +0 1651 +15310893597687290371 1651 +4651888484278436356 1654 +16622466823413339137 1658 +14426029300798547465 1659 +16208338759425902084 1668 +13384891560853317123 1672 +10542264124115582467 1675 +0 1678 +13404868863569442317 1678 +8380728838811013123 1691 +2656782871938641923 1694 +5621105522992570375 1697 +16165957063051496962 1704 +17183335989224497157 1706 +0 1711 +12377944724210268163 1711 +15698714840429098497 1714 +2063306500131813891 1715 +7135499884796623879 1718 +14916197160702468612 1725 +14565364212611500547 1729 +17109666354199615491 1732 +18420265465448709122 1735 +5039636110599831051 1737 +13648715526743665665 1748 +8648155745742680580 1749 +0 1753 +4128476852805537282 1753 +12229435493123252233 1755 +18671114624524289 1764 +0 1765 +4330985506003776003 1765 +4960636854468069379 1768 +2825174586054641673 1771 +8083214972260871169 1780 +1656668836635006471 1781 +15658718806708214274 1788 +1364137667359422465 1790 +5440910769879224326 1791 +1242060995600047617 1797 +6028285323527704577 1798 +9862524515548398083 1799 +14095132043223516673 1802 +5330121798209797643 1803 +3047808178481674242 1814 +7009881287782938629 1816 +3836453927748870146 1821 +4828562734878493698 1823 +6251707885160171534 1825 +13503013357676597250 1839 +13120060435028427777 1841 +17453157023102628866 1842 +6659266074333195266 1844 +12122449770852231175 1846 +76872493233309186 1853 +10510620038219076100 1855 +3104474465142299652 1859 +15145875800387371010 1863 +14514645157364972555 1865 +5990940750853294082 1876 +9568631318395414530 1878 +13307393937882497539 1880 +0 1883 +13432428898749511691 1883 +2851874300532727813 1894 +16127254686981486084 1899 +11152828733555106817 1903 +8099684063905722369 1904 +10726727557015251463 1905 +0 1912 +16773004137299201537 1912 +0 1913 +1737396243104320517 1913 +12312810570815952904 1918 +8420117868402509825 1926 +4468099455608655362 1927 +17181412210024682497 1929 +7344171998747088899 1930 +11200240032637073926 1933 +9773885730549905922 1939 +2888420847349521921 1941 +0 1942 +3301971714535044611 1942 +6622000068430301708 1945 +14679279568503564291 1957 +15312513401406547971 1960 +11219696574507219971 1963 +15557068645919193090 1966 
+14518831268196627465 1968 +11306244334020066818 1977 +445302382600591361 1979 +4798518764725378563 1980 +12833053520101596161 1983 +6569110733351726088 1984 +1133142439547627010 1992 +6020738327851480577 1994 +0 1995 +0 1995 +15123217074875560455 1995 +5146261845254048769 2002 +15577303646915962882 2003 +5068854713026915334 2005 +5662217880612308482 2011 +13584286678752042508 2013 +17647669975855288324 2025 +7092182408195844613 2029 +5243600304614296065 2034 +16379641210199802883 2035 +6541142296931350023 2038 +17648968980389751301 2045 +3633167252938199556 2050 +691728008305302531 2054 +7434042972483105284 2057 +1243474674683616271 2061 +439217426838173186 2076 +10460352595647090183 2078 +5080394082232633345 2085 +7346464481151790597 2086 +8068677175549539843 2091 +4859996294860352513 2094 +12470823893961605122 2095 +10033529424736163842 2097 +10769920382809060357 2099 +16128670331104411146 2104 +2973668094989328385 2114 +16323032859702780931 2115 +12227727930958763521 2118 +7302528030871866371 2119 +8967586997946816013 2122 +13935701471042006020 2135 +15676859696752227844 2139 +0 2143 +2397906929972799494 2143 +731429270944234509 2149 +14629591375919925252 2162 +14201687141277194244 2166 +8813493889730974725 2170 +4967156306307785221 2175 +12152782138863493635 2180 +5716269545878689795 2183 +12118250850399448070 2186 +10079764034817249795 2192 +9905170822798166018 2195 +7330246949116896272 2197 +4975588281894977539 2213 +2377967791858227715 2216 +1711948357573607427 2219 +15733402191778006532 2222 +13617127880905861132 2226 +5413022680339381252 2238 +12001217113207191043 2242 +605362804928389124 2245 +10888521749365150723 2249 +11742554576381655052 2252 +3591551764774430724 2264 +8647496912976230402 2268 +3843626828621262342 2270 +3921763517492323331 2276 +7707493410895858692 2279 +3920334550068498946 2283 +2658528064200329217 2285 +9038122947820533253 2286 +6952499746958836740 2291 +7951530266135717388 2295 +16076637508890388481 2307 +15187897527562671106 2308 +5520701509759360003 2310 +2598679891400145409 2313 +17512255026679867408 2314 +10995766946592999425 2330 +18117038245928559618 2331 +5391766950501834244 2333 +14461374868186265605 2337 +1273598128050393611 2342 +11820949665032480260 2353 +17841646829021216260 2357 +10200569215461547521 2361 +3670141860910412289 2362 +18396940417538187269 2363 +14261984156631670787 2368 +106960762513502723 2371 +16393357936187300353 2374 +7032931990465729538 2375 +15907195827890083338 2377 +16437195285078765571 2387 +17301257309241798147 2390 +8236593629924756481 2393 +1379157623727557125 2394 +14767417508072398345 2399 +16695407490005887489 2408 +1414009372711604744 2409 +499004129948061185 2417 +5775255721778604547 2418 +16754393591199635469 2421 +10568987941526160386 2434 +3311623553148127749 2436 +10255724520964794369 2441 +3121950734017230849 2442 +2129428121322164230 2443 +5233872436075409922 2449 +5115946926893418500 2451 +298818270766586369 2455 +2534391384903305218 2456 +13962240998865999372 2458 +2858192092257344002 2470 +2246014736733727747 2472 +18208224108542041605 2475 +5900635063125726209 2480 +8459478259862856201 2481 +3106812066263162882 2490 +6016756381746226178 2492 +375597697640802819 2494 +2513762961093744131 2497 +15366269329105501700 2500 +10035949288505144322 2504 +427851159373997574 2506 +4274431321888115714 2512 +5253654952100000770 2514 +16894221500064376839 2516 +14687193167626954754 2523 +13771965837935513090 2525 +8874009193925074945 2527 +4974093839237721093 2528 +741620693598341642 2533 +11991618038806280705 2543 
+11116672093208526850 2544 +15807249887587362818 2546 +7323942637968351746 2548 +3660270925885407751 2550 +0 2557 +10684033640943126020 2557 +16989816981004759553 2561 +9001924880900419075 2562 +1998443310251235851 2565 +17567979874939109890 2576 +13652482471668535812 2578 +17509569230481751555 2582 +3182500785161561606 2585 +13325982159032983558 2591 +1923914978402147329 2597 +5589189981371284484 2598 +1161601912578541572 2602 +1916235467976744451 2606 +16280831412119656968 2609 +5531274414859838467 2617 +13599333592024061957 2620 +17989155199582565378 2625 +3030922814179764740 2627 +14644007464957335564 2631 +0 2643 +5497605392959732225 2643 +2032331457863458818 2644 +8100338548587463682 2646 +993329328502006794 2648 +6750921732510502913 2658 +13748899324120622595 2659 +15617703054210413571 2662 +13138109094843573761 2665 +6544485718564688390 2666 +4168731610225209858 2672 +7315066071491735044 2674 +11306658702491732995 2678 +1460741416990041090 2681 +8624484085251326469 2683 +4952143576172173826 2688 +11470130411385533445 2690 +8808161070990530055 2695 +3407659004810532870 2702 +9761503347061253645 2708 +347929962150473217 2721 +15682869073661250565 2722 +12636859761190001153 2727 +2169559175677957635 2728 +6583723534446631435 2731 +11332478688871909892 2742 +3541912969021597188 2746 +15665073567582359041 2750 +6811971824255872515 2751 +17832657550632072714 2754 +8908928359280249862 2764 +16149194899805562374 2770 +16584564148406323202 2776 +8926638669588577795 2778 +8056234806465729542 2781 +20557314279745028 2787 +1574148835258315780 2791 +0 2795 +5593745704266732037 2795 +8450014032945361420 2800 +7024373575570305540 2812 +11737655816003366406 2816 +4727037432569372673 2822 +8600949146786643459 2823 +9003058529087846919 2826 +14052664559056898 2833 +1424791599736305667 2835 +5413427196124183555 2838 +13050600684981920260 2841 +8589685071512056331 2845 +13186761374251900929 2856 +14090913721681066498 2857 +0 2859 +2742241767433926657 2859 +6309431184810384395 2860 +16867533333923942913 2871 +555261403132789763 2872 +5659601479152637444 2875 +18276768397881284098 2879 +6852010445819064844 2881 +16631838326863331329 2893 +246764640492975110 2894 +1313867708490425347 2900 +8944238870676823556 2903 +1060867472129666057 2907 +16635885715046522883 2916 +13334184179287121921 2919 +1341139991463623173 2920 +0 2925 +6310211216600221189 2925 +3521973268169620995 2930 +1462184866304097281 2933 +8359017763585949185 2934 +14138351761235446785 2935 +6817592922583008262 2936 +0 2942 +6385096150346020868 2942 +0 2946 +5484657660585723395 2946 +10615912620259059212 2949 +11956475177743584771 2961 +14617995947569946629 2964 +16460942815259223553 2969 +9814422111234662404 2970 +4608931955518876683 2974 +8617716815688349187 2985 +17740454941921826819 2988 +0 2991 +10586556775954286081 2991 +11028786367153901576 2992 +7561184979369551368 3000 +10180555287637633027 3008 +262376940139235842 3011 +1252244297117510657 3013 +17286434400127825418 3014 +11940732067173687811 3024 +9446744360256471555 3027 +583923543216445954 3030 +8153426984110241281 3032 +8998238685693393417 3033 +11022193474305971204 3042 +18018779292443289604 3046 +13782486654821986817 3050 +1031535266324627457 3051 +17367371162468022278 3052 +16063095350159409665 3058 +16006913374966627331 3059 +0 3062 +317830424679224322 3062 +14882116247225631239 3064 +9977848214775454210 3071 +15016859152309685763 3073 +1451917599200393219 3076 +14163345466838668289 3079 +7124786413716748809 3080 +8972415547684808706 3089 +17905923295565835779 3091 
+11508735911159903238 3094 +1060738927182784515 3100 +3235164743035444235 3103 +7249634886133244929 3114 +13627026919527422469 3115 +804144428748921345 3120 +4260278694170215937 3121 +2554890109424057864 3122 +0 3130 +2939022249034957313 3130 +3727916159743203841 3131 +14170274700031256577 3132 +7153627445263524879 3133 +6798175517396767234 3148 +1899052595905691141 3150 +4651137331222245891 3155 +14020723224952528387 3158 +5768869715157669895 3161 +13394211108659571714 3168 +15788932119193980932 3170 +13584005658508513793 3174 +9286626632069867523 3175 +2398026920081879562 3178 +1285989134179298818 3188 +9371873775174273029 3190 +18182246561705410049 3195 +3627164815665507843 3196 +18002283031389555722 3199 +13723140536667785217 3209 +11940684153082156547 3210 +16151440538186193925 3213 +13475891972713434115 3218 +5932226594251481096 3221 +15508203776273810434 3229 +13958242421862434307 3231 +2178759546197172739 3234 +12536204645038731778 3237 +14021691565090239498 3239 +0 3249 +18424936840617633797 3249 +9515558058741110274 3254 +14427656809453646337 3256 +15295479713001905676 3257 +6924455800485778945 3269 +5547275743159208965 3270 +15965423529103676930 3275 +6276065480049782274 3277 +923852355669990415 3279 +5171389834127005698 3294 +15756927494767584258 3296 +5380717287071449607 3298 +6048706605171842052 3305 +10493631130929582093 3309 +2792686703001238018 3322 +16318095573166788102 3324 +14961739739381704706 3330 +13885085964549002242 3332 +8803999472247604229 3334 +13681809489997040642 3339 +1274343414475602434 3341 +17525390131260455942 3343 +4637625228183366658 3349 +8313154017818126861 3351 +13090076428282480132 3364 +18133227728108545 3368 +8282473413611347970 3369 +107193099920609282 3371 +8505179371271580173 3373 +11102079825957593602 3386 +10212767298703785475 3388 +5215453497761775618 3391 +3298152084179375111 3393 +1095163960428030473 3400 +16887781145875813889 3409 +14786085928210816520 3410 +8581278387803219458 3418 +6241337607249230852 3420 +9254719800476612099 3424 +2568855290428722689 3427 +1289519920250085381 3428 +14618186241114017793 3433 +9612541243912769538 3434 +13926515287424429066 3436 +11093957915681312769 3446 +12010544601346956290 3447 +11839562359654205442 3449 +6839541636025740804 3451 +6012482217637302795 3455 +0 3466 +5775335776577318914 3466 +2685494297938271233 3468 +18186802079969910787 3469 +3127521196291951624 3472 +6934893239724900866 3480 +11630798772510404609 3482 +2767762624498050052 3483 +14135084772626181124 3487 +11643008759045397001 3491 3500 -5303047073946667464 +14107087915135404740 +3545799512027105927 +32996413518841137 +15568274631689570656 +20587511236070012 +2390363305266056430 +3863606920688567965 210658854139 +9870724567599405 +103154228 +3007753865419557454 493093586 -15289397349632312454 -5941764183477191834 -3477193953305167424 -236453760381 -7470284155521404014 -24445261 -16426766960960540026 -14549236 -817365937 +814220189 +538968856 +45810044 +11403474 +2625321602296383846 +3076135121411313050 +16635669954819197974 +5514354727165429372 +18413391979390173264 +3544953467117898450 +6361518319333476776 +5833854247140395797 +518849275 +2752627 +71565807 +9870724570416301 +163316374 +60096910 +817038254 +18411417877468545037 +5993603989931887912 +1873618523431177265 +14787093348585572176 +18413109988782047308 +1283692271244348427 +17461812017531651650 +13165236096819726043 +14883032307819284131 +2789363538679106064 +11161692095903435283 +62914993 +2365498112266798670 +154665586 +13726068529822894439 +5570718 +544604964 
+33560368941433940 +819856323 +1873618458944931675 +1873618489039064439 +6156738032733324876 +10259573046193883986 +6208295848581203181 +5991347927496394467 +2272905061487347697 +8972557000702888938 +15289397384024950845 +4767727591019973374 +10758418391935812957 +2292825785040636736 +1545208828 +219257441372 +5569296714050766113 +2207492642904016905 +12612941966326959190 +12426051065400527122 +18331556280207363 +2785415334835848520 +6156737968247080128 +15292217517958891614 +5780604328577598853 +3188833133853148985 +4078298757842341053 +6051485356288903427 +573178715 +102957618 +91488775 +2625321602296187261 +114426460 +22675774 +11206864 +9870724567402585 +5406444726343502428 +68551110 +515834601 +2431124533 +538772246 +11065179658016983681 +8930986418384079868 +4076606646528706921 1873618471841499416 -71893492 -10694515171064744788 -29330183088506125 -61997475 -4653200 -109445719 -8926052536804313893 -7528330190111771360 -1418462186 -5887104182899575287 -2625321597997091447 -23407864425745813 -1647838213 -6152225753094686522 -14151987057237756511 -18058417591402760409 -538510099 -17855463731522440261 -240752528220 -27920040887059601 -11078361536363433136 -12517601 -15885957841278600403 -518718202 -805438326 -2621553 -1550910461 -2411070513 -59965836 -13012951802392676509 -97518103 -2625321602295859611 -30277976 -546374457 +3701601059573925529 +16166203682344470241 +6101795981361546864 +15289397371128186695 +7569568047215545466 +18411981910273949729 16759426304739641933 -259654328 -27356063970624739 -1873618458944931675 -6209987959894902621 -5728764444739437994 -18413109988782047308 -13885455448020813663 +48431492 +24535874148371011 +14024943 +59900299 +105775699 +10770155859627543824 +71369196 +9870724570219682 +163119765 +2530739313276357975 +5052785364214352114 +805372789 +5652457623480305518 +644809585 +816841645 +2556016 +4501477955215362649 +4502324021619918399 +2150364451440035988 +6156455943246842659 +1873618497637649718 +12309852946450942075 +3660444556051220001 +11103300151687644832 +8714520725396523830 +5461104765611607541 +27356033875641745 +5352348805862394041 +2012415014 +5151629580948802356 +5374107 +154468975 +108593749 +62718382 +16843031 +28311895 +1107456968073808590 +11490081257974859839 +16633695840000739887 +9386257335747873389 +4959080478982475006 +11408348231855703653 13464164481390611573 -5514354709969504081 -6364097374632348674 -2676033351739376985 -1136798196293306910 -5299098874403555921 -2120987217453057458 -17306856587979066781 -1873618532028844481 -5572365145471912335 -18412263926676652075 -105382480 -5303047039553965447 -9881712940254169714 -152830562 -8610102806501591788 -15524263781940136850 -14282671233461718187 -2857298572705729021 -29330122900898936 -10554335258691243263 -8453377129057749572 -18411417864571256842 -811271050 -1873618489038604579 -4657106642463886071 -2676033356038145381 -514654951 -10757572347027851837 -4237766514325588729 -571999061 -9821766011288487605 -7230168968130792223 -2704904949959166469 -1823671323 -103350839 -46006654 -2755882956846859930 -15289397371128186695 -12662636664722033563 -16318735 -18411417894664929297 -5462796894122411284 -9950019064427710530 -6981729909914862956 -1992588707391932346 -63766972 -6422699 -23407808536904833 -15394822466617412826 -16881139139804531782 -14312300901618944289 -2625321593698061230 -9870724570679212 -5780604289886653255 -3870997034531752803 -2531021389865944442 -10908568553618343357 -1860700038481053299 -196215461 -1801847830 -24183115 -18424247431471827427 -14287090 -417019855960 
-71631344 -4391052 -61735328 -18413674012989259870 -2625321597996829544 -17957750408840481687 -9870724568648556 -41943405 -2789363542978135882 -18412827950883864637 -548143940 -22151483 -17257283845880874759 -899112529018292807 -538247952 -69599701 -8510664359869943178 -27356081165698156 -27638084672359236 -12255453 -11400819049620310987 -1321272283 -16881139122607162703 -2359405 -3101815889301670444 -518456056 -9232147856523987724 -3758799212073651272 -3591160524196219107 -154600049 -17946608694533885076 -11500631658516907905 -825323275339564903 -9870724566615620 -39911783 -12318365723907459763 -546112310 -18412827980977537092 -536216330 -2676033351739114988 -11069796553860646809 -7880043043777809442 -451412296787 -18411981918872141859 -11678577273375754735 -8856014234050823647 -105120332 -1309344723 -162464400 -681145240220010584 -2626514825137096412 -6589396841525218018 -356832249381 -6156738032733324876 -11202456151687629452 -27638041680086900 -11243723090649876783 -5726358144768542273 -12498251711624252784 -13702827714901707594 -811008904 +15494005608834598990 +1407386597 8192198 -8714520725396523830 -514392806 -9960543895307946415 -15287141235608259625 -5727354401416546168 +219257244681 +42598769 +811008904 +2573543610120276856 +5356297048398365877 +7595953279435999504 +5726226297114658480 +2723374776553770162 +1543385872365455415 +11535686880442518166 +15289397379726773461 +5565348488711963913 +504169174 +9870724567205432 +14212253575230457510 +5831598111619679502 +2625321602295990612 +572982104 +813826970 +279448324634 +538575636 +11010253 +68354499 +11243723090649876783 +18331491793766525 +15292781563660995825 +5991347884505304103 +9409295256684857617 +3759645248384009814 +5832726134240118664 +14312300901618944289 +20305615210743190 +13001845694847518363 +2652485274356286816 +6151097653090126690 +2203332276215481610 +18412545964574834746 1808894516123993997 -3686437022462641529 +518456056 +2359405 +1321272283 +71172585 +417019398489 +18895516000586505 +162923155 +9870724570023121 +13828334 +2625321864544389907 +816645035 +8453377129057749572 +11949535972653271176 +1873618467543321286 5249797181178709209 -2625321589399030850 -103088691 -3062219857732765097 -830399540494469985 -530117487457144076 -12454108019635062383 -197984938 -8930986418384079868 -818873277 -16056587 -11526999220155450649 -6160551 -63504826 -7621890105505615217 -11847668763332905754 -10377426660276898779 -1873618519132015281 -18092519415945890646 -15882855708139391266 -7993599274919922706 -2789363538679106064 -2150364451440035988 -9870724570416301 -2625321593697799226 -91161094 -1410073577 -23920969 -7513578521803359945 -22279798815198594 -15520597512816297356 -1023125932615797552 -540017436 -8910392170935354895 -195953314 -644809585 -14024943 -71369196 -1873618476141774348 -816841645 -10906583479868327250 -1454041666728626384 -4128904 -18413392005184749654 -108921430 -468609401971 -16204201012116260706 -99025451 -9870724568385196 -18412545943079354421 -11878630053446878902 +5567604589840172352 +3707523343842937215 +17088205463377873568 +2169005683868174908 +9568723490388248888 +6103488088376871190 +4025969582498383295 +62521771 +18276979644936029994 +154272366 +16646420 +544211744 +28766107292140894 +5177496 +509805280 +1873618519132801026 +1873618544926132491 +7676326001635166459 +7676326031729298383 +869984510486186619 +13146357072728951328 +2000487899013646903 +2449021711964768402 +6155298010574883251 +6098975770044401989 +3189961199463959445 +2676033351739376985 +7995587 +19464489 +547029825 +219257046468 
+2021331689141374237 +15288269301218674108 +11705421198335413148 +2508194873 +2625321610894575340 +6097847713031849822 +16064731596255856452 +13701595356116683915 +6364097396127827248 +18413391987988365394 +16364556117061994922 +10296839827164892306 +5403008449516603011 +15858116883009440274 +5833854255738587405 +45220217 +194314911 +10813643 +68157888 +56689033 +114033243 +4287350266942457603 +987047180239768912 +813630359 +18411417886066737167 +18413109997380239438 +11548493110908749415 +6364097387529046615 +5561348123192067576 +5835546388547569431 +5246976935469649046 +13884327378110449525 18204249488608200784 -5566476545725367766 -17951898368652543383 -7558005371879033601 -16542141154387102177 -6316393479032998553 -11694336983993944146 -11427331956784106382 -4662073785906890031 -1873618454645640429 -537985804 -12999620585941961275 -2295119206548507606 -11993306 -1597536180772867045 -5299098844309358384 -8294669686619703163 -69337553 -1873618506235448739 -518193910 -5406444726343502428 -16765215479188031591 -5460499803636172954 -3431717683755289915 -28202117477106938 -5249797172580910311 -5745384143842643344 -14065038233622153931 -14311172801615955497 -16758489844492275047 -5510538272098551989 -11065487220741573048 -9870724566353399 -5679882735784101879 -259130038 -87097857 -3491703471172619422 -545850164 -18271599167641487963 -5991347923196709309 -1873618458944406678 -7033448275620070919 -812778389 -434977997061097911 -3445982126355516078 -2676033351738852867 -3545799512027105927 -1873618484739311861 -12749251354825264418 -14836382508930370955 -2625321585100000596 -21997756618246082 -8716776809328151764 -15580874176502892132 -3332575624131774585 -4445946672738010859 -5780604328577598853 -2848264744227112681 -1873618441749072804 -257098416 -4930631980557601532 -6877319166685482198 -1005889956380019628 -820642761 -17826079 -23125779236849772 -810746758 -7930050 -8929320279979198383 -9654763076979264499 -11949535972653271176 -1873618514832984063 -514130660 -18066207382028748450 -2573543666009114673 -18613585580197092 -1427238547443354327 -2625321589398768544 -102826544 -5903884228619468800 -4279043148 -7036226112429884975 -818611132 -15794439 -3324580943442478547 -1903640920853056624 -5898403 -1873618497637649718 -1133620887485417426 -10156853965084755435 -63242678 -282723005 -13586095437453200186 -9082058141968173941 -1987794462939089941 -13237708531286474753 -5240852582657493474 -1915314009235720841 -9870724570154139 -90898949 -17090754651615726815 -492307151 -195691169 -11050161621988804687 -23658823 -11623400942792738969 -9304480456320748248 -71107048 -816579498 -23971751058934778 -17869638717220195611 -1873618476141513316 -361675971417279818 -61211034 -1873618501936418049 -3866756 -567411536 -5302201063430292982 -8486888319115725460 -12406930521299355297 -9870724568123690 -11034422950646711803 -4287350254045103750 -5566476545725106758 -1923875870 -547619651 -6366353527348595732 +70975974 +9870724569826462 +816448424 +4211213383 +2162794 +12974919760129952993 +105382480 +5459976661309982295 +21433723812579518 +32432320527074663 +1873618497637255436 +9305858029919208637 +10225919154718574351 8597156797828894009 -13590665243542948895 -13237708561380147208 -4254959725487523541 -2907303882175415846 -1873618454645376983 -9230753948926543533 -11731158 -527827717 -5511666307614640107 -1330643932 -69075405 -28202091681942395 -4727296740454696303 -1992881785902860007 -18301216972081072101 -4076606659425995504 -9870724566091296 +12461042340477994821 +1455946274504313841 +9538952396691934382 
+927164962728314711 +5782296426993943791 +9714916684781063078 +16449809 +4980885 +819266496 +2625321589399030850 +10907429529076434052 +257295025 39387493 154075756 -5459976644113468289 -545588016 -12461042340477994821 -223556406340 -32432337723721245 -19595563 -2573543610120276856 -24535874149025753 -5196265237615086368 +62325160 +1495925747 +288043895627 +4504298205224635444 +14835085562484362568 +16881139122607162703 +1839046019115124804 +11923578915473263059 +9388513449772451585 +5247593352907982888 +5153885686374731086 +12020808312486431384 +14848239906707278405 +5405598728725530322 +3653991426073234491 +5566476498435442740 +4333982245204396969 +17007720368052373541 +14458654042895551171 +16885259953617962521 +2676033351739180486 +6877309693745106245 +21997713627284659 +7562235540534921217 +2625321610894378836 +5458848587099997499 +1647838213 +288046714075 +1454859013759438228 +1133620887485417426 +237175467 +810615685 +1418462186 +12162857194684744950 +88080898 +19267879 +7798976 +546833214 +6206321690771522709 +21433680821684597 +1873618480439692390 +3932922014897081298 +2549492329236335496 +5249797112394286460 +12294570438877711433 +2324121364801391676 +3315661715940248009 +8971880411373045432 +5461104782808583112 +18411981918872141859 +15371922320578972378 +361675971417279818 +90898949 +13390152586296232130 +492307151 +13522668389390157414 +538182415 +10617033 +12498251711624252784 +22085946 +1987794462939089941 +425617786716 +1730937871 +5356297014005859746 +5569296739846327213 +16881139139804531782 +4196703391028741586 +1873618476141710425 +821147663836514852 +3158171969379764633 +30176223702288623 17735566651085687884 -6204347601746593065 -1873618484739049815 -812516243 -6152225714402428442 -15291935501556190620 -15505670362359531298 -451411772583 -9484411285755463284 -161940107 -15292499508566297469 -563348302 -506004186 -11238431078799509026 -18323667541285735009 -2625321610894640833 -103179363763488430 -503001580666 -12769025487284210679 -17785259844527786731 -29612147900877606 -15290243377345399572 -17563932 -7667902 -3186488476490139978 -810484612 -1192315333980326167 -1873618514832721746 -15292499491370961900 -513868514 -5347351719937377689 -45220217 -11775490430040476325 -12240192446106372977 -35324256 -2396555433535145871 -7409502855497715015 -7888341864134085054 -4278781002 -1732546121802517809 -2374936041605498895 -21433680820701635 -12189960762281954023 -869984510486186619 -3598203394278688718 -6103488079777762245 -72876542 -16990917635978692369 -818348984 -15532291 -1146796961722731823 -17761874897365304540 -62980530 -4534407021717882867 -5636255 -32714379920409891 -12552846396214610071 -6262673798361580735 -2528483177756102046 -9870724569894177 -9297735470756268616 -5831598115918776853 -32432303331018178 -6064762127302393958 -6156455943246842659 -23396678 -13500652 -16916327697533962956 -70844900 -816317351 -18411699885273055253 -5884848047378859255 -5837238405281154301 -14311736903207619026 -5141736951422061236 -3604608 -31022281504523376 -3599049409094225259 -577045344 -2974323816123992770 -8021450341214588326 -3577503648415550265 -509805280 -9870724567861628 -11098517635487303139 -7462549834646555859 -98501157 -5779476207078475458 -219257375260 -490013379 -4222974949961697922 +1427238547443354327 +10223260478367337870 +10720606758114626648 +70779363 +105185869 +162529937 +9870724569630759 +24904017 +2681814701524780811 +1320879066 +1584661506 +644219759 +13435115 +6097847786116483627 +12477949191893683608 +6925759835249836137 +27920040887322186 
+10003084053115964048 +16253198 +153879145 +2625321589398833886 +257098416 +4784274 +9103100569952650951 +12474564753552836994 +1495729137 +62128549 +9774054990929462949 +5356296971014964874 +6153353870293665804 +9568883447315500158 +1915314009235720841 +16655465042802838677 +14866462842593414402 +2676033351738984017 +546636604 +535167753 +42008942 +30540122 +6365225483234117329 +7602365 +282854078 +2625321610894182276 +13307798926833551183 +10913926882465549337 +15906307047154976446 +6104586261131037638 +8483828720841721486 +15287423226215073909 +17785259896117529586 +2785415278947600352 +9000175594581527004 +14425661002709010016 +5513226652957347114 +805679481429165719 +17859691850682797212 +9181555677596944971 +1363739614 +9870724566615620 +537985804 +572392279 +15175534989820758889 +1873618476141513316 +2152780467316001469 +12601357272775920269 +16765215479188031591 +6534429686359852912 6366353553143235674 -3158171969379764633 -21365044 -27638058876667848 -29330140097217635 -1873618454645114642 -2703776923039566000 -68813257 -279448782049 -814285726 -12237654319976351671 -517669620 -5779476284463187670 -10375505326587315831 -18411699915366727708 -6205475624366966000 -3307734082 -39125348 -1087507565178193378 -545325868 -15986098390340470919 -223556143025 -19177592590632702 -8865366478519731984 -19333416 -32432337723461001 -812254097 -11305519054433421356 -1873618484738787248 -5105416417023100899 -572982104 -505742040 -563086155 -104333894 -8070528080642443989 -11327137566841769230 -2625321610894378836 -16377260960560187819 -15586729198848181726 -1873618441748546884 -18413109971585663048 -4825924017323379312 -5915592292141435844 +12689613402799605860 +9138963602338286574 +104989258 +644023149 +361131345578 +816055205 +9870724569433729 +70582752 +1309213649 +17634738504986593825 +5639662680184522626 +6316393479032998553 +16340493341965880015 +5344573059048999857 +34124461934314600 +5994450030541998229 +2625321589398637514 +2676819007 +15515140772745448064 +498702419026 +227855238971 +4587663 +16893851890367073119 +14264208198271043974 +555090760 +818873277 +61931938 +16056587 +8821966780582857359 +18411699885273055253 +4861149623842704773 +18413391996586557524 +18115578910873816258 5832726151436896491 -17247780946628644032 +365262179507571896 +16896582888638318388 +4445946672738929841 +17186370630874106258 810222466 7405754 -11549275701007551889 -10161648502327149991 -570950482 -1873618514832459339 -313841222762 -4452458274095237609 -1445774942907271091 -6101795934071424788 -92406286 -5293539447540681024 -18331491793766525 -197198505 -11199980773228349986 -32432320526091507 -818086838 -1997667722089860216 -2524806027085153844 -1964966944 -15270143 -1370042529145686776 -5565348523104797810 -18331539082773742 -62718382 -2012415014 -18413110001679335503 -5374107 -14282027259104724924 -10375505339483621145 -9887461037680036022 -1873618544926132491 -4662355883991631380 -18412263939573940270 -157614716 -3295137431799204142 -9870724569630759 -491782859 -214958343888 -16875205763331852041 -7241607903360452069 -5408471212899110030 -23134531 -18411417877468545037 -27356081166681957 -644023149 -70582752 -816055205 -3342460 -5246976952665638015 -14212253575230457510 -576783198 -1842511416005692464 -806159226 -5566476498435574920 -15292217517958891614 -13516735047310051359 -5728764487730398405 -468608617008 -4025969582498383295 -16044698410490725659 -1519546451849645365 -9870724567599405 -5566476545724581156 -5619444426388998007 -98239009 -547095362 -27356033875641745 -219257112483 
[diff hunk elided: a machine-generated data file of signed 64-bit integer (hash) values, one value per line in the source. The hunk removes one multi-thousand-line block of values and adds a comparable replacement block; extraction collapsed the per-line diff markers into run-on text, so only this summary of the hunk is kept here.]
-10831084194895235709 -6226088 -6366071472254485645 -10441809166173275876 -9538952396691934382 -5994450030541998229 -6835382734606174906 -4397798273518472097 -2625321593697864817 -9870724570481756 -17782439637510195701 -31304332299601191 -4074350515307087985 -10758418391935682553 -11405246090117384413 -196018851 -17943317531894613402 -15289397375426759758 -1801651221 -12716605781588708278 -5353476789790574588 -1873618450346936800 -14462121002204464918 -2785415309041207732 -71434733 -10770155859627543824 -1873618476141841211 -5780604362970367638 -2530739313276357975 -14090480 -5567604589840172352 -296644709200 -11266915032714840583 -4194441 -2200512120787569683 -2549492329236335496 -6211116016906930204 -99090988 -9625506809262378259 -13237708535585570818 -490103571663 -14541340640523322842 -9870724568450966 -1793158821936040552 -9486667438472824267 -21954873 -538051341 -1398211555 -5408700909154273182 -5356297014005859746 -8444237263823374707 -69403090 -2599235317101562153 -15897859265386515143 -6097847713031849822 -2162794 -9796067026192895123 -13117159209037203716 -164299420 -17088031212435737557 -8099682237308012832 -8971880411373045432 -3099205763721988894 -9870724566418979 -545915701 -13237708565679243273 -4449074137450482853 -18115860927276518423 -5247593352907982888 +10754752208794748192 +4770547841029572913 +2236213724530672835 +12085352355710699032 +2625321602296121720 +11141327 +68485573 +22610237 +279448454880 +573113178 +9870724567336570 +17097308613030775025 +45547899 +102892081 +538706709 +813958043 +2150364429944620611 +15290243360149735302 +1004761907965660269 +13427220419978791304 +18412827955182960702 +2116750858326574509 +11847668763332905754 +15530553734630409457 +6366353492955694732 +5875369269012203157 +7528870385169533979 +71303659 +816776108 +417019528294 +48365955 +2490479 +518587129 +59834762 +163054228 +9870724570154139 +5300226909920168653 +1493907215600323645 +1873618523430913566 +6293435910568086045 +4661227814082512385 +9231865831523354279 +4562124381333884039 +5994450030542718433 +8468495303965871404 +6151097734773868363 +5308570 +51184007 +16777494 +62652845 +7410231625909407077 +8336200085500660803 +10377426660276898779 +15108492788614827244 +5405598728725859379 +5349367342193968413 +1873618489038801706 +12851509922494416016 +3812049113439014303 +14151987057237756511 +10744889200825601240 +9304480456320748248 +19595563 +219257178788 +15798241638577276499 +3017170418691476659 +8857706336766199103 +13957562954616997312 +8126661 +30740222110467489 +4662355883991631380 +18413674000091971675 +6203168539198949844 +12480382642832672298 +14814059355306855043 +10229733804179524009 +6887529782530082437 +10905173350565414454 16533468055605152863 +3758799212073651272 +240752528220 +68288962 +1318388695 +6350640494756627870 +5462796894122411284 +813761433 +10944716 +2625321602295925195 +538510099 +18411699898170343448 +13880529192106527090 +18413392009483845719 +25099937047839912 +10440328872292254866 +6835382734606174906 +3686437022462641529 +816579498 +541328159 +2293868 +71107048 +105513554 +151388766 +9870724569957729 +5197393273132877515 +5882159627828332650 +7851156332894489575 +11510172448171493799 +9250794030848869727 +8327325764367420682 +16778219695595128445 +8733200714257204941 +16580883 +13513632858283052077 +1461650414 +2625321589399162008 +819397570 +8423108281783224429 +5111959 +1473119215 +257426098 +62456234 +39518566 +12622921449567619867 +10531295803425490030 +5566476498435574920 1873618458944474091 -19923244 -3188833116656765520 -2676033351738918494 
-4501477955215362649 -17621268784989013395 -14581169549127125939 -6206321707968234614 -33278352538406314 -516227820 +1873618489038604579 +18613366323088485 +3008657884776564221 +11621189233770302317 +5464488953846367762 +475082778886408323 +6155327937823509637 +9956242218935388443 +11065487246536017596 +2676033351739311464 +14361095630442006439 +810746758 +283181761 +2625321610894509848 +546964288 +6365225423046182853 +7930050 +6262673798362565305 +1840738151924108521 +8483828720842049762 +9013091190501541709 +8294669686619703163 +9288263741171697848 +15520597512816297356 +2792183694107936667 +9227353569080969220 +2429880031182916564 +5833854255738521265 +18412263930975748140 +2430665780 +22217020 +18301216972082383618 +11964228788262537512 +159842942 +28766150282773591 +538313489 +813564822 +7032232753418799293 +12348736218027264207 +15290243360149343057 +6406389097441592980 +2964529530791004471 +18613559784442970 +1873618476141841211 +5991347884505041337 +6101796011455220816 +6366071455058494624 +6155045908522469290 +8412057600244518110 +3478039985316562895 +12718336251608304605 +70910437 +4211147846 +197067432 +14443179094226111164 +2192639020 +9870724569761068 +105316943 +25035091 +162661010 +518193910 +5303047078245827995 +1903640920853056624 +18092519415945890646 +4127600455366674792 +6474545510181176536 +7731877951544692100 +11084138473134491150 +2625321589398965240 +1495860210 +154010219 +16384272 +15043322265989680207 +6204347601746593065 +4915348 +62259623 +468608617008 +1966081057 +1192315299587689576 +17256155806064642777 +1873618489038408278 +12662636664722033563 +1654120425802828663 +25099894056749168 +5299098874402571932 +2676033351739114988 +489423554 +30671195 +5411521012994803182 +42140016 +7733439 +2625321610894313322 +7329667560521271617 +6206321690771457172 +5967447917778832244 +2284412389694637269 +2572415553107265488 +18412827963781152832 +16904712944498838074 +15289397349632182266 +29330122899915877 +27356081166681957 +6173800107753209956 +538116878 +10551496 +3919394969679695174 +9870724578216632 +492241614 +8816997369364548341 +4662355849599126556 +16567374854657149772 +12884708026702235763 +6364097417622914469 +1873618532029106835 +8861613698626554738 6890349946557761313 -1411918553413126104 -162267790 -2474797953316292924 -1694703987789596868 -18172096623373846790 -28766090095429261 -1223976979390989739 -3221822110943152678 -104923721 -15185362616787929146 -10003084053115964048 -2625321585100065781 -437798118096833445 -1815348248 -31304323701802109 -152371807 -14046027923586223423 -2021331689141374237 -20869691006257762 -13044533461223476582 -16778219695595128445 -12057002331826554305 -17465760298758178660 -7576852735584046364 -129168850403198609 -820708298 -17891616 -1873618489038145001 -7995587 -11911353550167017696 -4522983015860209939 -12612941966326959190 -102892081 -2625321589398833886 -45547899 -11548493110908749415 -4076606693818764590 -7851156332894489575 -12779163922391107832 -5991347884505304103 -1095239150174145285 -3863606920688567965 -10771469979967884371 -15859976 -14312864964518020808 -17245750799710423012 -5963940 -10655291933708585535 -4162099616697747321 -63308215 -1873618519131818153 -30176189305784773 +5837238474067478018 +5780604294184830225 +11214563466576463780 +29612216687004362 +5516046782590617836 +10156853965084755435 +6151097683183797493 +11613165301442872555 +1986666427421953426 +6155045882728942511 +7033448275620070919 +2907303882175415846 +1320813529 +1584595969 +105120332 +7465404271946632160 +70713826 +24838480 +162464400 
+12451287838412704489 +816186278 +644154222 +3735453693364143828 +9870724569564298 +1309344723 +21715680028329254 +13044533461222491710 +1873618497636993704 +3445982126355516078 +7529998377695250462 +12237654319976351671 +4534407021717882867 +3431353251379022211 +494159375523777824 +1136798196293306910 +16426766960960540026 +819004351 +12356593998396393868 +16187661 +3307734082 +14273081993166850387 +4718737 +434977997061097911 +62063012 +2625321589398768544 +39125348 +30458248699315998 +17858552114457937219 +5903884228619468800 +16872385650894636566 +10504814416598927327 +12213481117926952067 +18413674008690163805 +14101026494875963 +4709060078846741586 +2676033351738918494 +9714916620294556051 +13237708535585570818 +810353539 +2625321610894116800 53412232 +434216307724 +7536828 +41943405 +6770804071406897080 +821822415 318140582948 -15611911946388048179 +6365225453139920066 +4502324038816629924 +4030203865610717075 +18411699906768535578 +15290807392954681807 +11966722661119888545 +8618954206934993224 +12189960762281954023 +32432333423379563 +18413392018082037849 +6004915412369541960 +14546784569801180959 +745740842898098858 +15289397293744523027 +5299098870104394759 +9257009393629660721 +5900805793554762144 +6155045917120857525 +21823800 +1317798870 +537920267 +1730675726 +1535706104 +9870724566550039 +14648423506775771515 +10531295876509927029 +3973490993339565383 +14312864964518020808 +14824583848163281869 +16940553195690134509 +1873618476141446514 +5778348218852443426 +5758903550139959418 +27356016680241600 +13940760354079114484 +5645056620059298667 +347984565637089693 +815989668 +9870724569368358 +5887799994573980828 +162267790 +517800693 +70517215 +15925946803693423456 +2625321597997353452 +16572875051796793000 +575144796 +104923721 +13172970 +14426056237756189706 +5909364179964069981 +5459976691403654584 +4397798273518472097 +27920040887059601 +1873618527730926929 +1873618467542665344 +18613585580197092 +32714392818354350 +18613499598604650 +5780604289886653255 +3865299049198390675 +22279760122415532 +18412545930182066226 +50397573 +153616999 +2625321589398571980 +1736311827 +15991050 +14665059300281706 +4522126 +7792373120121047026 +30458248699119542 +13951205954302381098 +17785259844527786731 +6444189713816225654 +747829823970020707 +8698802578697685482 +14477731067643038195 +18412263939573940270 +14318336791419094928 +15291371425760087333 +12109395139072755174 +30277976 +99090988 +282591932 +546374457 +490103571663 +15580874172203795679 +810156929 +7340217 +638124907 +259654328 +18809125 +18056758355458722638 +5679882735784101879 +7563081637033018620 +8520914754064026558 +283748271730 +67502526 +9870724566353399 +7242736046355514492 +572130134 +514786024 +214958409445 +29048192479791616 +2625321576501808484 +5354604872597767596 +29048106498198701 +2575517759332551933 +6311975551774360856 +14036340911856223966 +32150286927595340 +17291573824845253535 +14926165161459649868 12640696470018459947 -30176223702288623 -9870724570219682 -33278412725750974 -1409876968 -28766150282773591 -1873618450346674286 +17498716255300421272 +3968978683605551949 +16377260960560187819 +19177532404009207 +2625321597997156982 +24445261 +5245848878456439955 +421319345246 +5510538272098551989 +70320604 +3249068390006196153 +5888081980883929307 +1836516380 +12976359 +236453760381 +2141513421469058406 +1873618497636600365 +11878630053446878902 +6156456003434055463 +27638058877519937 +18413109962987470918 +6288511205539515238 +4770547828131824981 +4160689491693538063 +14836382508930370955 
+12751507524739009261 +10427987387505837891 +2605266760616185153 +2524806001290315567 +33560429128451329 +4325515 +669516658 +15794439 +807142269 +5303047104041388600 +818611132 +61669791 +12644080653952551280 +6045857707735386835 +11229983338076703492 +2845029447323880298 +18412827972379344962 +6767393152337644543 +2673382969485886910 +15185362616787929146 +17490170188584258190 +4047541379259827663 +15680489859993767209 +546177847 +7143606 +637928298 +7276444624641068235 +12287601267178473523 +31022238513759415 +17698252132056434004 +1732546160493595960 +7036226112429884975 +2676033644081056812 +548995910 +90243587 +571933524 +812778389 +9870724566156739 +214958212644 +1873618446046923526 +3493083035910933027 +15291935501556190620 +14650572868605052119 +6971710725545264615 +17302333254828493968 +6098975847429179176 +4504298213822565083 +505938649 +3579577413 +2786543383251193103 +70123993 +47186305 +2352415791 +4279174221 +2625321597996960522 +1538130937 +161874570 +17082847207615236134 +6206321707968234614 +8854886129749066875 +10908568553618343357 +2785415326238639918 +1873618527730534170 +1873618441748940565 +5745384143842643344 +18413674017288355935 +16044698410491643447 +9181531069949872018 +10905173367761798655 +13237708544183762948 +3757107087862401328 +1311572948 +2034107431 +15597828 +2538734651 +5727354392818878727 +4128904 +818414521 +95879699 +5727354422913010657 +5245848874158263187 +9664889374910385451 +18411699915366727708 +14851060135220743194 +17958290734101235336 +9319686106503382840 +89657146100418951 +11349795265056081195 +14540810596246030644 +5779476284463187670 +18415907 +156041850 +259261111 +821232589 +809763710 +98697768 +6946995 +5941764153383128192 +17684252729367202593 +10233694917695638297 +970700105235760464 +21715753112570631 +17953636526298302297 +6262673798361580735 +5847102830955857465 +3313969578832561394 +2974323816123992770 +13271165719268362246 +17083693200934636558 +6101795934071424788 +16990917635978692369 +812581780 +16327183209838150989 +21233971 +1535116279 +214958016090 +2625321606595545096 +3232498753 +1500709877 +514392806 +5831598146013367591 +4502324004423927097 +3099205763721988894 15290243360148359553 -14036340911856223966 -6365225461738636619 -816645035 -417019398489 -6206321673575531611 -12057284352529139627 -71172585 -13828334 -7528870385169533979 -5832726134240118664 -2785415334835848520 -2572415553107265488 +1873618476140856959 +3295137431799204142 +14130457194541352666 +8910392170935354895 +3967850626592737364 +18412545938780258356 +12583138 +505742040 +4278977611 +540148509 +24052042 +196084388 +563086155 +104333894 +2625321597996763849 +16324853745603185849 +13586095437453200186 +15804734059994287439 +18005251247539029895 +13516735047310051359 +3493677603186412637 +10159956468397444373 +5249797099496672683 +17763448248357489818 +18412263948172132400 61276571 +7630443591734791098 3932293 -9870724568188981 +72745468 +95683088 +15401217 +4076606693818764590 +15986098390340470919 +1873618519131556994 +9386257309953099582 +8501910827968825512 +168849237244054062 +6750384 +545784627 +2625321585100000596 +1652810939277510396 +580191075 +98501157 +5198803303557629187 +3297856681506178941 +3935742187522887052 +2601013084734032090 +11500631658516907905 +8021450341214588326 +14977809576148535095 +4127600472563058730 +16965951797418331227 +27356081165698156 +491258567 +12804866717273491655 +1408762855 +2573543666009114673 +2200512120787569683 +2625321606595348609 +21037361 +14462121002204464918 +5619444426388998007 +3973491023432910866 
+12103109825679658143 +7260902865540482639 +5566476571519223063 +18413109971585663048 +17791918762976347730 +16365628939578247566 +4449074137450482853 +11214563466575480865 +7239069803025663720 +17952462371364276975 +9512531412808567772 +11075097734987253589 +2373415502940997016 +16874702537456224943 +517014256 +2573543627316201844 +4278781002 +69730775 +9870724568582655 +12386527 +12743882002561631754 +10906583475570214623 +104137283 +35324256 +10167863869407233224 +18412827980977537092 +363084051790629688 +11694336983993944146 +1873618441748546884 +32432320525830439 +12654580528992553525 +7241043922144659849 +9391897706793274792 +152830562 +1402930148 +164299420 +5303047073946667464 +3735682 +61079961 +15204606 1873618549225491555 -2360543918673038210 -98828841 -12512221777814685432 -17939922315943150958 -6045857707735386835 -21692726 -4502324038816629924 -11490081257974859839 -17639632887023929831 -1316357237551401394 -6101795994259359091 -11796695 -69140942 -18411699889572151318 -12074216556992400767 -1320813529 -8618954206934993224 -164037275 -4160546838840674266 -12591757708863407913 -555549513 -9870724566156739 -154141293 -32714414313178248 -545653553 -223556471268 -12613788024133322735 -812581780 -5778348150066318224 -1500709877 -6741138607599781046 -9227353569080969220 -515965674 -13884327378110449525 -18411699919665823773 -16340493341965880015 -162005644 -620757861 -21997756618049241 -17007720368052373541 -13001845694847518363 -227855238971 -17629469 -1737950228 -9288263741171697848 -20305615210743190 -1873618489037883086 -18613533990193666 -7733439 -313841551493 -15288551330518206781 -17302333254828493968 -6153071832396467338 +3188833116656765520 +31586327206235137 +820839372 +464309454125 +18022689 +545588016 +17205553309096938840 +313838798363 +223556406340 +98304546 +15463390673086056969 +4240022615453076686 +10831084194895235709 +11549275701007551889 +155648632 +6553773 +534119176 +4222974949961697922 +8326286517935867839 +1873618454645114642 +1146796961722731823 +5509410202188647833 +1873618514833377412 +3242943116712479419 +29330157293667421 +8882845388820581451 +12608147700378373379 +14465116522071263669 +5461104757014004985 +9649086479758069023 +2625321606595152102 +513999587 +20840752 +2148672322930150296 +10646954815923686447 +10831360821402142464 +313841615983 +10139438201185111279 +16881311723980129501 +18413674025886548065 +2785415274648570354 +5353476789791099071 2979056014524680527 -8857706336766199103 -2625321589398571980 -45285754 -5991347884505041337 -4502324004423927097 -16874702537456224943 -14911447610171655366 +6366071515245381876 +8610102806501591788 +10333839787251271664 +13237708552781955078 +451412690018 +16101055855214332856 +9870724568385196 +12189916 +23658823 +195691169 +5155859771100236117 +69534164 +35127645 +103940672 +11069796609748044689 13944990587222231178 -3308118261903721908 -18413109975884759113 -8412057600244518110 -15597828 -2538734651 -818414521 -17082847207615236134 -18276979644936029994 -5701792 -63046067 -5882159696614657105 -1410790466305853323 -18412263913779363880 -32714379920475611 -539325825270679628 -1873618519131556994 -13536993689470216 -9870724569957729 -43254135 -5153885686374731086 -9387385384162626351 -8336200085500660803 -5303047104041388600 -5512098595943810546 -5717788221838658971 -2324121364801391676 -12012735189037878155 -2192639020 -1873618476141316771 -70910437 -3670145 -2219404100148201532 -2544580112253650683 -61014424 -6155045921420412650 -18412263943873036335 -1873618549225229533 -9870724567926898 -98566694 
+27920101074341046 +17298949057997047589 +2908260051937332390 +6364097413323754682 +12350988444867431112 +1223976979390989739 +5782296431293302176 +11098517635487303139 +13525196865559988902 +2374936041605498895 +15007995 +1574765567 +519635711 +5831598103022077418 +576979807 +817824692 +634323816 +3539071 +2446394423 +6206321673575531611 +2360543918673038210 +27638024484621167 +11340219378265033230 +6366071472254485645 +4562124351240801677 29894215892535509 -155910777 -6366353527348399255 -9956242218935388443 -31586340104504804 -219257441372 -13522668389390157414 -18411417881767641102 -11534547 -279448847671 -7242736046355514492 -68878794 -814351263 -1192315299587689576 -2524775482 -34124461934314600 -507839197 -5539270545646881104 -4974759074281293673 -5337229686545450161 -153879145 -12644080653952551280 -30458205707308380 +6153353844499089111 +13070886371126478108 +9181481831755875838 +18067928196024961188 +6981729909914862956 +63701435 +6357162 +15288269305517836796 +17299513133793348673 545391405 -17877509356004052233 +17826079 +820642761 +98107936 +8854886172739175692 +9082058141968173941 +1873618484739049815 +11514789185312918199 +5778348197355914873 +11130039777759856047 +294416195335096411 +846140170598090257 +2571498445011814318 +18412545947378450486 +1408369638 +2625321606594955469 +5245848947242502849 +365428082633 +5245848917148372136 +10859426818132543221 +15524263781940136850 +2578187325 +17564225130023161250 +811991951 +1694703987789596868 +1873618450346936800 +12105446909435186010 +14975681650483333306 +32432303330887118 +29612220986426501 +11644189250161151139 17520266449292560845 -11065487246536017596 -2011949215506761725 -6155045882728942511 -812319634 -1130753852548581517 -573047641 -5299098874402571932 -18413674000091971675 -18331556280207363 -17269866578628118199 -15289397293744523027 -161743496 -10649664295314066054 -6051485356288903427 -4347925833116091776 -30458188511970924 -104399431 -10184384893691038634 -7401639761433855789 -1308623824 -563151692 -2625321610894444316 -7239069803025663720 -11434534198373320614 -1873618441748613384 -5622264654903379074 -29330122899915877 -15636380174699072146 -820184006 -2597848126 -10233694917695638297 -14585410861575638263 -7471291 -85348920764927349 -6366353492955694732 -18413674030185644130 -4127600472562141528 -35127645 -5780604337176709161 -541328159 -2524806001290315567 -13850612818404510827 -18412827968080248897 -15335680 -3493395603981665996 -17858552114457937219 -62783919 -3875793754648151904 -5564423899624572258 -292345154665 -3489447322753895731 -18411981905974853664 -5439644 -42991988 -9870724569695611 -12269921124804135698 -559088458 -33278386930321618 -15289397353931868100 -214958409445 -6219166245997316001 -15289397379726773461 -30458248699315998 -23200068 -12163381674616883890 -70648289 -9000175594581527004 -806224763 -89657146100418951 -15475002888547338265 -3407997 -60752278 -18411981936068526119 -14267039342724252928 +92275213 +335336768790 +69337553 +7290339324003420579 +17621268802185464283 +161088132 +9870724568188981 +516621038 +11993306 +507299956084 +210659444315 +103744061 +13151687854617134836 +8659114857360722535 +825323275339564903 +103179363763488430 +684134210602468610 +1873618501936418049 +6205475723246636047 +5516046752497929091 +15885957841278600403 +2477484405147109478 +16875205763331852041 +72155640 +472907842721 +14471968401314024391 +806159226 +1712194570 +576783198 +1815413785 +2446197814 +14811384 +507970270 +8929038315166239946 +3342460 +3220426554520570467 +2625321593698192308 
+5677488692584514734 +21433663625497129 +2435475427475262665 +16940455997476965252 +6153071806602085789 +5865888353649363875 +17465760298758178660 +13263754581809432790 +8716776809328151764 +13112992413209136128 +6153353788611431303 +3784724792312663401 +12590629664748537952 +2676033356038342054 +14219872676477209184 +11327137566841769230 +63504826 +97911325 +9339868219275806468 13726068525522684375 -1873618527730862181 -4504298213822565083 -155648632 -98304546 -9870724567665640 -13681696359428851594 -219257178788 -24535844054893958 -50011031689890353 -10532987940533372886 -11272401 -23407795639356361 -68616647 -814089116 -15635925519041823968 -1998521381 -163512984 -797977540607610221 -32150286927595340 -4709060078846741586 -5967447917778832244 -5885976078596834724 -2625321606595414132 -153616999 -1744643526947965735 -17461812017531651650 -987047180239768912 -30740239306197230 -15288833278135765839 -525337347 -5885976155981547843 -18413391992287461459 -10532987970627045461 -56689033 -5722409915131627177 -114033243 -10159956468397444373 -18412545930182066226 -5349367342193968413 -13819010092172884 -104137283 -17953636526298302297 -2224234517276395067 -2789363555875490728 -2625321610894182276 -12426051065400527122 -9355193091131312182 -30740222110861163 -14361095630442006439 -3137288237381257087 -17105177 -819921860 -7209143 -1727529996 -810025856 -805679481429165719 -17298949057997047589 -21997713627284659 -16120716880803858984 -33560368941433940 -1535706104 -10229733804179524009 -18412545960275738681 -9714916620294556051 -4078298775038527628 -5461104765611607541 -210659378559 -92209676 -13418544886826534789 -14264208172476401284 -1917322269 -197001895 -24969554 -5405598728725530322 -15073532 -817890229 -72417787 -1873618471842024407 -17091318705916150977 -5946696443085589628 -5177496 -5847102830955857465 -62521771 -1873618523431831649 +2011949215506761725 +1737950228 +6160551 +9830100417878166271 +155255415 +17629469 +8140646021471143544 +545194794 +8510103668314541335 +18411417868870352907 5835546371351184527 -14824583848163281869 -42729843 -9870724569433729 -5780604315680310424 -16385074671182940805 -214958147231 -3007753865419557454 -491586249 -17943317531893566468 -1801912319444323213 -22937920 -539034393 -27356055371580547 -1873618476140792146 -5198803303557629187 -6103488088376871190 -13041896 -1733362705 -70386141 -2306802734 -643826540 +18413109980183855178 +5249797172580910311 +10532987940533372886 +32714379920409891 +1873618514832984063 +13702827714901707594 +29330157293274228 +220421203678071202 +5565348467217401524 +313841222762 +570950482 +13012951802393594980 +6209141854797957852 +5717788221838658971 +5460499872422693597 +8444237263823374707 +2544580112253650683 +32432303330691092 +14986955239351847842 +4392112055939237960 +16285378285009240167 +6205475671656957491 +11266915032714840583 +15289397375426759758 +17284241873202253123 +1783548230307677653 +195297952 +69140942 +23265605 +11796695 +210659247559 +17257283845880874759 +451412296787 +92078603 +160891523 +539362075 +103547450 +9870724567992379 +11331649863678691999 +12613788024133322735 +13944415416121166662 +15895039144349470066 +8816997365064994109 +1732546121802517809 +13221120945827219803 +3863606942184311140 +12562453432512743836 +7562235583526800081 +9870724570810095 +71959029 +232154598652 +14614773 3145849 -14637903957824965363 519242494 -60490131 +2625321593697995819 +1133620930477295468 +817431474 805962615 -5784522635265967958 -1873618527730601376 -18301216972082383618 -11644189250161151139 
-2625321602296383846 -9870724567402585 -98042399 -15741861301866530650 -494403323033 -6729754102968812754 -546898751 -6208295835683456476 -33560403333875446 -14409153078548760239 -15530271666638163275 -1873618458945456185 +4131548706499659810 +60490131 +503001777494 +6206321673575138470 +1258091056198584472 +3573803894998305775 +10967349376607587326 +1873618523431569790 +6153071806601889790 +12749251354825264418 +9625506809262378259 +2676033356038145381 +15635925519041823968 +5885976078596834724 +9484411285755463284 +532291916112267238 +18411981901675757599 +1703347206 +33560368941827284 +5303047039553965447 +40370537 +97714714 +155058804 +6261263733545503259 +5963940 +63308215 +1130753852548581517 +5570988833963444820 +18157949162008873831 +8021450371307931626 +2861086850442987769 +1873618489039455401 +18413674034484740195 +1873618458945324208 +32714349826081871 +18424247431471827427 +1842511416005692464 +6589396841525218018 +5782296448490276391 +13237708561380147208 +27356055371580547 +5462796868326918190 +1860700038481053299 +5458848587100981064 +3580814869236944221 +5566476545725106758 +28202091681875145 +5915592292141435844 +11434534198373320614 +15740733274947783803 +10161648502327149991 +15287141235608259625 +12779163922391107832 +68944331 +814416800 +1823671323 +23068994 +210659050964 +46006654 +516227820 +11600084 +103350839 +361129707266 +13750803869111880047 +103179363763095696 +1873618501936022824 +2933734509745341832 +7230168968130792223 +14517406836956661503 +17619012718254098754 +12406930521299355297 +4408861808311732424 +2949238 +9870724570613070 +60293520 +503001580666 +14947075702982868 +1998521381 +2625321593697799226 +14418163 +163512984 +71762418 +5722409915131627177 +11599686562536949325 +1873618493337242815 16951650337051970851 -5144036663261072615 -813826970 -12133908888583014197 -68354499 -11010253 -279448324634 -14749580058850363919 -6633286351216577743 -2089265852158774334 -8929038315166239946 -31586271318836879 -13678484518713821516 -105906772 -96010773 -2625321606595152102 -153354852 -10831360821402142464 -5652457623480305518 -8503320935775669540 -16483453074211931840 -363084051790629688 -544867112 -258146996 -5944020284604679310 -5782296431293302176 -28484176870181368 -23407778443758207 -3973491023432910866 -5778348175860436286 -1873618514834032208 -5438906422044199526 -103875135 -7697026996393675938 -1709507593 -161219206 -13237708548482859013 -3701601059573925529 -879419277503368073 -3822179681402096264 +2676033356037948725 +18412545955976642616 5565348445721659362 -532291916112267238 -256115374 -1460339693 -13351948495571782591 -14665351642484132 -3008657884776564221 -2341393787733871788 -16904712944497920326 -3967850626592737364 -16843031 -4131548702199581670 -6946995 -809763710 -1928986057181235415 -11964228788262537512 -2989761681675848960 -1873618519132801026 -7276444624641068235 -5994450030542718433 -12284124821458521275 -111739480 -4076606646528706921 -13650504529854072320 -15804734059994287439 -14425661019905001872 -2395604016 -14465116522071263669 -210659116497 -15290243360149343057 -15777957523720635747 -10167863869407233224 -18331517588211470 -12884708026702235763 -14811384 -72155640 -7042731044489660311 -15288269305517836796 -5675796551176948530 -14264208198271043974 -1495860210 -5787083718919720300 -25099894056749168 -683965395648908415 -62259623 -4915348 -12974919760129952993 -6155045917120857525 -1873618523431569790 -9013091190501541709 -4392112055939237960 -2625321597997353452 -15897908900500866947 -6177363174264606048 -15872788267758849077 
-491324104 -33560399034844286 -22675774 -17542946455516547053 -2431124533 -538772246 -27920040887322186 -8704274751914773568 -12085352355710699032 -6153353775713551670 -70123993 -27356081166223293 -7885152524183078888 -60227983 -2883701 -11700344903086704893 -7329667560521271617 -518980348 -5833854255738521265 -8618954206935976415 -3901910077209972079 -1713308683 -1992881785903908578 -4530582984922301900 -16130159995999161574 -155124341 -2625321602296121720 -1884114794138700522 -5778348218852443426 -97780251 -4240022615453076686 -6097847786116483627 -6361518319333476776 -30540122 -28484146776247610 -546636604 -5741055947585816645 -6100103891543657570 -8807886331112851129 -813564822 -10223260478367337870 -746324852 -15287423226215073909 -11226550812567014265 -1491796976 -8097653480026868144 -5995296157134227520 -1873618532029106835 -1539245050 -48300418 -331037869860 -95748625 -6314795724398267312 -5888081980883929307 -544604964 -34124418943289166 -5245848947242502849 -32432363517642192 -2676033356038407648 -811533196 -1317733333 -8920676095134336910 -17149817495305717193 -918014392040164136 -103612987 -8695136395555507435 -18349504802666319185 -14847634415788362123 -1584661506 -4287350266942457603 -525512494730316455 -5881302580997523790 -1574765567 -3784125305237867347 -819397570 -8326286517935867839 -16149105318148965958 -16580883 -6684847 -18411699902469439513 -11229983338076703492 -15292499491369977714 -339635406848 -9870724570940976 +5767329 +5250413516934022944 +97518103 +63111604 +579208034 +544801575 +17236251 +258081459 +17953567922355439196 +30458188512103543 +15287987228927658628 +4930631980557601532 +20305658202031811 +2120987217453057458 +6209987959894902621 +7151957518376504179 +12552846396214610071 +1793158821936040552 +5461104787107351969 +559088458 +14386655907412249373 +547619651 +2141783083 +12606726616442537392 +1923875870 +811402123 +570557265 +42991988 100 101 102 diff --git a/python/cudf/cudf/tests/groupby/test_ordering_pandas_compat.py b/python/cudf/cudf/tests/groupby/test_ordering_pandas_compat.py new file mode 100644 index 00000000000..a8c5ae3b6a3 --- /dev/null +++ b/python/cudf/cudf/tests/groupby/test_ordering_pandas_compat.py @@ -0,0 +1,30 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +import numpy as np +import pytest + +import cudf +from cudf.testing import assert_eq + + +@pytest.fixture(params=[False, True], ids=["without_nulls", "with_nulls"]) +def with_nulls(request): + return request.param + + +@pytest.mark.parametrize("nrows", [30, 300, 300_000]) +@pytest.mark.parametrize("nkeys", [1, 2, 4]) +def test_groupby_maintain_order_random(nrows, nkeys, with_nulls): + rng = np.random.default_rng(seed=0) + key_names = [f"key{key}" for key in range(nkeys)] + key_values = [rng.integers(100, size=nrows) for _ in key_names] + value = rng.integers(-100, 100, size=nrows) + df = cudf.DataFrame(dict(zip(key_names, key_values), value=value)) + if with_nulls: + for key in key_names: + df.loc[df[key] == 1, key] = None + with cudf.option_context("mode.pandas_compatible", True): + got = df.groupby(key_names, sort=False).agg({"value": "sum"}) + expect = ( + df.to_pandas().groupby(key_names, sort=False).agg({"value": "sum"}) + ) + assert_eq(expect, got, check_index_type=not with_nulls) diff --git a/python/cudf/cudf/tests/pytest.ini b/python/cudf/cudf/tests/pytest.ini deleted file mode 100644 index d05ba9aaacc..00000000000 --- a/python/cudf/cudf/tests/pytest.ini +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. 
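The new test above pins down groupby ordering under cudf's pandas-compatibility mode: with sort=False, result rows must come back in first-observed key order, exactly as pandas returns them. A minimal sketch of the behavior being asserted (illustrative data, not taken from the test; requires a CUDA-capable machine with cudf installed):

    import cudf
    import pandas as pd

    pdf = pd.DataFrame({"key": [2, 0, 2, 1], "value": [1, 2, 3, 4]})
    gdf = cudf.from_pandas(pdf)

    with cudf.option_context("mode.pandas_compatible", True):
        got = gdf.groupby("key", sort=False).agg({"value": "sum"})

    expect = pdf.groupby("key", sort=False).agg({"value": "sum"})
    # Both results list keys in first-appearance order: 2, 0, 1.
    assert list(got.index.to_pandas()) == list(expect.index)

Note that the test wraps only the cudf groupby call in option_context: outside pandas-compatible mode, cudf makes no ordering promise for sort=False.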
diff --git a/python/cudf/cudf/tests/pytest.ini b/python/cudf/cudf/tests/pytest.ini
deleted file mode 100644
index d05ba9aaacc..00000000000
--- a/python/cudf/cudf/tests/pytest.ini
+++ /dev/null
@@ -1,19 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION.
-
-[pytest]
-markers =
-    spilling: mark benchmark a good candidate to run with `CUDF_SPILL=ON`
-xfail_strict = true
-filterwarnings =
-    error
-    ignore:::.*xdist.*
-    ignore:::.*pytest.*
-    # some third-party dependencies (e.g. 'boto3') still using datetime.datetime.utcnow()
-    ignore:.*datetime.*utcnow.*scheduled for removal.*:DeprecationWarning
-    # Deprecation warning from Pyarrow Table.to_pandas() with pandas-2.2+
-    ignore:Passing a BlockManager to DataFrame is deprecated:DeprecationWarning
-    # PerformanceWarning from cupy warming up the JIT cache
-    ignore:Jitify is performing a one-time only warm-up to populate the persistent cache:cupy._util.PerformanceWarning
-    # Ignore numba PEP 456 warning specific to arm machines
-    ignore:FNV hashing is not implemented in Numba.*:UserWarning
-addopts = --tb=native
diff --git a/python/cudf/cudf/tests/series/test_datetimelike.py b/python/cudf/cudf/tests/series/test_datetimelike.py
index cea86a5499e..691da224f44 100644
--- a/python/cudf/cudf/tests/series/test_datetimelike.py
+++ b/python/cudf/cudf/tests/series/test_datetimelike.py
@@ -266,3 +266,25 @@ def test_pandas_compatible_non_zoneinfo_raises(klass):
     with cudf.option_context("mode.pandas_compatible", True):
         with pytest.raises(NotImplementedError):
             cudf.from_pandas(pandas_obj)
+
+
+def test_astype_naive_to_aware_raises():
+    ser = cudf.Series([datetime.datetime(2020, 1, 1)])
+    with pytest.raises(TypeError):
+        ser.astype("datetime64[ns, UTC]")
+    with pytest.raises(TypeError):
+        ser.to_pandas().astype("datetime64[ns, UTC]")
+
+
+@pytest.mark.parametrize("unit", ["ns", "us"])
+def test_astype_aware_to_aware(unit):
+    ser = cudf.Series(
+        [datetime.datetime(2020, 1, 1, tzinfo=datetime.timezone.utc)]
+    )
+    result = ser.astype(f"datetime64[{unit}, US/Pacific]")
+    expected = ser.to_pandas().astype(f"datetime64[{unit}, US/Pacific]")
+    zoneinfo_type = pd.DatetimeTZDtype(
+        expected.dtype.unit, zoneinfo.ZoneInfo(str(expected.dtype.tz))
+    )
+    expected = ser.astype(zoneinfo_type)
+    assert_eq(result, expected)
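The two datetime tests added above encode a pandas rule that cudf now mirrors: astype may not silently localize a naive timestamp to a timezone (that is tz_localize's job), while converting between two timezone-aware dtypes, including across resolutions, is allowed. The pandas half of that contract can be sketched without a GPU (pandas 2.x, illustrative values):

    import datetime
    import pandas as pd

    naive = pd.Series([datetime.datetime(2020, 1, 1)])
    try:
        naive.astype("datetime64[ns, UTC]")
    except TypeError:
        pass  # naive -> aware via astype is rejected; use dt.tz_localize

    aware = naive.dt.tz_localize("UTC")
    converted = aware.astype("datetime64[us, US/Pacific]")
    print(converted.dtype)  # datetime64[us, US/Pacific]

The zoneinfo_type indirection in test_astype_aware_to_aware is presumably there because pandas may resolve the string "US/Pacific" to a pytz timezone while cudf works with zoneinfo; rebuilding the dtype with zoneinfo.ZoneInfo keeps the comparison apples-to-apples.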
diff --git a/python/cudf/cudf/tests/test_array_function.py b/python/cudf/cudf/tests/test_array_function.py
index 979c936a182..af9a6c7e696 100644
--- a/python/cudf/cudf/tests/test_array_function.py
+++ b/python/cudf/cudf/tests/test_array_function.py
@@ -33,7 +33,7 @@ def __array_function__(self, *args, **kwargs):
 missing_arrfunc_reason = "NEP-18 support is not available in NumPy"
 
-np.random.seed(0)
+rng = np.random.default_rng(seed=0)
 
 @pytest.mark.skipif(missing_arrfunc_cond, reason=missing_arrfunc_reason)
@@ -49,7 +49,7 @@ def __array_function__(self, *args, **kwargs):
     ],
 )
 def test_array_func_cudf_series(func):
-    np_ar = np.random.random(100)
+    np_ar = rng.random(100)
     cudf_ser = cudf.Series(np_ar)
     expect = func(np_ar)
     got = func(cudf_ser)
@@ -74,7 +74,7 @@ def test_array_func_cudf_series(func):
     ],
 )
 def test_array_func_cudf_dataframe(func):
-    pd_df = pd.DataFrame(np.random.uniform(size=(100, 10)))
+    pd_df = pd.DataFrame(rng.uniform(size=(100, 10)))
     cudf_df = cudf.from_pandas(pd_df)
     expect = func(pd_df)
     got = func(cudf_df)
@@ -91,7 +91,7 @@ def test_array_func_cudf_dataframe(func):
     ],
 )
 def test_array_func_missing_cudf_dataframe(func):
-    pd_df = pd.DataFrame(np.random.uniform(size=(100, 10)))
+    pd_df = pd.DataFrame(rng.uniform(size=(100, 10)))
     cudf_df = cudf.from_pandas(pd_df)
     with pytest.raises(TypeError):
         func(cudf_df)
@@ -105,7 +105,7 @@ def test_array_func_missing_cudf_dataframe(func):
     ],
 )
 def test_array_func_cudf_index(func):
-    np_ar = np.random.random(100)
+    np_ar = rng.random(100)
     cudf_index = cudf.Index(cudf.Series(np_ar))
     expect = func(np_ar)
     got = func(cudf_index)
@@ -125,7 +125,7 @@ def test_array_func_cudf_index(func):
     ],
 )
 def test_array_func_missing_cudf_index(func):
-    np_ar = np.random.random(100)
+    np_ar = rng.random(100)
     cudf_index = cudf.Index(cudf.Series(np_ar))
     with pytest.raises(TypeError):
         func(cudf_index)
diff --git a/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py b/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py
index 5acdf36de80..17ef033ea9e 100644
--- a/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py
+++ b/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py
@@ -600,12 +600,12 @@ def test_avro_reader_multiblock(
     else:
         assert dtype in ("float32", "float64")
         avro_type = "float" if dtype == "float32" else "double"
-    np.random.seed(0)
+    rng = np.random.default_rng(seed=0)
     # We don't use rand_dataframe() here, because it increases the
     # execution time of each test by a factor of 10 or more (it appears
     # to use a very costly approach to generating random data).
     # See also: https://github.com/rapidsai/cudf/issues/13128
-    values = np.random.rand(total_rows).astype(dtype)
+    values = rng.random(total_rows).astype(dtype)
     bytes_per_row = values.dtype.itemsize
 
     # The sync_interval is the number of bytes between sync blocks.  We know
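A pattern worth calling out, since it repeats through the rest of this diff: the legacy global-state API (np.random.seed plus np.random.* calls, and stdlib random.choice) is replaced by an explicitly seeded numpy.random.Generator. The method names change slightly in the process; a quick reference of the substitutions used above and below (all standard Generator APIs):

    import numpy as np

    rng = np.random.default_rng(seed=0)   # replaces np.random.seed(0)

    rng.random(100)                # np.random.random(100) / np.random.rand(100)
    rng.integers(0, 100, 100)      # np.random.randint(0, 100, 100)
    rng.uniform(size=(100, 10))    # np.random.uniform(size=(100, 10))
    rng.choice([True, False], 10)  # np.random.choice / random.choice
    rng.standard_normal(100)       # np.random.randn(100)
    rng.shuffle(np.arange(10))     # np.random.shuffle (in place)

Note that a Generator does not reproduce the legacy bit stream: for the same seed, rng.integers and np.random.randint yield different values, so tests that hard-code expected outputs need their baselines regenerated when migrating.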
diff --git a/python/cudf/cudf/tests/test_binops.py b/python/cudf/cudf/tests/test_binops.py
index 2e8519509e2..71b6bbd688d 100644
--- a/python/cudf/cudf/tests/test_binops.py
+++ b/python/cudf/cudf/tests/test_binops.py
@@ -2,7 +2,6 @@
 import decimal
 import operator
-import random
 import warnings
 from itertools import combinations_with_replacement, product
@@ -179,7 +178,13 @@
 @pytest.mark.parametrize("obj_class", ["Series", "Index"])
 @pytest.mark.parametrize("binop", _binops)
-def test_series_binop(binop, obj_class):
+def test_series_binop(request, binop, obj_class):
+    request.applymarker(
+        pytest.mark.xfail(
+            binop is operator.floordiv,
+            reason="https://github.com/rapidsai/cudf/issues/17073",
+        )
+    )
     nelem = 1000
     arr1 = utils.gen_rand("float64", nelem) * 10000
     # Keeping a low value because CUDA 'pow' has 2 full range error
@@ -187,13 +192,15 @@
     sr1 = Series(arr1)
     sr2 = Series(arr2)
+    psr1 = sr1.to_pandas()
+    psr2 = sr2.to_pandas()
 
     if obj_class == "Index":
         sr1 = Index(sr1)
         sr2 = Index(sr2)
 
+    expect = binop(psr1, psr2)
     result = binop(sr1, sr2)
-    expect = binop(pd.Series(arr1), pd.Series(arr2))
 
     if obj_class == "Index":
         result = Series(result)
@@ -204,7 +211,8 @@
 @pytest.mark.parametrize("binop", _binops)
 def test_series_binop_concurrent(binop):
     def func(index):
-        arr = np.random.random(100) * 10
+        rng = np.random.default_rng(seed=0)
+        arr = rng.random(100) * 10
         sr = Series(arr)
 
         result = binop(sr.astype("int32"), sr)
@@ -223,8 +231,9 @@ def func(index):
 @pytest.mark.parametrize("obj_class", ["Series", "Index"])
 @pytest.mark.parametrize("nelem,binop", list(product([1, 2, 100], _binops)))
 def test_series_binop_scalar(nelem, binop, obj_class, use_cudf_scalar):
-    arr = np.random.random(nelem)
-    rhs = random.choice(arr).item()
+    rng = np.random.default_rng(seed=0)
+    arr = rng.random(nelem)
+    rhs = rng.choice(arr).item()
     sr = Series(arr)
     if obj_class == "Index":
@@ -247,10 +256,11 @@ def test_series_binop_scalar(nelem, binop, obj_class, use_cudf_scalar):
     "lhs_dtype,rhs_dtype", list(product(_int_types, _int_types))
 )
 def test_series_bitwise_binop(binop, obj_class, lhs_dtype, rhs_dtype):
-    arr1 = (np.random.random(100) * 100).astype(lhs_dtype)
+    rng = np.random.default_rng(seed=0)
+    arr1 = (rng.random(100) * 100).astype(lhs_dtype)
     sr1 = Series(arr1)
 
-    arr2 = (np.random.random(100) * 100).astype(rhs_dtype)
+    arr2 = (rng.random(100) * 100).astype(rhs_dtype)
     sr2 = Series(arr2)
 
     if obj_class == "Index":
@@ -271,8 +281,9 @@ def test_series_bitwise_binop(binop, obj_class, lhs_dtype, rhs_dtype):
     "dtype", ["int8", "int32", "int64", "float32", "float64", "datetime64[ms]"]
 )
 def test_series_compare(cmpop, obj_class, dtype):
-    arr1 = np.random.randint(0, 100, 100).astype(dtype)
-    arr2 = np.random.randint(0, 100, 100).astype(dtype)
+    rng = np.random.default_rng(seed=0)
+    arr1 = rng.integers(0, 100, 100).astype(dtype)
+    arr2 = rng.integers(0, 100, 100).astype(dtype)
 
     sr1 = Series(arr1)
     sr2 = Series(arr2)
@@ -438,9 +449,10 @@ def test_str_series_compare_num_reflected(
 def test_series_compare_scalar(
     nelem, cmpop, obj_class, dtype, use_cudf_scalar
 ):
-    arr1 = np.random.randint(0, 100, 100).astype(dtype)
+    rng = np.random.default_rng(seed=0)
+    arr1 = rng.integers(0, 100, 100).astype(dtype)
     sr1 = Series(arr1)
-    rhs = random.choice(arr1).item()
+    rhs = rng.choice(arr1).item()
 
     if use_cudf_scalar:
         rhs = cudf.Scalar(rhs)
@@ -465,9 +477,9 @@ def test_series_compare_scalar(
 @pytest.mark.parametrize("nelem", [1, 7, 8, 9, 32, 64, 128])
 @pytest.mark.parametrize("lhs_nulls,rhs_nulls", list(product(_nulls, _nulls)))
 def test_validity_add(nelem, lhs_nulls, rhs_nulls):
-    np.random.seed(0)
+    rng = np.random.default_rng(seed=0)
     # LHS
-    lhs_data = np.random.random(nelem)
+    lhs_data = rng.random(nelem)
     if lhs_nulls == "some":
         lhs_mask = utils.random_bitmask(nelem)
         lhs_bitmask = utils.expand_bits_to_bytes(lhs_mask)[:nelem]
@@ -478,7 +490,7 @@ def test_validity_add(nelem, lhs_nulls, rhs_nulls):
     else:
         lhs = Series(lhs_data)
     # RHS
-    rhs_data = np.random.random(nelem)
+    rhs_data = rng.random(nelem)
     if rhs_nulls == "some":
         rhs_mask = utils.random_bitmask(nelem)
         rhs_bitmask = utils.expand_bits_to_bytes(rhs_mask)[:nelem]
@@ -525,8 +537,9 @@ def test_validity_add(nelem, lhs_nulls, rhs_nulls):
 )
 def test_series_binop_mixed_dtype(binop, lhs_dtype, rhs_dtype, obj_class):
     nelem = 10
-    lhs = (np.random.random(nelem) * nelem).astype(lhs_dtype)
-    rhs = (np.random.random(nelem) * nelem).astype(rhs_dtype)
+    rng = np.random.default_rng(seed=0)
+    lhs = (rng.random(nelem) * nelem).astype(lhs_dtype)
+    rhs = (rng.random(nelem) * nelem).astype(rhs_dtype)
 
     sr1 = Series(lhs)
     sr2 = Series(rhs)
@@ -550,8 +563,9 @@ def test_series_binop_mixed_dtype(binop, lhs_dtype, rhs_dtype, obj_class):
 )
 def test_series_cmpop_mixed_dtype(cmpop, lhs_dtype, rhs_dtype, obj_class):
     nelem = 5
-    lhs = (np.random.random(nelem) * nelem).astype(lhs_dtype)
-    rhs = (np.random.random(nelem) * nelem).astype(rhs_dtype)
+    rng = np.random.default_rng(seed=0)
+    lhs = (rng.random(nelem) * nelem).astype(lhs_dtype)
+    rhs = (rng.random(nelem) * nelem).astype(rhs_dtype)
 
     sr1 = Series(lhs)
     sr2 = Series(rhs)
@@ -574,8 +588,7 @@ def test_series_cmpop_mixed_dtype(cmpop, lhs_dtype, rhs_dtype, obj_class):
 )
 def test_series_reflected_ops_scalar(func, dtype, obj_class):
     # create random series
-    np.random.seed(12)
-    random_series = utils.gen_rand(dtype, 100, low=10)
+    random_series = utils.gen_rand(dtype, 100, low=10, seed=12)
 
     # gpu series
     gs = Series(random_series)
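The request.applymarker change at the top of this test_binops.py diff is the standard pytest idiom for xfailing a single parametrization at run time instead of skipping or failing the whole test. A self-contained sketch of the pattern (hypothetical test, not from the diff):

    import operator

    import pytest

    @pytest.mark.parametrize("binop", [operator.add, operator.floordiv])
    def test_example(request, binop):
        # Mark only this parametrization as an expected failure when the
        # condition is true; the other parameters still run normally.
        request.applymarker(
            pytest.mark.xfail(
                binop is operator.floordiv,
                reason="illustrative known-failure case",
            )
        )
        assert binop(5, 2) == 7  # passes for add, xfails for floordiv

Because the deleted pytest.ini set xfail_strict = true (that configuration presumably moves to another config file rather than disappearing), an xfail that unexpectedly passes is itself reported as a failure, so the condition has to be kept tight. The test_binops.py diff continues below.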
@@ -631,8 +644,7 @@ def test_series_reflected_ops_cudf_scalar(funcs, dtype, obj_class):
     cpu_func, gpu_func = funcs
 
     # create random series
-    np.random.seed(12)
-    random_series = utils.gen_rand(dtype, 100, low=10)
+    random_series = utils.gen_rand(dtype, 100, low=10, seed=12)
 
     # gpu series
     gs = Series(random_series)
@@ -774,7 +786,8 @@ def test_df_different_index_shape(df2, binop):
 @pytest.mark.parametrize("op", [operator.eq, operator.ne])
 def test_boolean_scalar_binop(op):
-    psr = pd.Series(np.random.choice([True, False], 10))
+    rng = np.random.default_rng(seed=0)
+    psr = pd.Series(rng.choice([True, False], 10))
     gsr = cudf.from_pandas(psr)
     assert_eq(op(psr, True), op(gsr, True))
     assert_eq(op(psr, False), op(gsr, False))
@@ -923,16 +936,17 @@ def test_operator_func_dataframe(func, nulls, fill_value, other):
     num_cols = 3
 
     def gen_df():
+        rng = np.random.default_rng(seed=0)
         pdf = pd.DataFrame()
         from string import ascii_lowercase
 
-        cols = np.random.choice(num_cols + 5, num_cols, replace=False)
+        cols = rng.choice(num_cols + 5, num_cols, replace=False)
 
         for i in range(num_cols):
             colname = ascii_lowercase[cols[i]]
             data = utils.gen_rand("float64", num_rows) * 10000
             if nulls == "some":
-                idx = np.random.choice(
+                idx = rng.choice(
                     num_rows, size=int(num_rows / 2), replace=False
                 )
                 data[idx] = np.nan
@@ -954,21 +968,21 @@ def gen_df():
 @pytest.mark.parametrize("nulls", _nulls)
 @pytest.mark.parametrize("other", ["df", "scalar"])
 def test_logical_operator_func_dataframe(func, nulls, other):
-    np.random.seed(0)
     num_rows = 100
     num_cols = 3
 
     def gen_df():
+        rng = np.random.default_rng(seed=0)
         pdf = pd.DataFrame()
         from string import ascii_lowercase
 
-        cols = np.random.choice(num_cols + 5, num_cols, replace=False)
+        cols = rng.choice(num_cols + 5, num_cols, replace=False)
 
         for i in range(num_cols):
             colname = ascii_lowercase[cols[i]]
             data = utils.gen_rand("float64", num_rows) * 10000
             if nulls == "some":
-                idx = np.random.choice(
+                idx = rng.choice(
                     num_rows, size=int(num_rows / 2), replace=False
                 )
                 data[idx] = np.nan
@@ -977,8 +991,12 @@ def gen_df():
     pdf1 = gen_df()
     pdf2 = gen_df() if other == "df" else 59.0
-    gdf1 = cudf.DataFrame.from_pandas(pdf1)
-    gdf2 = cudf.DataFrame.from_pandas(pdf2) if other == "df" else 59.0
+    gdf1 = cudf.DataFrame.from_pandas(pdf1, nan_as_null=False)
+    gdf2 = (
+        cudf.DataFrame.from_pandas(pdf2, nan_as_null=False)
+        if other == "df"
+        else 59.0
+    )
 
     got = getattr(gdf1, func)(gdf2)
     expect = getattr(pdf1, func)(pdf2)[list(got._data)]
@@ -3413,3 +3431,16 @@ def test_binop_eq_ne_index_series(data1, data2):
     expected = gi.to_pandas() != gs.to_pandas()
 
     assert_eq(expected, actual)
+
+
+@pytest.mark.parametrize("scalar", [np.datetime64, np.timedelta64])
+def test_binop_lhs_numpy_datetimelike_scalar(scalar):
+    slr1 = scalar(1, "ms")
+    slr2 = scalar(1, "ns")
+    result = slr1 < cudf.Series([slr2])
+    expected = slr1 < pd.Series([slr2])
+    assert_eq(result, expected)
+
+    result = slr2 < cudf.Series([slr1])
+    expected = slr2 < pd.Series([slr1])
+    assert_eq(result, expected)
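test_binop_lhs_numpy_datetimelike_scalar, added at the end of the test_binops.py diff, guards the case where a NumPy datetime64/timedelta64 scalar sits on the left of a comparison: the two operands carry different units (ms vs ns), and cudf must align them the way pandas does rather than comparing raw integer ticks. Concretely (the cudf half requires a GPU install):

    import numpy as np
    import pandas as pd
    import cudf

    slr_ms = np.datetime64(1, "ms")  # 1 millisecond after the epoch
    slr_ns = np.datetime64(1, "ns")  # 1 nanosecond after the epoch

    # After unit alignment, 1ms (1_000_000ns) is NOT less than 1ns.
    print(slr_ms < pd.Series([slr_ns]))    # [False]
    print(slr_ms < cudf.Series([slr_ns]))  # must match pandas

Comparing the raw integers would go wrong for pairs like np.datetime64(1, "ms") vs np.datetime64(5, "ns"), where 1 < 5 holds but 1 ms is far later than 5 ns; that inversion is exactly the bug class this test fences off.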
diff --git a/python/cudf/cudf/tests/test_categorical.py b/python/cudf/cudf/tests/test_categorical.py
index cd1ad21ae59..db41f689255 100644
--- a/python/cudf/cudf/tests/test_categorical.py
+++ b/python/cudf/cudf/tests/test_categorical.py
@@ -252,10 +252,10 @@ def test_cat_series_binop_error():
 @pytest.mark.parametrize("num_elements", [10, 100, 1000])
 def test_categorical_unique(num_elements):
     # create categorical series
-    np.random.seed(12)
+    rng = np.random.default_rng(seed=12)
     pd_cat = pd.Categorical(
         pd.Series(
-            np.random.choice(
+            rng.choice(
                 list(string.ascii_letters + string.digits), num_elements
             ),
             dtype="category",
@@ -279,12 +279,10 @@
 @pytest.mark.parametrize("nelem", [20, 50, 100])
 def test_categorical_unique_count(nelem):
     # create categorical series
-    np.random.seed(12)
+    rng = np.random.default_rng(seed=0)
     pd_cat = pd.Categorical(
         pd.Series(
-            np.random.choice(
-                list(string.ascii_letters + string.digits), nelem
-            ),
+            rng.choice(list(string.ascii_letters + string.digits), nelem),
             dtype="category",
         )
     )
diff --git a/python/cudf/cudf/tests/test_column.py b/python/cudf/cudf/tests/test_column.py
index 4aa7fb27c9b..65947efc2df 100644
--- a/python/cudf/cudf/tests/test_column.py
+++ b/python/cudf/cudf/tests/test_column.py
@@ -31,12 +31,13 @@
 @pytest.fixture(params=dtypes, ids=dtypes)
 def pandas_input(request):
     dtype = request.param
-    rng = np.random.default_rng()
+    rng = np.random.default_rng(seed=0)
     size = 100
 
     def random_ints(dtype, size):
         dtype_min = np.iinfo(dtype).min
         dtype_max = np.iinfo(dtype).max
+        rng = np.random.default_rng(seed=0)
         return rng.integers(dtype_min, dtype_max, size=size, dtype=dtype)
 
     try:
@@ -154,7 +155,9 @@ def test_column_slicing(pandas_input, offset, size):
     [cudf.Decimal128Dtype, cudf.Decimal64Dtype, cudf.Decimal32Dtype],
 )
 def test_decimal_column_slicing(offset, size, precision, scale, decimal_type):
-    col = cudf.core.column.as_column(pd.Series(np.random.rand(1000)))
+    col = cudf.core.column.as_column(
+        pd.Series(np.random.default_rng(seed=0).random(1000))
+    )
     col = col.astype(decimal_type(precision, scale))
     column_slicing_test(col, offset, size, True)
diff --git a/python/cudf/cudf/tests/test_concat.py b/python/cudf/cudf/tests/test_concat.py
index 8da589ba45b..ab0f1767cd6 100644
--- a/python/cudf/cudf/tests/test_concat.py
+++ b/python/cudf/cudf/tests/test_concat.py
@@ -30,6 +30,7 @@ def _hide_concat_empty_dtype_warning():
 
 def make_frames(index=None, nulls="none"):
+    rng = np.random.default_rng(seed=0)
     df = pd.DataFrame(
         {
             "x": range(10),
@@ -51,7 +52,7 @@ def make_frames(index=None, nulls="none"):
         df2.y = np.full_like(df2.y, np.nan)
     if nulls == "some":
         mask = np.arange(10)
-        np.random.shuffle(mask)
+        rng.shuffle(mask)
         mask = mask[:5]
         df.loc[mask, "y"] = np.nan
         df2.loc[mask, "y"] = np.nan
@@ -203,10 +204,9 @@ def test_concat_misordered_columns():
 @pytest.mark.parametrize("axis", [1, "columns"])
 def test_concat_columns(axis):
-    pdf1 = pd.DataFrame(np.random.randint(10, size=(5, 3)), columns=[1, 2, 3])
-    pdf2 = pd.DataFrame(
-        np.random.randint(10, size=(5, 4)), columns=[4, 5, 6, 7]
-    )
+    rng = np.random.default_rng(seed=0)
+    pdf1 = pd.DataFrame(rng.integers(10, size=(5, 3)), columns=[1, 2, 3])
+    pdf2 = pd.DataFrame(rng.integers(10, size=(5, 4)), columns=[4, 5, 6, 7])
     gdf1 = cudf.from_pandas(pdf1)
     gdf2 = cudf.from_pandas(pdf2)
@@ -1398,11 +1398,12 @@ def test_concat_single_object(ignore_index, typ):
     ],
 )
 def test_concat_decimal_dataframe(ltype, rtype):
+    rng = np.random.default_rng(seed=0)
     gdf1 = cudf.DataFrame(
-        {"id": np.random.randint(0, 10, 3), "val": ["22.3", "59.5", "81.1"]}
+        {"id": rng.integers(0, 10, 3), "val": ["22.3", "59.5", "81.1"]}
     )
     gdf2 = cudf.DataFrame(
-        {"id": np.random.randint(0, 10, 3), "val": ["2.35", "5.59", "8.14"]}
+        {"id": rng.integers(0, 10, 3), "val": ["2.35", "5.59", "8.14"]}
     )
 
     gdf1["val"] = gdf1["val"].astype(ltype)
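Several helpers in these files (make_frames in test_concat.py, gen_df in test_binops.py) now construct their own seeded Generator inside the function body. The point of per-call seeding is that every invocation replays an identical stream, so data built by separate calls is reproducible without any global state. A tiny demonstration of that property (hypothetical helper, not from the diff):

    import numpy as np

    def gen_values(n=5):
        rng = np.random.default_rng(seed=0)  # fresh, per-call stream
        return rng.integers(0, 10, n)

    assert (gen_values() == gen_values()).all()  # reproducible across calls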
diff --git a/python/cudf/cudf/tests/test_copying.py b/python/cudf/cudf/tests/test_copying.py
index 9b6f82ec705..f33cfe268a3 100644
--- a/python/cudf/cudf/tests/test_copying.py
+++ b/python/cudf/cudf/tests/test_copying.py
@@ -16,8 +16,9 @@
 @pytest.mark.parametrize("dtype", NUMERIC_TYPES + OTHER_TYPES)
 def test_repeat(dtype):
-    arr = np.random.rand(10) * 10
-    repeats = np.random.randint(10, size=10)
+    rng = np.random.default_rng(seed=0)
+    arr = rng.random(10) * 10
+    repeats = rng.integers(10, size=10)
     psr = pd.Series(arr).astype(dtype)
     gsr = cudf.from_pandas(psr)
@@ -25,18 +26,20 @@
 def test_repeat_index():
+    rng = np.random.default_rng(seed=0)
     arrays = [[1, 1, 2, 2], ["red", "blue", "red", "blue"]]
     psr = pd.MultiIndex.from_arrays(arrays, names=("number", "color"))
     gsr = cudf.from_pandas(psr)
-    repeats = np.random.randint(10, size=4)
+    repeats = rng.integers(10, size=4)
 
     assert_eq(psr.repeat(repeats), gsr.repeat(repeats))
 
 def test_repeat_dataframe():
+    rng = np.random.default_rng(seed=0)
     psr = pd.DataFrame({"a": [1, 1, 2, 2]})
     gsr = cudf.from_pandas(psr)
-    repeats = np.random.randint(10, size=4)
+    repeats = rng.integers(10, size=4)
 
     # pd.DataFrame doesn't have repeat() so as a workaround, we are
     # comparing pd.Series.repeat() with cudf.DataFrame.repeat()['a']
@@ -45,7 +48,8 @@
 @pytest.mark.parametrize("dtype", NUMERIC_TYPES)
 def test_repeat_scalar(dtype):
-    arr = np.random.rand(10) * 10
+    rng = np.random.default_rng(seed=0)
+    arr = rng.random(10) * 10
     repeats = 10
     psr = pd.Series(arr).astype(dtype)
     gsr = cudf.from_pandas(psr)
diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py
index b6efc8ebd88..ac772c47e3a 100644
--- a/python/cudf/cudf/tests/test_csv.py
+++ b/python/cudf/cudf/tests/test_csv.py
@@ -1764,13 +1764,13 @@ def test_csv_writer_multiindex(tmpdir):
     pdf_df_fname = tmpdir.join("pdf_df_3.csv")
     gdf_df_fname = tmpdir.join("gdf_df_3.csv")
 
-    np.random.seed(0)
+    rng = np.random.default_rng(seed=0)
     gdf = cudf.DataFrame(
         {
-            "a": np.random.randint(0, 5, 20),
-            "b": np.random.randint(0, 5, 20),
+            "a": rng.integers(0, 5, 20),
+            "b": rng.integers(0, 5, 20),
             "c": range(20),
-            "d": np.random.random(20),
+            "d": rng.random(20),
         }
     )
     gdg = gdf.groupby(["a", "b"]).mean()
@@ -2277,3 +2277,20 @@ def test_read_header_none_pandas_compat_column_type():
     result = cudf.read_csv(StringIO(data), header=None).columns
     expected = pd.read_csv(StringIO(data), header=None).columns
     pd.testing.assert_index_equal(result, expected, exact=True)
+
+
+@pytest.mark.parametrize("buffer", ["1", '"one"'])
+def test_read_single_unterminated_row(buffer):
+    gdf = cudf.read_csv(StringIO(buffer), header=None)
+    assert_eq(gdf.shape, (1, 1))
+
+
+@pytest.mark.parametrize("buffer", ["\n", "\r\n"])
+def test_read_empty_only_row(buffer):
+    gdf = cudf.read_csv(StringIO(buffer), header=None)
+    assert_eq(gdf.shape, (0, 0))
+
+
+def test_read_empty_only_row_custom_terminator():
+    gdf = cudf.read_csv(StringIO("*"), header=None, lineterminator="*")
+    assert_eq(gdf.shape, (0, 0))
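The three new CSV tests above fix the reader's contract for degenerate inputs: a single unterminated field still parses as one row and one column, while an input that is nothing but a line terminator (default or custom) parses as an empty 0x0 frame. Restated outside pytest (requires a GPU cudf install):

    from io import StringIO

    import cudf

    print(cudf.read_csv(StringIO("1"), header=None).shape)   # (1, 1)
    print(cudf.read_csv(StringIO("\n"), header=None).shape)  # (0, 0)
    print(
        cudf.read_csv(StringIO("*"), header=None, lineterminator="*").shape
    )  # (0, 0)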
in empty dataframe. mat = df.head(0).to_cupy() @@ -1250,8 +1251,9 @@ def test_dataframe_to_cupy(): df = cudf.DataFrame() nelem = 123 + rng = np.random.default_rng(seed=0) for k in "abcd": - df[k] = np.random.random(nelem) + df[k] = rng.random(nelem) # Check all columns mat = df.to_cupy() @@ -1279,8 +1281,9 @@ def test_dataframe_to_cupy_null_values(): na = -10000 refvalues = {} + rng = np.random.default_rng(seed=0) for k in "abcd": - df[k] = data = np.random.random(nelem) + df[k] = data = rng.random(nelem) bitmask = utils.random_bitmask(nelem) df[k] = df[k]._column.set_mask(bitmask) boolmask = np.asarray( @@ -1321,10 +1324,11 @@ def test_dataframe_append_empty(): def test_dataframe_setitem_from_masked_object(): - ary = np.random.randn(100) + rng = np.random.default_rng(seed=0) + ary = rng.standard_normal(100) mask = np.zeros(100, dtype=bool) mask[:20] = True - np.random.shuffle(mask) + rng.shuffle(mask) ary[mask] = np.nan test1_null = cudf.Series(ary, nan_as_null=True) @@ -1534,14 +1538,12 @@ def test_dataframe_hash_values_xxhash64(): @pytest.mark.parametrize("nparts", [1, 2, 8, 13]) @pytest.mark.parametrize("nkeys", [1, 2]) def test_dataframe_hash_partition(nrows, nparts, nkeys): - np.random.seed(123) - gdf = cudf.DataFrame() - keycols = [] - for i in range(nkeys): - keyname = f"key{i}" - gdf[keyname] = np.random.randint(0, 7 - i, nrows) - keycols.append(keyname) - gdf["val1"] = np.random.randint(0, nrows * 2, nrows) + rng = np.random.default_rng(seed=0) + gdf = cudf.DataFrame( + {f"key{i}": rng.integers(0, 7 - i, nrows) for i in range(nkeys)} + ) + keycols = gdf.columns.to_list() + gdf["val1"] = rng.integers(0, nrows * 2, nrows) got = gdf.partition_by_hash(keycols, nparts=nparts) # Must return a list @@ -1751,8 +1753,9 @@ def test_concat_with_axis(): assert_eq(concat_cdf_s, concat_s, check_index_type=True) + rng = np.random.default_rng(seed=0) # concat series and dataframes - s3 = pd.Series(np.random.random(5)) + s3 = pd.Series(rng.random(5)) cs3 = cudf.Series.from_pandas(s3) concat_cdf_all = cudf.concat([cdf1, cs3, cdf2], axis=1) @@ -1787,13 +1790,14 @@ def test_concat_with_axis(): check_index_type=True, ) + rng = np.random.default_rng(seed=0) # concat groupby multi index gdf1 = cudf.DataFrame( { - "x": np.random.randint(0, 10, 10), - "y": np.random.randint(0, 10, 10), - "z": np.random.randint(0, 10, 10), - "v": np.random.randint(0, 10, 10), + "x": rng.integers(0, 10, 10), + "y": rng.integers(0, 10, 10), + "z": rng.integers(0, 10, 10), + "v": rng.integers(0, 10, 10), } ) gdf2 = gdf1[5:] @@ -1833,14 +1837,14 @@ def test_concat_with_axis(): @pytest.mark.parametrize("nrows", [0, 3, 10, 100, 1000]) def test_nonmatching_index_setitem(nrows): - np.random.seed(0) + rng = np.random.default_rng(seed=0) gdf = cudf.DataFrame() - gdf["a"] = np.random.randint(2147483647, size=nrows) - gdf["b"] = np.random.randint(2147483647, size=nrows) + gdf["a"] = rng.integers(2147483647, size=nrows) + gdf["b"] = rng.integers(2147483647, size=nrows) gdf = gdf.set_index("b") - test_values = np.random.randint(2147483647, size=nrows) + test_values = rng.integers(2147483647, size=nrows) gdf["c"] = test_values assert len(test_values) == len(gdf["c"]) gdf_series = cudf.Series(test_values, index=gdf.index, name="c") @@ -1974,10 +1978,11 @@ def test_index_in_dataframe_constructor(): @pytest.mark.parametrize("nelem", [0, 2, 3, 100, 1000]) @pytest.mark.parametrize("data_type", dtypes) def test_from_arrow(nelem, data_type): + rng = np.random.default_rng(seed=0) df = pd.DataFrame( { - "a": np.random.randint(0, 1000, 
nelem).astype(data_type), - "b": np.random.randint(0, 1000, nelem).astype(data_type), + "a": rng.integers(0, 1000, nelem).astype(data_type), + "b": rng.integers(0, 1000, nelem).astype(data_type), } ) padf = pa.Table.from_pandas( @@ -2012,10 +2017,11 @@ def test_from_arrow_chunked_categories(): @pytest.mark.parametrize("nelem", [0, 2, 3, 100, 1000]) @pytest.mark.parametrize("data_type", dtypes) def test_to_arrow(nelem, data_type): + rng = np.random.default_rng(seed=0) df = pd.DataFrame( { - "a": np.random.randint(0, 1000, nelem).astype(data_type), - "b": np.random.randint(0, 1000, nelem).astype(data_type), + "a": rng.integers(0, 1000, nelem).astype(data_type), + "b": rng.integers(0, 1000, nelem).astype(data_type), } ) gdf = cudf.DataFrame.from_pandas(df) @@ -2119,17 +2125,16 @@ def test_to_arrow_missing_categorical(): @pytest.mark.parametrize("data_type", dtypes) def test_from_scalar_typing(data_type): + rng = np.random.default_rng(seed=0) if data_type == "datetime64[ms]": scalar = ( - np.dtype("int64") - .type(np.random.randint(0, 5)) - .astype("datetime64[ms]") + np.dtype("int64").type(rng.integers(0, 5)).astype("datetime64[ms]") ) elif data_type.startswith("datetime64"): scalar = np.datetime64(datetime.date.today()).astype("datetime64[ms]") data_type = "datetime64[ms]" else: - scalar = np.dtype(data_type).type(np.random.randint(0, 5)) + scalar = np.dtype(data_type).type(rng.integers(0, 5)) gdf = cudf.DataFrame() gdf["a"] = [1, 2, 3, 4, 5] @@ -2140,7 +2145,8 @@ def test_from_scalar_typing(data_type): @pytest.mark.parametrize("data_type", NUMERIC_TYPES) def test_from_python_array(data_type): - np_arr = np.random.randint(0, 100, 10).astype(data_type) + rng = np.random.default_rng(seed=0) + np_arr = rng.integers(0, 100, 10).astype(data_type) data = memoryview(np_arr) data = arr.array(data.format, data) @@ -2220,7 +2226,7 @@ def test_dataframe_transpose(nulls, num_cols, num_rows, dtype): # against pandas nullable types as they are the ones that closely # resemble `cudf` dtypes behavior. 
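The hunks above and below all apply the same mechanical translation from the legacy module-level RNG calls to the `numpy.random.Generator` API. A minimal standalone sketch of the correspondence, plain NumPy only (the names `rng`, `n`, and `col` are illustrative):

```python
import numpy as np

rng = np.random.default_rng(seed=0)
n = 5

# np.random.randint(low, high, n)   ->  rng.integers(low, high, n)
# np.random.random(n) / rand(n)     ->  rng.random(n)
# np.random.randn(m, n)             ->  rng.standard_normal(size=(m, n))
# np.random.normal(mu, sigma, n)    ->  rng.normal(mu, sigma, n)
# np.random.choice / shuffle        ->  rng.choice / rng.shuffle

# Generator.integers returns int64 by default, which is why the tests
# keep the explicit .astype(...) cast for each parametrized dtype.
col = rng.integers(0, 1000, n).astype("int32")
assert col.dtype == np.dtype("int32")
```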
pdf = pd.DataFrame() - + rng = np.random.default_rng(seed=0) null_rep = np.nan if dtype in ["float32", "float64"] else None np_dtype = dtype dtype = np.dtype(dtype) @@ -2228,13 +2234,11 @@ def test_dataframe_transpose(nulls, num_cols, num_rows, dtype): for i in range(num_cols): colname = string.ascii_lowercase[i] data = pd.Series( - np.random.randint(0, 26, num_rows).astype(np_dtype), + rng.integers(0, 26, num_rows).astype(np_dtype), dtype=dtype, ) if nulls == "some": - idx = np.random.choice( - num_rows, size=int(num_rows / 2), replace=False - ) + idx = rng.choice(num_rows, size=int(num_rows / 2), replace=False) if len(idx): data[idx] = null_rep elif nulls == "all": @@ -2652,8 +2656,8 @@ def test_unaryops_df(pdf, unaryop, col_name, assign_col_name): def test_df_abs(pdf): - np.random.seed(0) - disturbance = pd.Series(np.random.rand(10)) + rng = np.random.default_rng(seed=0) + disturbance = pd.Series(rng.random(10)) pdf = pdf - 5 + disturbance d = pdf.apply(np.abs) g = cudf.from_pandas(pdf).abs() @@ -2706,8 +2710,9 @@ def test_iteritems(gdf): def test_quantile(q, numeric_only): ts = pd.date_range("2018-08-24", periods=5, freq="D") td = pd.to_timedelta(np.arange(5), unit="h") + rng = np.random.default_rng(seed=0) pdf = pd.DataFrame( - {"date": ts, "delta": td, "val": np.random.randn(len(ts))} + {"date": ts, "delta": td, "val": rng.standard_normal(len(ts))} ) gdf = cudf.DataFrame.from_pandas(pdf) @@ -2729,9 +2734,10 @@ def test_quantile(q, numeric_only): [cudf.Decimal32Dtype, cudf.Decimal64Dtype, cudf.Decimal128Dtype], ) def test_decimal_quantile(q, interpolation, decimal_type): + rng = np.random.default_rng(seed=0) data = ["244.8", "32.24", "2.22", "98.14", "453.23", "5.45"] gdf = cudf.DataFrame( - {"id": np.random.randint(0, 10, size=len(data)), "val": data} + {"id": rng.integers(0, 10, size=len(data)), "val": data} ) gdf["id"] = gdf["id"].astype("float64") gdf["val"] = gdf["val"].astype(decimal_type(7, 2)) @@ -2843,9 +2849,9 @@ def test_cuda_array_interface(dtype): @pytest.mark.parametrize("nchunks", [1, 2, 5, 10]) @pytest.mark.parametrize("data_type", dtypes) def test_from_arrow_chunked_arrays(nelem, nchunks, data_type): + rng = np.random.default_rng(seed=0) np_list_data = [ - np.random.randint(0, 100, nelem).astype(data_type) - for i in range(nchunks) + rng.integers(0, 100, nelem).astype(data_type) for i in range(nchunks) ] pa_chunk_array = pa.chunked_array(np_list_data) @@ -2855,8 +2861,7 @@ def test_from_arrow_chunked_arrays(nelem, nchunks, data_type): assert_eq(expect, got) np_list_data2 = [ - np.random.randint(0, 100, nelem).astype(data_type) - for i in range(nchunks) + rng.integers(0, 100, nelem).astype(data_type) for i in range(nchunks) ] pa_chunk_array2 = pa.chunked_array(np_list_data2) pa_table = pa.Table.from_arrays( @@ -2881,11 +2886,13 @@ def query_GPU_memory(note=""): cuda.current_context().deallocations.clear() nRows = int(1e8) nCols = 2 - dataNumpy = np.asfortranarray(np.random.rand(nRows, nCols)) + rng = np.random.default_rng(seed=0) + dataNumpy = np.asfortranarray(rng.random((nRows, nCols))) colNames = ["col" + str(iCol) for iCol in range(nCols)] pandasDF = pd.DataFrame(data=dataNumpy, columns=colNames, dtype=np.float32) cudaDF = cudf.core.DataFrame.from_pandas(pandasDF) - boolmask = cudf.Series(np.random.randint(1, 2, len(cudaDF)).astype("bool")) + rng = np.random.default_rng(seed=0) + boolmask = cudf.Series(rng.integers(1, 2, len(cudaDF)).astype("bool")) memory_used = query_GPU_memory() cudaDF = cudaDF[boolmask] @@ -2903,7 +2910,8 @@ def 
test_boolmask(pdf, gdf): - boolmask = np.random.randint(0, 2, len(pdf)) > 0 + rng = np.random.default_rng(seed=0) + boolmask = rng.integers(0, 2, len(pdf)) > 0 gdf = gdf[boolmask] pdf = pdf[boolmask] assert_eq(pdf, gdf) @@ -2922,12 +2930,11 @@ def test_boolmask(pdf, gdf): ], ) def test_dataframe_boolmask(mask_shape): - pdf = pd.DataFrame() - for col in "abc": - pdf[col] = np.random.randint(0, 10, 3) - pdf_mask = pd.DataFrame() - for col in mask_shape[1]: - pdf_mask[col] = np.random.randint(0, 2, mask_shape[0]) > 0 + rng = np.random.default_rng(seed=0) + pdf = pd.DataFrame({col: rng.integers(0, 10, 3) for col in "abc"}) + pdf_mask = pd.DataFrame( + {col: rng.integers(0, 2, mask_shape[0]) > 0 for col in mask_shape[1]} + ) gdf = cudf.DataFrame.from_pandas(pdf) gdf_mask = cudf.DataFrame.from_pandas(pdf_mask) gdf = gdf[gdf_mask] @@ -2992,7 +2999,8 @@ def test_arrow_handle_no_index_name(pdf, gdf): def test_pandas_non_contiguious(): - arr1 = np.random.sample([5000, 10]) + rng = np.random.default_rng(seed=0) + arr1 = rng.random(size=(5000, 10)) assert arr1.flags["C_CONTIGUOUS"] is True df = pd.DataFrame(arr1) for col in df.columns: @@ -3052,10 +3060,11 @@ def test_series_rename(): @pytest.mark.parametrize("data_type", dtypes) @pytest.mark.parametrize("nelem", [0, 100]) def test_head_tail(nelem, data_type): + rng = np.random.default_rng(seed=0) pdf = pd.DataFrame( { - "a": np.random.randint(0, 1000, nelem).astype(data_type), - "b": np.random.randint(0, 1000, nelem).astype(data_type), + "a": rng.integers(0, 1000, nelem).astype(data_type), + "b": rng.integers(0, 1000, nelem).astype(data_type), } ) gdf = cudf.from_pandas(pdf) @@ -3308,15 +3317,15 @@ def test_set_index_verify_integrity(data, index, verify_integrity): @pytest.mark.parametrize("drop", [True, False]) @pytest.mark.parametrize("nelem", [10, 200, 1333]) def test_set_index_multi(drop, nelem): - np.random.seed(0) + rng = np.random.default_rng(seed=0) a = np.arange(nelem) - np.random.shuffle(a) + rng.shuffle(a) df = pd.DataFrame( { "a": a, - "b": np.random.randint(0, 4, size=nelem), - "c": np.random.uniform(low=0, high=4, size=nelem), - "d": np.random.choice(["green", "black", "white"], nelem), + "b": rng.integers(0, 4, size=nelem), + "c": rng.uniform(low=0, high=4, size=nelem), + "d": rng.choice(["green", "black", "white"], nelem), } ) df["e"] = df["d"].astype("category") @@ -3894,13 +3903,13 @@ def test_select_dtype_datetime_with_frequency(): def test_dataframe_describe_exclude(): - np.random.seed(12) + rng = np.random.default_rng(seed=12) data_length = 10000 df = cudf.DataFrame() - df["x"] = np.random.normal(10, 1, data_length) + df["x"] = rng.normal(10, 1, data_length) df["x"] = df.x.astype("int64") - df["y"] = np.random.normal(10, 1, data_length) + df["y"] = rng.normal(10, 1, data_length) pdf = df.to_pandas() gdf_results = df.describe(exclude=["float"]) @@ -3910,13 +3919,13 @@ def test_dataframe_describe_exclude(): def test_dataframe_describe_include(): - np.random.seed(12) + rng = np.random.default_rng(seed=12) data_length = 10000 df = cudf.DataFrame() - df["x"] = np.random.normal(10, 1, data_length) + df["x"] = rng.normal(10, 1, data_length) df["x"] = df.x.astype("int64") - df["y"] = np.random.normal(10, 1, data_length) + df["y"] = rng.normal(10, 1, data_length) pdf = df.to_pandas() gdf_results = df.describe(include=["int"]) pdf_results = pdf.describe(include=["int"]) @@ -3925,12 +3934,12 @@ def test_dataframe_describe_include(): def test_dataframe_describe_default(): - np.random.seed(12) + rng = np.random.default_rng(seed=12) 
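The `describe` tests here seed a fresh generator per test rather than calling `np.random.seed`. A short sketch of why that is equivalent for reproducibility, assuming nothing beyond standard NumPy semantics:

```python
import numpy as np

# Generators built from the same seed yield identical streams, so a
# per-test default_rng(seed=12) reproduces the same data on every run
# without touching the process-wide state that np.random.seed(12) mutates.
a = np.random.default_rng(seed=12).normal(10, 1, 1000)
b = np.random.default_rng(seed=12).normal(10, 1, 1000)
np.testing.assert_array_equal(a, b)
```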
data_length = 10000 df = cudf.DataFrame() - df["x"] = np.random.normal(10, 1, data_length) - df["y"] = np.random.normal(10, 1, data_length) + df["x"] = rng.normal(10, 1, data_length) + df["y"] = rng.normal(10, 1, data_length) pdf = df.to_pandas() gdf_results = df.describe() pdf_results = pdf.describe() @@ -3939,14 +3948,14 @@ def test_dataframe_describe_default(): def test_series_describe_include_all(): - np.random.seed(12) + rng = np.random.default_rng(seed=12) data_length = 10000 df = cudf.DataFrame() - df["x"] = np.random.normal(10, 1, data_length) + df["x"] = rng.normal(10, 1, data_length) df["x"] = df.x.astype("int64") - df["y"] = np.random.normal(10, 1, data_length) - df["animal"] = np.random.choice(["dog", "cat", "bird"], data_length) + df["y"] = rng.normal(10, 1, data_length) + df["animal"] = rng.choice(["dog", "cat", "bird"], data_length) pdf = df.to_pandas() gdf_results = df.describe(include="all") @@ -3962,13 +3971,13 @@ def test_series_describe_include_all(): def test_dataframe_describe_percentiles(): - np.random.seed(12) + rng = np.random.default_rng(seed=12) data_length = 10000 sample_percentiles = [0.0, 0.1, 0.33, 0.84, 0.4, 0.99] df = cudf.DataFrame() - df["x"] = np.random.normal(10, 1, data_length) - df["y"] = np.random.normal(10, 1, data_length) + df["x"] = rng.normal(10, 1, data_length) + df["y"] = rng.normal(10, 1, data_length) pdf = df.to_pandas() gdf_results = df.describe(percentiles=sample_percentiles) pdf_results = pdf.describe(percentiles=sample_percentiles) @@ -4098,10 +4107,11 @@ def test_ndim(): ], ) def test_dataframe_round(decimals): + rng = np.random.default_rng(seed=0) gdf = cudf.DataFrame( { "floats": np.arange(0.5, 10.5, 1), - "ints": np.random.normal(-100, 100, 10), + "ints": rng.normal(-100, 100, 10), "floats_with_na": np.array( [ 14.123, @@ -4117,9 +4127,9 @@ def test_dataframe_round(decimals): ] ), "floats_same": np.repeat([-0.6459412758761901], 10), - "bools": np.random.choice([True, None, False], 10), - "strings": np.random.choice(["abc", "xyz", None], 10), - "struct": np.random.choice([{"abc": 1}, {"xyz": 2}, None], 10), + "bools": rng.choice([True, None, False], 10), + "strings": rng.choice(["abc", "xyz", None], 10), + "struct": rng.choice([{"abc": 1}, {"xyz": 2}, None], 10), "list": [[1], [2], None, [4], [3]] * 2, } ) @@ -5811,10 +5821,11 @@ def test_memory_usage(deep, index, set_index): @pytest_xfail def test_memory_usage_string(): rows = int(100) + rng = np.random.default_rng(seed=0) df = pd.DataFrame( { "A": np.arange(rows, dtype="int32"), - "B": np.random.choice(["apple", "banana", "orange"], rows), + "B": rng.choice(["apple", "banana", "orange"], rows), } ) gdf = cudf.from_pandas(df) @@ -5837,10 +5848,11 @@ def test_memory_usage_string(): def test_memory_usage_cat(): rows = int(100) + rng = np.random.default_rng(seed=0) df = pd.DataFrame( { "A": np.arange(rows, dtype="int32"), - "B": np.random.choice(["apple", "banana", "orange"], rows), + "B": rng.choice(["apple", "banana", "orange"], rows), } ) df["B"] = df.B.astype("category") @@ -5870,13 +5882,14 @@ def test_memory_usage_list(): def test_memory_usage_multi(rows): # We need to sample without replacement to guarantee that the size of the # levels are always the same. 
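A minimal sketch of the guarantee that comment relies on, in plain NumPy (`rows` and `col` are illustrative names): with `replace=False`, `Generator.choice` draws a permutation, so every value is distinct and each resulting MultiIndex level has a fixed size.

```python
import numpy as np

rng = np.random.default_rng(seed=0)
rows = 10

# replace=False => a permutation of the input: no duplicates, so the
# number of distinct values (the level size) is always exactly `rows`.
col = rng.choice(np.arange(rows, dtype="int64"), rows, replace=False)
assert len(np.unique(col)) == rows
```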
+ rng = np.random.default_rng(seed=0) df = pd.DataFrame( { "A": np.arange(rows, dtype="int32"), - "B": np.random.choice( + "B": rng.choice( np.arange(rows, dtype="int64"), rows, replace=False ), - "C": np.random.choice( + "C": rng.choice( np.arange(rows, dtype="float64"), rows, replace=False ), } @@ -6698,8 +6711,16 @@ def test_dataframe_init_1d_list(data, columns): (cupy.array([11, 123, -2342, 232]), ["z"], [0, 1, 1, 0]), (cupy.array([11, 123, -2342, 232]), ["z"], [1, 2, 3, 4]), (cupy.array([11, 123, -2342, 232]), ["z"], ["a", "z", "d", "e"]), - (np.random.randn(2, 4), ["a", "b", "c", "d"], ["a", "b"]), - (np.random.randn(2, 4), ["a", "b", "c", "d"], [1, 0]), + ( + np.random.default_rng(seed=0).standard_normal(size=(2, 4)), + ["a", "b", "c", "d"], + ["a", "b"], + ), + ( + np.random.default_rng(seed=0).standard_normal(size=(2, 4)), + ["a", "b", "c", "d"], + [1, 0], + ), (cupy.random.randn(2, 4), ["a", "b", "c", "d"], ["a", "b"]), (cupy.random.randn(2, 4), ["a", "b", "c", "d"], [1, 0]), ], @@ -6873,8 +6894,9 @@ def test_dataframe_info_basic(): memory usage: 859.0+ bytes """ ) + rng = np.random.default_rng(seed=0) df = pd.DataFrame( - np.random.randn(10, 10), + rng.standard_normal(size=(10, 10)), index=["a", "2", "3", "4", "5", "6", "7", "8", "100", "1111"], ) cudf.from_pandas(df).info(buf=buffer, verbose=True) @@ -9374,8 +9396,8 @@ def test_dataframe_roundtrip_arrow_struct_dtype(gdf): def test_dataframe_setitem_cupy_array(): - np.random.seed(0) - pdf = pd.DataFrame(np.random.randn(10, 2)) + rng = np.random.default_rng(seed=0) + pdf = pd.DataFrame(rng.standard_normal(size=(10, 2))) gdf = cudf.from_pandas(pdf) gpu_array = cupy.array([True, False] * 5) @@ -10161,7 +10183,7 @@ def df_eval(request): } ) int_max = 10 - rng = cupy.random.default_rng(0) + rng = cupy.random.default_rng(seed=0) return cudf.DataFrame( { "a": rng.integers(N, size=int_max), @@ -10529,11 +10551,12 @@ def test_dataframe_init_length_error(data, index): def test_dataframe_binop_with_mixed_date_types(): + rng = np.random.default_rng(seed=0) df = pd.DataFrame( - np.random.rand(2, 2), + rng.random(size=(2, 2)), columns=pd.Index(["2000-01-03", "2000-01-04"], dtype="datetime64[ns]"), ) - ser = pd.Series(np.random.rand(3), index=[0, 1, 2]) + ser = pd.Series(rng.random(size=3), index=[0, 1, 2]) gdf = cudf.from_pandas(df) gser = cudf.from_pandas(ser) expected = df - ser @@ -10542,9 +10565,10 @@ def test_dataframe_binop_with_mixed_date_types(): def test_dataframe_binop_with_mixed_string_types(): - df1 = pd.DataFrame(np.random.rand(3, 3), columns=pd.Index([0, 1, 2])) + rng = np.random.default_rng(seed=0) + df1 = pd.DataFrame(rng.random(size=(3, 3)), columns=pd.Index([0, 1, 2])) df2 = pd.DataFrame( - np.random.rand(6, 6), + rng.random(size=(6, 6)), columns=pd.Index([0, 1, 2, "VhDoHxRaqt", "X0NNHBIPfA", "5FbhPtS0D1"]), ) gdf1 = cudf.from_pandas(df1) @@ -10557,7 +10581,8 @@ def test_dataframe_binop_with_mixed_string_types(): def test_dataframe_binop_and_where(): - df = pd.DataFrame(np.random.rand(2, 2), columns=pd.Index([True, False])) + rng = np.random.default_rng(seed=0) + df = pd.DataFrame(rng.random(size=(2, 2)), columns=pd.Index([True, False])) gdf = cudf.from_pandas(df) expected = df > 1 @@ -10572,12 +10597,13 @@ def test_dataframe_binop_and_where(): def test_dataframe_binop_with_datetime_index(): + rng = np.random.default_rng(seed=0) df = pd.DataFrame( - np.random.rand(2, 2), + rng.random(size=(2, 2)), columns=pd.Index(["2000-01-03", "2000-01-04"], dtype="datetime64[ns]"), ) ser = pd.Series( - np.random.rand(2), + rng.random(2), 
index=pd.Index( [ "2000-01-04", @@ -10615,8 +10641,8 @@ def test_dataframe_dict_like_with_columns(columns, index): def test_dataframe_init_columns_named_multiindex(): - np.random.seed(0) - data = np.random.randn(2, 2) + rng = np.random.default_rng(seed=0) + data = rng.standard_normal(size=(2, 2)) columns = cudf.MultiIndex.from_tuples( [("A", "one"), ("A", "two")], names=["y", "z"] ) @@ -10627,8 +10653,8 @@ def test_dataframe_init_columns_named_multiindex(): def test_dataframe_init_columns_named_index(): - np.random.seed(0) - data = np.random.randn(2, 2) + rng = np.random.default_rng(seed=0) + data = rng.standard_normal(size=(2, 2)) columns = pd.Index(["a", "b"], name="custom_name") gdf = cudf.DataFrame(data, columns=columns) pdf = pd.DataFrame(data, columns=columns) @@ -11146,3 +11172,12 @@ def test_from_pandas_preserve_column_dtype(): df = pd.DataFrame([[1, 2]], columns=pd.Index([1, 2], dtype="int8")) result = cudf.DataFrame.from_pandas(df) pd.testing.assert_index_equal(result.columns, df.columns, exact=True) + + +def test_dataframe_init_column(): + s = cudf.Series([1, 2, 3]) + with pytest.raises(TypeError): + cudf.DataFrame(s._column) + expect = cudf.DataFrame({"a": s}) + actual = cudf.DataFrame._from_arrays(s._column, columns=["a"]) + assert_eq(expect, actual) diff --git a/python/cudf/cudf/tests/test_dataframe_copy.py b/python/cudf/cudf/tests/test_dataframe_copy.py index 45bd31ef58e..3aedbf8365b 100644 --- a/python/cudf/cudf/tests/test_dataframe_copy.py +++ b/python/cudf/cudf/tests/test_dataframe_copy.py @@ -93,11 +93,15 @@ def test_dataframe_deep_copy_and_insert(copy_parameters): @pytest.mark.parametrize("ncols", [0, 1, 10]) @pytest.mark.parametrize("data_type", ALL_TYPES) def test_cudf_dataframe_copy(copy_fn, ncols, data_type): - pdf = pd.DataFrame() - for i in range(ncols): - pdf[chr(i + ord("a"))] = pd.Series( - np.random.randint(0, 1000, 20) - ).astype(data_type) + rng = np.random.default_rng(seed=0) + pdf = pd.DataFrame( + { + chr(i + ord("a")): pd.Series(rng.integers(0, 1000, 20)).astype( + data_type + ) + for i in range(ncols) + } + ) df = DataFrame.from_pandas(pdf) copy_df = copy_fn(df) assert_eq(df, copy_df) @@ -116,18 +120,20 @@ def test_cudf_dataframe_copy(copy_fn, ncols, data_type): @pytest.mark.parametrize("ncols", [0, 1, 10]) @pytest.mark.parametrize("data_type", ALL_TYPES) def test_cudf_dataframe_copy_then_insert(copy_fn, ncols, data_type): - pdf = pd.DataFrame() - for i in range(ncols): - pdf[chr(i + ord("a"))] = pd.Series( - np.random.randint(0, 1000, 20) - ).astype(data_type) + rng = np.random.default_rng(seed=0) + pdf = pd.DataFrame( + { + chr(i + ord("a")): pd.Series(rng.integers(0, 1000, 20)).astype( + data_type + ) + for i in range(ncols) + } + ) df = DataFrame.from_pandas(pdf) copy_df = copy_fn(df) copy_pdf = copy_fn(pdf) - copy_df["aa"] = pd.Series(np.random.randint(0, 1000, 20)).astype(data_type) - copy_pdf["aa"] = pd.Series(np.random.randint(0, 1000, 20)).astype( - data_type - ) + copy_df["aa"] = pd.Series(rng.integers(0, 1000, 20)).astype(data_type) + copy_pdf["aa"] = pd.Series(rng.integers(0, 1000, 20)).astype(data_type) assert not copy_pdf.to_string().split() == pdf.to_string().split() assert not copy_df.to_string().split() == df.to_string().split() diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index 4a2345fc009..b7403c12bcd 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -216,17 +216,21 @@ def test_setitem_datetime(): def test_sort_datetime(): - df = 
pd.DataFrame() - df["date"] = np.array( - [ - np.datetime64("2016-11-20"), - np.datetime64("2020-11-20"), - np.datetime64("2019-11-20"), - np.datetime64("1918-11-20"), - np.datetime64("2118-11-20"), - ] + rng = np.random.default_rng(seed=0) + df = pd.DataFrame( + { + "date": np.array( + [ + np.datetime64("2016-11-20"), + np.datetime64("2020-11-20"), + np.datetime64("2019-11-20"), + np.datetime64("1918-11-20"), + np.datetime64("2118-11-20"), + ] + ), + "vals": rng.random(5), + } ) - df["vals"] = np.random.sample(len(df["date"])) gdf = cudf.from_pandas(df) @@ -432,11 +436,12 @@ def test_datetime_to_arrow(dtype): ) @pytest.mark.parametrize("nulls", ["none", "some"]) def test_datetime_unique(data, nulls): + rng = np.random.default_rng(seed=0) psr = data.copy() if len(data) > 0: if nulls == "some": - p = np.random.randint(0, len(data), 2) + p = rng.integers(0, len(data), 2) psr[p] = None gsr = cudf.from_pandas(psr) @@ -461,10 +466,11 @@ def test_datetime_unique(data, nulls): @pytest.mark.parametrize("nulls", ["none", "some"]) def test_datetime_nunique(data, nulls): psr = data.copy() + rng = np.random.default_rng(seed=0) if len(data) > 0: if nulls == "some": - p = np.random.randint(0, len(data), 2) + p = rng.integers(0, len(data), 2) psr[p] = None gsr = cudf.from_pandas(psr) @@ -2525,23 +2531,7 @@ def test_dti_asi8(): @pytest.mark.parametrize( "method, kwargs", - [ - ["mean", {}], - pytest.param( - "std", - {}, - marks=pytest.mark.xfail( - reason="https://github.com/rapidsai/cudf/issues/16444" - ), - ), - pytest.param( - "std", - {"ddof": 0}, - marks=pytest.mark.xfail( - reason="https://github.com/rapidsai/cudf/issues/16444" - ), - ), - ], + [["mean", {}], ["std", {}], ["std", {"ddof": 0}]], ) def test_dti_reduction(method, kwargs): pd_dti = pd.DatetimeIndex(["2020-01-01", "2020-12-31"], name="foo") diff --git a/python/cudf/cudf/tests/test_dlpack.py b/python/cudf/cudf/tests/test_dlpack.py index ebcc35784ee..20c24bd7564 100644 --- a/python/cudf/cudf/tests/test_dlpack.py +++ b/python/cudf/cudf/tests/test_dlpack.py @@ -42,9 +42,10 @@ def data_1d(request): nelems = request.param[0] dtype = request.param[1] nulls = request.param[2] - a = np.random.randint(10, size=nelems).astype(dtype) + rng = np.random.default_rng(seed=0) + a = rng.integers(10, size=nelems).astype(dtype) if nulls == "some" and a.size != 0 and np.issubdtype(dtype, np.floating): - idx = np.random.choice(a.size, size=int(a.size * 0.2), replace=False) + idx = rng.choice(a.size, size=int(a.size * 0.2), replace=False) a[idx] = np.nan return a @@ -55,9 +56,10 @@ def data_2d(request): nrows = request.param[1] dtype = request.param[2] nulls = request.param[3] - a = np.random.randint(10, size=(nrows, ncols)).astype(dtype) + rng = np.random.default_rng(seed=0) + a = rng.integers(10, size=(nrows, ncols)).astype(dtype) if nulls == "some" and a.size != 0 and np.issubdtype(dtype, np.floating): - idx = np.random.choice(a.size, size=int(a.size * 0.2), replace=False) + idx = rng.choice(a.size, size=int(a.size * 0.2), replace=False) a.ravel()[idx] = np.nan return np.ascontiguousarray(a) diff --git a/python/cudf/cudf/tests/test_dropna.py b/python/cudf/cudf/tests/test_dropna.py index 5b1ee0ffac6..eeac78dbebc 100644 --- a/python/cudf/cudf/tests/test_dropna.py +++ b/python/cudf/cudf/tests/test_dropna.py @@ -22,13 +22,13 @@ @pytest.mark.parametrize("inplace", [True, False]) def test_dropna_series(data, nulls, inplace): psr = pd.Series(data) - + rng = np.random.default_rng(seed=0) if len(data) > 0: if nulls == "one": - p = np.random.randint(0, 4) + p = 
rng.integers(0, 4) psr[p] = None elif nulls == "some": - p1, p2 = np.random.randint(0, 4, (2,)) + p1, p2 = rng.integers(0, 4, (2,)) psr[p1] = None psr[p2] = None elif nulls == "all": diff --git a/python/cudf/cudf/tests/test_duplicates.py b/python/cudf/cudf/tests/test_duplicates.py index 0b4ed52ba96..67dd7a8388b 100644 --- a/python/cudf/cudf/tests/test_duplicates.py +++ b/python/cudf/cudf/tests/test_duplicates.py @@ -368,9 +368,13 @@ def test_dataframe_drop_duplicates_method(): def test_datetime_drop_duplicates(): - date_df = cudf.DataFrame() - date_df["date"] = pd.date_range("11/20/2018", periods=6, freq="D") - date_df["value"] = np.random.sample(len(date_df)) + rng = np.random.default_rng(seed=0) + date_df = cudf.DataFrame( + { + "date": pd.date_range("11/20/2018", periods=6, freq="D"), + "value": rng.random(6), + } + ) df = concat([date_df, date_df[:4]]) assert_eq(df[:-4], df.drop_duplicates()) @@ -585,7 +589,8 @@ def test_drop_duplicates_multi_index(): ] idx = pd.MultiIndex.from_tuples(list(zip(*arrays)), names=["a", "b"]) - pdf = pd.DataFrame(np.random.randint(0, 2, (8, 4)), index=idx) + rng = np.random.default_rng(seed=0) + pdf = pd.DataFrame(rng.integers(0, 2, (8, 4)), index=idx) gdf = cudf.DataFrame.from_pandas(pdf) expected = pdf.drop_duplicates() diff --git a/python/cudf/cudf/tests/test_factorize.py b/python/cudf/cudf/tests/test_factorize.py index 47f9180dcb1..cfb4ae2c0f8 100644 --- a/python/cudf/cudf/tests/test_factorize.py +++ b/python/cudf/cudf/tests/test_factorize.py @@ -13,13 +13,16 @@ @pytest.mark.parametrize("ncats,nelem", [(2, 2), (2, 10), (10, 100)]) def test_factorize_series_obj(ncats, nelem): df = DataFrame() - np.random.seed(0) + rng = np.random.default_rng(seed=0) # initialize data frame - df["cats"] = arr = np.random.randint(2, size=10, dtype=np.int32) + df["cats"] = arr = rng.integers(2, size=10, dtype=np.int32) uvals, labels = df["cats"].factorize() - np.testing.assert_array_equal(labels.to_numpy(), sorted(set(arr))) + unique_values, indices = np.unique(arr, return_index=True) + expected_values = unique_values[np.argsort(indices)] + + np.testing.assert_array_equal(labels.to_numpy(), expected_values) assert isinstance(uvals, cp.ndarray) assert isinstance(labels, Index) @@ -31,14 +34,17 @@ def test_factorize_series_obj(ncats, nelem): @pytest.mark.parametrize("ncats,nelem", [(2, 2), (2, 10), (10, 100)]) def test_factorize_index_obj(ncats, nelem): df = DataFrame() - np.random.seed(0) + rng = np.random.default_rng(seed=0) # initialize data frame - df["cats"] = arr = np.random.randint(2, size=10, dtype=np.int32) + df["cats"] = arr = rng.integers(2, size=10, dtype=np.int32) df = df.set_index("cats") uvals, labels = df.index.factorize() - np.testing.assert_array_equal(labels.values.get(), sorted(set(arr))) + unique_values, indices = np.unique(arr, return_index=True) + expected_values = unique_values[np.argsort(indices)] + + np.testing.assert_array_equal(labels.values.get(), expected_values) assert isinstance(uvals, cp.ndarray) assert isinstance(labels, Index) diff --git a/python/cudf/cudf/tests/test_feather.py b/python/cudf/cudf/tests/test_feather.py index 7e5523bb8c7..f93bd2c5d32 100644 --- a/python/cudf/cudf/tests/test_feather.py +++ b/python/cudf/cudf/tests/test_feather.py @@ -15,13 +15,14 @@ @pytest.fixture(params=[0, 1, 10, 100]) def pdf(request): + rng = np.random.default_rng(seed=0) types = NUMERIC_TYPES + ["bool"] nrows = request.param # Create a pandas dataframe with random data of mixed types test_pdf = pd.DataFrame( { - f"col_{typ}": np.random.randint(0, nrows, 
nrows).astype(typ) + f"col_{typ}": rng.integers(0, nrows, nrows).astype(typ) for typ in types } ) @@ -30,7 +31,7 @@ def pdf(request): test_pdf.index.name = "index" # Create non-numeric categorical data otherwise may get typecasted - data = [ascii_letters[np.random.randint(0, 52)] for i in range(nrows)] + data = [ascii_letters[rng.integers(0, 52)] for i in range(nrows)] test_pdf["col_category"] = pd.Series(data, dtype="category") # Feather can't handle indexes properly diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index 848bc259e7b..e4422e204bc 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -77,21 +77,21 @@ def make_frame( extra_vals=(), with_datetime=False, ): - np.random.seed(seed) + rng = np.random.default_rng(seed=seed) df = dataframe_class() - df["x"] = np.random.randint(0, 5, nelem) - df["y"] = np.random.randint(0, 3, nelem) + df["x"] = rng.integers(0, 5, nelem) + df["y"] = rng.integers(0, 3, nelem) for lvl in extra_levels: - df[lvl] = np.random.randint(0, 2, nelem) + df[lvl] = rng.integers(0, 2, nelem) - df["val"] = np.random.random(nelem) + df["val"] = rng.random(nelem) for val in extra_vals: - df[val] = np.random.random(nelem) + df[val] = rng.random(nelem) if with_datetime: - df["datetime"] = np.random.randint( + df["datetime"] = rng.integers( _now, _tomorrow, nelem, dtype=np.int64 ).astype("datetime64[ns]") @@ -266,9 +266,10 @@ def test_groupby_getitem_getattr(as_index): def test_groupby_cats(): - df = DataFrame() - df["cats"] = pd.Categorical(list("aabaacaab")) - df["vals"] = np.random.random(len(df)) + rng = np.random.default_rng(seed=0) + df = DataFrame( + {"cats": pd.Categorical(list("aabaacaab")), "vals": rng.random(9)} + ) cats = df["cats"].values_host vals = df["vals"].to_numpy() @@ -285,13 +286,16 @@ def test_groupby_cats(): def test_groupby_iterate_groups(): - np.random.seed(0) - df = DataFrame() + rng = np.random.default_rng(seed=0) nelem = 20 - df["key1"] = np.random.randint(0, 3, nelem) - df["key2"] = np.random.randint(0, 2, nelem) - df["val1"] = np.random.random(nelem) - df["val2"] = np.random.random(nelem) + df = DataFrame( + { + "key1": rng.integers(0, 3, nelem), + "key2": rng.integers(0, 2, nelem), + "val1": rng.random(nelem), + "val2": rng.random(nelem), + } + ) def assert_values_equal(arr): np.testing.assert_array_equal(arr[0], arr) @@ -307,13 +311,16 @@ def assert_values_equal(arr): reason="Fails in older versions of pandas", ) def test_groupby_apply(): - np.random.seed(0) - df = DataFrame() + rng = np.random.default_rng(seed=0) nelem = 20 - df["key1"] = np.random.randint(0, 3, nelem) - df["key2"] = np.random.randint(0, 2, nelem) - df["val1"] = np.random.random(nelem) - df["val2"] = np.random.random(nelem) + df = DataFrame( + { + "key1": rng.integers(0, 3, nelem), + "key2": rng.integers(0, 2, nelem), + "val1": rng.random(nelem), + "val2": rng.random(nelem), + } + ) expect_grpby = df.to_pandas().groupby( ["key1", "key2"], as_index=False, group_keys=False @@ -351,13 +358,16 @@ def f3(df, k, L, m): reason="Fails in older versions of pandas", ) def test_groupby_apply_args(func, args): - np.random.seed(0) - df = DataFrame() + rng = np.random.default_rng(seed=0) nelem = 20 - df["key1"] = np.random.randint(0, 3, nelem) - df["key2"] = np.random.randint(0, 2, nelem) - df["val1"] = np.random.random(nelem) - df["val2"] = np.random.random(nelem) + df = DataFrame( + { + "key1": rng.integers(0, 3, nelem), + "key2": rng.integers(0, 2, nelem), + "val1": rng.random(nelem), + 
"val2": rng.random(nelem), + } + ) expect_grpby = df.to_pandas().groupby( ["key1", "key2"], as_index=False, group_keys=False @@ -369,7 +379,6 @@ def test_groupby_apply_args(func, args): def test_groupby_apply_grouped(): - np.random.seed(0) df = DataFrame() nelem = 20 df["key1"] = range(nelem) @@ -1010,6 +1019,7 @@ def test_groupby_2keys_agg(nelem, func): # "func", ["min", "max", "idxmin", "idxmax", "count", "sum"], ) def test_groupby_agg_decimal(num_groups, nelem_per_group, func): + rng = np.random.default_rng(seed=0) # The number of digits after the decimal to use. decimal_digits = 2 # The number of digits before the decimal to use. @@ -1026,8 +1036,8 @@ def test_groupby_agg_decimal(num_groups, nelem_per_group, func): # https://github.com/pandas-dev/pandas/issues/40685). However, if that is # ever enabled, then this issue will crop up again so we may as well have # it fixed now. - x = np.unique((np.random.rand(nelem) * scale).round(decimal_digits)) - y = np.unique((np.random.rand(nelem) * scale).round(decimal_digits)) + x = np.unique((rng.random(nelem) * scale).round(decimal_digits)) + y = np.unique((rng.random(nelem) * scale).round(decimal_digits)) if x.size < y.size: total_elements = x.size @@ -1313,9 +1323,9 @@ def test_empty_groupby(func): def test_groupby_unsupported_columns(): - np.random.seed(12) + rng = np.random.default_rng(seed=12) pd_cat = pd.Categorical( - pd.Series(np.random.choice(["a", "b", 1], 3), dtype="category") + pd.Series(rng.choice(["a", "b", 1], 3), dtype="category") ) pdf = pd.DataFrame( { @@ -1421,10 +1431,11 @@ def test_groupby_apply_basic_agg_single_column(): def test_groupby_multi_agg_single_groupby_series(): + rng = np.random.default_rng(seed=0) pdf = pd.DataFrame( { - "x": np.random.randint(0, 5, size=10000), - "y": np.random.normal(size=10000), + "x": rng.integers(0, 5, size=10000), + "y": rng.normal(size=10000), } ) gdf = cudf.from_pandas(pdf) @@ -1435,12 +1446,13 @@ def test_groupby_multi_agg_single_groupby_series(): def test_groupby_multi_agg_multi_groupby(): + rng = np.random.default_rng(seed=0) pdf = pd.DataFrame( { - "a": np.random.randint(0, 5, 10), - "b": np.random.randint(0, 5, 10), - "c": np.random.randint(0, 5, 10), - "d": np.random.randint(0, 5, 10), + "a": rng.integers(0, 5, 10), + "b": rng.integers(0, 5, 10), + "c": rng.integers(0, 5, 10), + "d": rng.integers(0, 5, 10), } ) gdf = cudf.from_pandas(pdf) @@ -1450,6 +1462,7 @@ def test_groupby_multi_agg_multi_groupby(): def test_groupby_datetime_multi_agg_multi_groupby(): + rng = np.random.default_rng(seed=0) pdf = pd.DataFrame( { "a": pd.date_range( @@ -1457,9 +1470,9 @@ def test_groupby_datetime_multi_agg_multi_groupby(): datetime.datetime.now() + datetime.timedelta(9), freq="D", ), - "b": np.random.randint(0, 5, 10), - "c": np.random.randint(0, 5, 10), - "d": np.random.randint(0, 5, 10), + "b": rng.integers(0, 5, 10), + "c": rng.integers(0, 5, 10), + "d": rng.integers(0, 5, 10), } ) gdf = cudf.from_pandas(pdf) @@ -1940,6 +1953,23 @@ def test_groupby_nunique(agg, by): assert_groupby_results_equal(expect, got, check_dtype=False) +@pytest.mark.parametrize("dropna", [True, False]) +def test_nunique_dropna(dropna): + gdf = cudf.DataFrame( + { + "a": [1, 1, 2], + "b": [4, None, 5], + "c": [None, None, 7], + "d": [1, 1, 3], + } + ) + pdf = gdf.to_pandas() + + result = gdf.groupby("a")["b"].nunique(dropna=dropna) + expected = pdf.groupby("a")["b"].nunique(dropna=dropna) + assert_groupby_results_equal(result, expected, check_dtype=False) + + @pytest.mark.parametrize( "n", [0, 1, 2, 10], @@ -4029,3 
+4059,19 @@ def test_ndim(): pgb = pser.groupby([0, 0, 1]) ggb = gser.groupby(cudf.Series([0, 0, 1])) assert pgb.ndim == ggb.ndim + + +@pytest.mark.skipif( + not PANDAS_GE_220, reason="pandas behavior applicable in >=2.2" +) +def test_get_group_list_like(): + df = cudf.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + result = df.groupby(["a"]).get_group((1,)) + expected = df.to_pandas().groupby(["a"]).get_group((1,)) + assert_eq(result, expected) + + with pytest.raises(KeyError): + df.groupby(["a"]).get_group((1, 2)) + + with pytest.raises(KeyError): + df.groupby(["a"]).get_group([1]) diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index 3f483219423..24d42d9eb4c 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -2645,21 +2645,20 @@ def test_isin_multiindex(data, values, level, err): ) -range_data = [ - range(np.random.randint(0, 100)), - range(9, 12, 2), - range(20, 30), - range(100, 1000, 10), - range(0, 10, -2), - range(0, -10, 2), - range(0, -10, -2), -] - - -@pytest.fixture(params=range_data) +@pytest.fixture( + params=[ + range(np.random.default_rng(seed=0).integers(0, 100)), + range(9, 12, 2), + range(20, 30), + range(100, 1000, 10), + range(0, 10, -2), + range(0, -10, 2), + range(0, -10, -2), + ] +) def rangeindex(request): """Create a cudf RangeIndex of different `nrows`""" - return RangeIndex(request.param) + return cudf.RangeIndex(request.param) @pytest.mark.parametrize( @@ -2830,21 +2829,20 @@ def test_rangeindex_append_return_rangeindex(): assert_eq(result, expected) -index_data = [ - range(np.random.randint(0, 100)), - range(0, 10, -2), - range(0, -10, 2), - range(0, -10, -2), - range(0, 1), - [1, 2, 3, 1, None, None], - [None, None, 3.2, 1, None, None], - [None, "a", "3.2", "z", None, None], - pd.Series(["a", "b", None], dtype="category"), - np.array([1, 2, 3, None], dtype="datetime64[s]"), -] - - -@pytest.fixture(params=index_data) +@pytest.fixture( + params=[ + range(np.random.default_rng(seed=0).integers(0, 100)), + range(0, 10, -2), + range(0, -10, 2), + range(0, -10, -2), + range(0, 1), + [1, 2, 3, 1, None, None], + [None, None, 3.2, 1, None, None], + [None, "a", "3.2", "z", None, None], + pd.Series(["a", "b", None], dtype="category"), + np.array([1, 2, 3, None], dtype="datetime64[s]"), + ] +) def index(request): """Create a cudf Index of different dtypes""" return cudf.Index(request.param) diff --git a/python/cudf/cudf/tests/test_indexing.py b/python/cudf/cudf/tests/test_indexing.py index 00ae99466bb..421bc0c298b 100644 --- a/python/cudf/cudf/tests/test_indexing.py +++ b/python/cudf/cudf/tests/test_indexing.py @@ -32,7 +32,8 @@ def pdf_gdf(): @pytest.fixture def pdf_gdf_multi(): - pdf = pd.DataFrame(np.random.rand(7, 5)) + rng = np.random.default_rng(seed=0) + pdf = pd.DataFrame(rng.random(size=(7, 5))) pdfIndex = pd.MultiIndex( [ ["a", "b", "c"], @@ -212,12 +213,17 @@ def test_dataframe_column_name_indexing(): df[1].to_numpy(), np.asarray(range(10), dtype=np.int32) ) + rng = np.random.default_rng(seed=0) pdf = pd.DataFrame() nelem = 10 - pdf["key1"] = np.random.randint(0, 5, nelem) - pdf["key2"] = np.random.randint(0, 3, nelem) - pdf[1] = np.arange(1, 1 + nelem) - pdf[2] = np.random.random(nelem) + pdf = pd.DataFrame( + { + "key1": rng.integers(0, 5, nelem), + "key2": rng.integers(0, 3, nelem), + 1: np.arange(1, 1 + nelem), + 2: rng.random(nelem), + } + ) df = cudf.from_pandas(pdf) assert_eq(df[df.columns], df) @@ -239,16 +245,13 @@ def test_dataframe_column_name_indexing(): def 
test_dataframe_slicing(): + rng = np.random.default_rng(seed=0) df = cudf.DataFrame() size = 123 - df["a"] = ha = np.random.randint(low=0, high=100, size=size).astype( - np.int32 - ) - df["b"] = hb = np.random.random(size).astype(np.float32) - df["c"] = hc = np.random.randint(low=0, high=100, size=size).astype( - np.int64 - ) - df["d"] = hd = np.random.random(size).astype(np.float64) + df["a"] = ha = rng.integers(low=0, high=100, size=size).astype(np.int32) + df["b"] = hb = rng.random(size).astype(np.float32) + df["c"] = hc = rng.integers(low=0, high=100, size=size).astype(np.int64) + df["d"] = hd = rng.random(size).astype(np.float64) # Row slice first 10 first_10 = df[:10] @@ -287,12 +290,13 @@ def test_dataframe_slicing(): @pytest.mark.parametrize("scalar", [0, 20, 100]) def test_dataframe_loc(scalar, step): size = 123 + rng = np.random.default_rng(seed=0) pdf = pd.DataFrame( { - "a": np.random.randint(low=0, high=100, size=size), - "b": np.random.random(size).astype(np.float32), - "c": np.random.random(size).astype(np.float64), - "d": np.random.random(size).astype(np.float64), + "a": rng.integers(low=0, high=100, size=size), + "b": rng.random(size).astype(np.float32), + "c": rng.random(size).astype(np.float64), + "d": rng.random(size).astype(np.float64), } ) pdf.index.name = "index" @@ -392,12 +396,11 @@ def test_dataframe_loc_mask(mask, arg): def test_dataframe_loc_outbound(): + rng = np.random.default_rng(seed=0) df = cudf.DataFrame() size = 10 - df["a"] = ha = np.random.randint(low=0, high=100, size=size).astype( - np.int32 - ) - df["b"] = hb = np.random.random(size).astype(np.float32) + df["a"] = ha = rng.integers(low=0, high=100, size=size).astype(np.int32) + df["b"] = hb = rng.random(size).astype(np.float32) pdf = pd.DataFrame() pdf["a"] = ha @@ -590,8 +593,8 @@ def test_dataframe_series_loc_multiindex(obj): @pytest.mark.parametrize("nelem", [2, 5, 20, 100]) def test_series_iloc(nelem): # create random cudf.Series - np.random.seed(12) - ps = pd.Series(np.random.sample(nelem)) + rng = np.random.default_rng(seed=0) + ps = pd.Series(rng.random(nelem)) # gpu cudf.Series gs = cudf.Series(ps) @@ -625,12 +628,11 @@ def test_series_iloc(nelem): @pytest.mark.parametrize("nelem", [2, 5, 20, 100]) def test_dataframe_iloc(nelem): + rng = np.random.default_rng(seed=0) gdf = cudf.DataFrame() - gdf["a"] = ha = np.random.randint(low=0, high=100, size=nelem).astype( - np.int32 - ) - gdf["b"] = hb = np.random.random(nelem).astype(np.float32) + gdf["a"] = ha = rng.integers(low=0, high=100, size=nelem).astype(np.int32) + gdf["b"] = hb = rng.random(nelem).astype(np.float32) pdf = pd.DataFrame() pdf["a"] = ha @@ -679,12 +681,11 @@ def test_dataframe_iloc(nelem): def test_dataframe_iloc_tuple(): + rng = np.random.default_rng(seed=0) gdf = cudf.DataFrame() nelem = 123 - gdf["a"] = ha = np.random.randint(low=0, high=100, size=nelem).astype( - np.int32 - ) - gdf["b"] = hb = np.random.random(nelem).astype(np.float32) + gdf["a"] = ha = rng.integers(low=0, high=100, size=nelem).astype(np.int32) + gdf["b"] = hb = rng.random(nelem).astype(np.float32) pdf = pd.DataFrame() pdf["a"] = ha @@ -695,12 +696,11 @@ def test_dataframe_iloc_tuple(): def test_dataframe_iloc_index_error(): + rng = np.random.default_rng(seed=0) gdf = cudf.DataFrame() nelem = 123 - gdf["a"] = ha = np.random.randint(low=0, high=100, size=nelem).astype( - np.int32 - ) - gdf["b"] = hb = np.random.random(nelem).astype(np.float32) + gdf["a"] = ha = rng.integers(low=0, high=100, size=nelem).astype(np.int32) + gdf["b"] = hb = 
rng.random(nelem).astype(np.float32) pdf = pd.DataFrame() pdf["a"] = ha @@ -714,14 +714,16 @@ def test_dataframe_iloc_index_error(): @pytest.mark.parametrize("ntake", [0, 1, 10, 123, 122, 200]) def test_dataframe_take(ntake): - np.random.seed(0) - df = cudf.DataFrame() - + rng = np.random.default_rng(seed=0) nelem = 123 - df["ii"] = np.random.randint(0, 20, nelem) - df["ff"] = np.random.random(nelem) + df = cudf.DataFrame( + { + "ii": rng.integers(0, 20, nelem), + "ff": rng.random(nelem), + } + ) - take_indices = np.random.randint(0, len(df), ntake) + take_indices = rng.integers(0, len(df), ntake) actual = df.take(take_indices) expected = df.to_pandas().take(take_indices) @@ -733,7 +735,7 @@ def test_dataframe_take(ntake): @pytest.mark.parametrize("ntake", [1, 2, 8, 9]) def test_dataframe_take_with_multiindex(ntake): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = cudf.DataFrame( index=cudf.MultiIndex( levels=[["lama", "cow", "falcon"], ["speed", "weight", "length"]], @@ -742,10 +744,10 @@ def test_dataframe_take_with_multiindex(ntake): ) nelem = 9 - df["ii"] = np.random.randint(0, 20, nelem) - df["ff"] = np.random.random(nelem) + df["ii"] = rng.integers(0, 20, nelem) + df["ff"] = rng.random(nelem) - take_indices = np.random.randint(0, len(df), ntake) + take_indices = rng.integers(0, len(df), ntake) actual = df.take(take_indices) expected = df.to_pandas().take(take_indices) @@ -755,13 +757,13 @@ def test_dataframe_take_with_multiindex(ntake): @pytest.mark.parametrize("ntake", [0, 1, 10, 123, 122, 200]) def test_series_take(ntake): - np.random.seed(0) + rng = np.random.default_rng(seed=0) nelem = 123 - psr = pd.Series(np.random.randint(0, 20, nelem)) + psr = pd.Series(rng.integers(0, 20, nelem)) gsr = cudf.Series(psr) - take_indices = np.random.randint(0, len(gsr), ntake) + take_indices = rng.integers(0, len(gsr), ntake) actual = gsr.take(take_indices) expected = psr.take(take_indices) @@ -841,14 +843,15 @@ def test_empty_boolean_mask(dtype): ) @pytest.mark.parametrize("nulls", ["one", "some", "all", "none"]) def test_series_apply_boolean_mask(data, mask, nulls): + rng = np.random.default_rng(seed=0) psr = pd.Series(data) if len(data) > 0: if nulls == "one": - p = np.random.randint(0, 4) + p = rng.integers(0, 4) psr[p] = None elif nulls == "some": - p1, p2 = np.random.randint(0, 4, (2,)) + p1, p2 = rng.integers(0, 4, (2,)) psr[p1] = None psr[p2] = None elif nulls == "all": @@ -1810,13 +1813,14 @@ def test_boolean_mask_columns_iloc_series(): @pytest.mark.parametrize("index_type", ["single", "slice"]) def test_loc_timestamp_issue_8585(index_type): + rng = np.random.default_rng(seed=0) # https://github.com/rapidsai/cudf/issues/8585 start = pd.Timestamp( datetime.strptime("2021-03-12 00:00", "%Y-%m-%d %H:%M") ) end = pd.Timestamp(datetime.strptime("2021-03-12 11:00", "%Y-%m-%d %H:%M")) timestamps = pd.date_range(start, end, periods=12) - value = np.random.normal(size=12) + value = rng.normal(size=12) df = pd.DataFrame(value, index=timestamps, columns=["value"]) cdf = cudf.from_pandas(df) if index_type == "single": @@ -1851,6 +1855,7 @@ def test_loc_timestamp_issue_8585(index_type): ], ) def test_loc_multiindex_timestamp_issue_8585(index_type): + rng = np.random.default_rng(seed=0) # https://github.com/rapidsai/cudf/issues/8585 start = pd.Timestamp( datetime.strptime("2021-03-12 00:00", "%Y-%m-%d %H:%M") @@ -1861,7 +1866,7 @@ def test_loc_multiindex_timestamp_issue_8585(index_type): index = pd.MultiIndex.from_product( [timestamps, labels], names=["timestamp", "label"] ) - value = 
np.random.normal(size=12) + value = rng.normal(size=12) df = pd.DataFrame(value, index=index, columns=["value"]) cdf = cudf.from_pandas(df) start = pd.Timestamp( diff --git a/python/cudf/cudf/tests/test_joining.py b/python/cudf/cudf/tests/test_joining.py index b1ce69e58ef..f6941ce7fae 100644 --- a/python/cudf/cudf/tests/test_joining.py +++ b/python/cudf/cudf/tests/test_joining.py @@ -22,7 +22,7 @@ def make_params(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) hows = _JOIN_TYPES @@ -39,14 +39,14 @@ def make_params(): yield (aa, bb, how) # Test large random integer inputs - aa = np.random.randint(0, 50, 100) - bb = np.random.randint(0, 50, 100) + aa = rng.integers(0, 50, 100) + bb = rng.integers(0, 50, 100) for how in hows: yield (aa, bb, how) # Test floating point inputs - aa = np.random.random(50) - bb = np.random.random(50) + aa = rng.random(50) + bb = rng.random(50) for how in hows: yield (aa, bb, how) @@ -162,9 +162,9 @@ def _check_series(expect, got): reason="bug in older version of pandas", ) def test_dataframe_join_suffix(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) - df = cudf.DataFrame(np.random.randint(0, 5, (5, 3)), columns=list("abc")) + df = cudf.DataFrame(rng.integers(0, 5, (5, 3)), columns=list("abc")) left = df.set_index("a") right = df.set_index("c") @@ -281,19 +281,19 @@ def test_dataframe_join_mismatch_cats(how): @pytest.mark.parametrize("on", ["key1", ["key1", "key2"], None]) def test_dataframe_merge_on(on): - np.random.seed(0) + rng = np.random.default_rng(seed=0) # Make cuDF df_left = cudf.DataFrame() nelem = 500 - df_left["key1"] = np.random.randint(0, 40, nelem) - df_left["key2"] = np.random.randint(0, 50, nelem) + df_left["key1"] = rng.integers(0, 40, nelem) + df_left["key2"] = rng.integers(0, 50, nelem) df_left["left_val"] = np.arange(nelem) df_right = cudf.DataFrame() nelem = 500 - df_right["key1"] = np.random.randint(0, 30, nelem) - df_right["key2"] = np.random.randint(0, 50, nelem) + df_right["key1"] = rng.integers(0, 30, nelem) + df_right["key2"] = rng.integers(0, 50, nelem) df_right["right_val"] = np.arange(nelem) # Make pandas DF @@ -347,19 +347,19 @@ def test_dataframe_merge_on(on): def test_dataframe_merge_on_unknown_column(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) # Make cuDF df_left = cudf.DataFrame() nelem = 500 - df_left["key1"] = np.random.randint(0, 40, nelem) - df_left["key2"] = np.random.randint(0, 50, nelem) + df_left["key1"] = rng.integers(0, 40, nelem) + df_left["key2"] = rng.integers(0, 50, nelem) df_left["left_val"] = np.arange(nelem) df_right = cudf.DataFrame() nelem = 500 - df_right["key1"] = np.random.randint(0, 30, nelem) - df_right["key2"] = np.random.randint(0, 50, nelem) + df_right["key1"] = rng.integers(0, 30, nelem) + df_right["key2"] = rng.integers(0, 50, nelem) df_right["right_val"] = np.arange(nelem) with pytest.raises(KeyError) as raises: @@ -368,19 +368,19 @@ def test_dataframe_merge_on_unknown_column(): def test_dataframe_merge_no_common_column(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) # Make cuDF df_left = cudf.DataFrame() nelem = 500 - df_left["key1"] = np.random.randint(0, 40, nelem) - df_left["key2"] = np.random.randint(0, 50, nelem) + df_left["key1"] = rng.integers(0, 40, nelem) + df_left["key2"] = rng.integers(0, 50, nelem) df_left["left_val"] = np.arange(nelem) df_right = cudf.DataFrame() nelem = 500 - df_right["key3"] = np.random.randint(0, 30, nelem) - df_right["key4"] = np.random.randint(0, 50, nelem) + df_right["key3"] = rng.integers(0, 30, nelem) + 
df_right["key4"] = rng.integers(0, 50, nelem) df_right["right_val"] = np.arange(nelem) with pytest.raises(ValueError) as raises: @@ -460,14 +460,14 @@ def test_dataframe_merge_order(): @pytest.mark.parametrize("rows", [1, 5, 100]) @pytest.mark.parametrize("how", ["left", "inner", "outer"]) def test_dataframe_pairs_of_triples(pairs, max, rows, how): - np.random.seed(0) + rng = np.random.default_rng(seed=0) pdf_left = pd.DataFrame() pdf_right = pd.DataFrame() for left_column in pairs[0]: - pdf_left[left_column] = np.random.randint(0, max, rows) + pdf_left[left_column] = rng.integers(0, max, rows) for right_column in pairs[1]: - pdf_right[right_column] = np.random.randint(0, max, rows) + pdf_right[right_column] = rng.integers(0, max, rows) gdf_left = cudf.from_pandas(pdf_left) gdf_right = cudf.from_pandas(pdf_right) if not set(pdf_left.columns).intersection(pdf_right.columns): @@ -504,15 +504,15 @@ def test_dataframe_pairs_of_triples(pairs, max, rows, how): def test_safe_merging_with_left_empty(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) pairs = ("bcd", "b") pdf_left = pd.DataFrame() pdf_right = pd.DataFrame() for left_column in pairs[0]: - pdf_left[left_column] = np.random.randint(0, 10, 0) + pdf_left[left_column] = rng.integers(0, 10, 0) for right_column in pairs[1]: - pdf_right[right_column] = np.random.randint(0, 10, 5) + pdf_right[right_column] = rng.integers(0, 10, 5) gdf_left = cudf.from_pandas(pdf_left) gdf_right = cudf.from_pandas(pdf_right) diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py index c81c2d1d94b..47976fc4bac 100644 --- a/python/cudf/cudf/tests/test_json.py +++ b/python/cudf/cudf/tests/test_json.py @@ -32,13 +32,14 @@ def make_numeric_dataframe(nrows, dtype): @pytest.fixture(params=[0, 1, 10, 100]) def pdf(request): + rng = np.random.default_rng(seed=0) types = NUMERIC_TYPES + DATETIME_TYPES + ["bool"] nrows = request.param # Create a pandas dataframe with random data of mixed types test_pdf = pd.DataFrame( { - f"col_{typ}": np.random.randint(0, nrows, nrows).astype(typ) + f"col_{typ}": rng.integers(0, nrows, nrows).astype(typ) for typ in types } ) diff --git a/python/cudf/cudf/tests/test_monotonic.py b/python/cudf/cudf/tests/test_monotonic.py index 790e84559a9..a34c89f55d3 100644 --- a/python/cudf/cudf/tests/test_monotonic.py +++ b/python/cudf/cudf/tests/test_monotonic.py @@ -164,7 +164,8 @@ def test_series(testlist): def test_multiindex(): - pdf = pd.DataFrame(np.random.rand(7, 5)) + rng = np.random.default_rng(seed=0) + pdf = pd.DataFrame(rng.random(size=(7, 5))) pdf.index = pd.MultiIndex( [ ["a", "b", "c"], diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py index c41be3e4428..ad0e0858c43 100644 --- a/python/cudf/cudf/tests/test_multiindex.py +++ b/python/cudf/cudf/tests/test_multiindex.py @@ -153,7 +153,8 @@ def test_multiindex_swaplevel(): def test_string_index(): - pdf = pd.DataFrame(np.random.rand(5, 5)) + rng = np.random.default_rng(seed=0) + pdf = pd.DataFrame(rng.random(size=(5, 5))) gdf = cudf.from_pandas(pdf) stringIndex = ["a", "b", "c", "d", "e"] pdf.index = stringIndex @@ -176,7 +177,8 @@ def test_string_index(): def test_multiindex_row_shape(): - pdf = pd.DataFrame(np.random.rand(0, 5)) + rng = np.random.default_rng(seed=0) + pdf = pd.DataFrame(rng.random(size=(0, 5))) gdf = cudf.from_pandas(pdf) pdfIndex = pd.MultiIndex([["a", "b", "c"]], [[0]]) pdfIndex.names = ["alpha"] @@ -193,7 +195,8 @@ def test_multiindex_row_shape(): @pytest.fixture def pdf(): - 
return pd.DataFrame(np.random.rand(7, 5)) + rng = np.random.default_rng(seed=0) + return pd.DataFrame(rng.random(size=(7, 5))) @pytest.fixture @@ -271,7 +274,8 @@ def test_from_pandas_series(): def test_series_multiindex(pdfIndex): - ps = pd.Series(np.random.rand(7)) + rng = np.random.default_rng(seed=0) + ps = pd.Series(rng.random(7)) gs = cudf.from_pandas(ps) ps.index = pdfIndex gs.index = cudf.from_pandas(pdfIndex) @@ -439,7 +443,8 @@ def test_multiindex_loc_rows_1_1_key(pdf, gdf, pdfIndex): def test_multiindex_column_shape(): - pdf = pd.DataFrame(np.random.rand(5, 0)) + rng = np.random.default_rng(seed=0) + pdf = pd.DataFrame(rng.random(size=(5, 0))) gdf = cudf.from_pandas(pdf) pdfIndex = pd.MultiIndex([["a", "b", "c"]], [[0]]) pdfIndex.names = ["alpha"] @@ -522,9 +527,13 @@ def test_multiindex_from_product(arrays): def test_multiindex_index_and_columns(): - gdf = cudf.DataFrame() - gdf["x"] = np.random.randint(0, 5, 5) - gdf["y"] = np.random.randint(0, 5, 5) + rng = np.random.default_rng(seed=0) + gdf = cudf.DataFrame( + { + "x": rng.integers(0, 5, 5), + "y": rng.integers(0, 5, 5), + } + ) pdf = gdf.to_pandas() mi = cudf.MultiIndex( levels=[[0, 1, 2], [3, 4]], @@ -542,11 +551,12 @@ def test_multiindex_index_and_columns(): def test_multiindex_multiple_groupby(): + rng = np.random.default_rng(seed=0) pdf = pd.DataFrame( { "a": [4, 17, 4, 9, 5], "b": [1, 4, 4, 3, 2], - "x": np.random.normal(size=5), + "x": rng.normal(size=5), } ) gdf = cudf.DataFrame.from_pandas(pdf) @@ -566,11 +576,12 @@ def test_multiindex_multiple_groupby(): ], ) def test_multi_column(func): + rng = np.random.default_rng(seed=0) pdf = pd.DataFrame( { - "x": np.random.randint(0, 5, size=1000), - "y": np.random.randint(0, 10, size=1000), - "z": np.random.normal(size=1000), + "x": rng.integers(0, 5, size=1000), + "y": rng.integers(0, 10, size=1000), + "z": rng.normal(size=1000), } ) gdf = cudf.DataFrame.from_pandas(pdf) diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py index 1dd732c7191..41c1c3ccb20 100644 --- a/python/cudf/cudf/tests/test_orc.py +++ b/python/cudf/cudf/tests/test_orc.py @@ -681,7 +681,6 @@ def test_orc_write_statistics(tmpdir, datadir, nrows, stats_freq): def test_orc_chunked_write_statistics(tmpdir, datadir, nrows, stats_freq): from pyarrow import orc - np.random.seed(0) supported_stat_types = supported_numpy_dtypes + ["str"] # Writing bool columns to multiple row groups is disabled # until #6763 is fixed @@ -704,6 +703,7 @@ def test_orc_chunked_write_statistics(tmpdir, datadir, nrows, stats_freq): has_nulls=True, low=0, high=max_char_length, + seed=0, ) for dtype in supported_stat_types } @@ -845,7 +845,6 @@ def test_orc_reader_gmt_timestamps(datadir): def test_orc_bool_encode_fail(): - np.random.seed(0) buffer = BytesIO() # Generate a boolean column longer than a single row group @@ -927,7 +926,6 @@ def test_empty_string_columns(data): [cudf.Decimal32Dtype, cudf.Decimal64Dtype, cudf.Decimal128Dtype], ) def test_orc_writer_decimal(tmpdir, scale, decimal_type): - np.random.seed(0) fname = tmpdir / "decimal.orc" expected = cudf.DataFrame({"dec_val": gen_rand_series("i", 100)}) @@ -988,7 +986,7 @@ def test_orc_string_stream_offset_issue(): def generate_list_struct_buff(size=100_000): rd = random.Random(1) - np.random.seed(seed=1) + rng = np.random.default_rng(seed=1) buff = BytesIO() @@ -999,12 +997,12 @@ def generate_list_struct_buff(size=100_000): [ [ [ - rd.choice([None, np.random.randint(1, 3)]) - for _ in range(np.random.randint(1, 3)) + rd.choice([None, 
rng.integers(1, 3)]) + for _ in range(rng.integers(1, 3)) ] - for _ in range(np.random.randint(0, 3)) + for _ in range(rng.integers(0, 3)) ] - for _ in range(np.random.randint(0, 3)) + for _ in range(rng.integers(0, 3)) ], ] ) @@ -1012,8 +1010,8 @@ def generate_list_struct_buff(size=100_000): ] lvl1_list = [ [ - rd.choice([None, np.random.randint(0, 3)]) - for _ in range(np.random.randint(1, 4)) + rd.choice([None, rng.integers(0, 3)]) + for _ in range(rng.integers(1, 4)) ] for _ in range(size) ] @@ -1021,7 +1019,7 @@ def generate_list_struct_buff(size=100_000): rd.choice( [ None, - {"a": np.random.randint(0, 3), "b": np.random.randint(0, 3)}, + {"a": rng.integers(0, 3), "b": rng.integers(0, 3)}, ] ) for _ in range(size) @@ -1030,11 +1028,11 @@ def generate_list_struct_buff(size=100_000): rd.choice( [ None, - {"a": rd.choice([None, np.random.randint(0, 3)])}, + {"a": rd.choice([None, rng.integers(0, 3)])}, { "lvl1_struct": { - "c": rd.choice([None, np.random.randint(0, 3)]), - "d": np.random.randint(0, 3), + "c": rd.choice([None, rng.integers(0, 3)]), + "d": rng.integers(0, 3), }, }, ] @@ -1044,7 +1042,7 @@ def generate_list_struct_buff(size=100_000): list_nests_struct = [ [ {"a": rd.choice(lvl1_struct), "b": rd.choice(lvl1_struct)} - for _ in range(np.random.randint(1, 4)) + for _ in range(rng.integers(1, 4)) ] for _ in range(size) ] @@ -1135,7 +1133,7 @@ def gen_map_buff(size): from pyarrow import orc rd = random.Random(1) - np.random.seed(seed=1) + rng = np.random.default_rng(seed=1) buff = BytesIO() @@ -1146,7 +1144,7 @@ def gen_map_buff(size): None, { rd.choice(al): rd.choice( - [None, np.random.randint(1, 1500)] + [None, rng.integers(1, 1500)] ), }, ] @@ -1167,7 +1165,7 @@ def gen_map_buff(size): None, [ rd.choice( - [None, np.random.randint(1, 1500)] + [None, rng.integers(1, 1500)] ) for _ in range(5) ], @@ -1194,10 +1192,10 @@ def gen_map_buff(size): None, { "a": rd.choice( - [None, np.random.randint(1, 1500)] + [None, rng.integers(1, 1500)] ), "b": rd.choice( - [None, np.random.randint(1, 1500)] + [None, rng.integers(1, 1500)] ), }, ] diff --git a/python/cudf/cudf/tests/test_pack.py b/python/cudf/cudf/tests/test_pack.py index ad78621c5fa..b474bbe9bd8 100644 --- a/python/cudf/cudf/tests/test_pack.py +++ b/python/cudf/cudf/tests/test_pack.py @@ -24,11 +24,11 @@ def test_sizeof_packed_dataframe(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = DataFrame() nelem = 1000 df["keys"] = hkeys = np.arange(nelem, dtype=np.float64) - df["vals"] = hvals = np.random.random(nelem) + df["vals"] = hvals = rng.random(nelem) packed = pack(df) nbytes = hkeys.nbytes + hvals.nbytes @@ -67,46 +67,46 @@ def assert_packed_frame_equality(df): def test_packed_dataframe_equality_numeric(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = DataFrame() nelem = 10 df["keys"] = np.arange(nelem, dtype=np.float64) - df["vals"] = np.random.random(nelem) + df["vals"] = rng.random(nelem) check_packed_equality(df) def test_packed_dataframe_equality_categorical(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = DataFrame() df["keys"] = pd.Categorical( ["a", "a", "a", "b", "a", "b", "a", "b", "a", "c"] ) - df["vals"] = np.random.random(len(df)) + df["vals"] = rng.random(len(df)) check_packed_equality(df) def test_packed_dataframe_equality_list(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = DataFrame() df["keys"] = Series(list([i, i + 1, i + 2] for i in range(10))) - df["vals"] = np.random.random(len(df)) + df["vals"] = rng.random(len(df)) 
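+    # Editorial sketch of the seeding convention applied throughout this PR:
+    # the legacy global-state calls map onto numpy's Generator API roughly as
+    #   np.random.seed(s)         -> rng = np.random.default_rng(seed=s)
+    #   np.random.random(n)       -> rng.random(n)
+    #   np.random.randint(lo, hi) -> rng.integers(lo, hi)
+    #   np.random.randn(*shape)   -> rng.standard_normal(size=shape)
+    #   np.random.normal(...)     -> rng.normal(...)
+    #   np.random.choice(...)     -> rng.choice(...)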
check_packed_equality(df) def test_packed_dataframe_equality_struct(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = DataFrame() df["keys"] = Series( list({"0": i, "1": i + 1, "2": i + 2} for i in range(10)) ) - df["vals"] = np.random.random(len(df)) + df["vals"] = rng.random(len(df)) check_packed_equality(df) @@ -135,46 +135,46 @@ def assert_packed_frame_unique_pointers(df): def test_packed_dataframe_unique_pointers_numeric(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = DataFrame() nelem = 10 df["keys"] = np.arange(nelem, dtype=np.float64) - df["vals"] = np.random.random(nelem) + df["vals"] = rng.random(nelem) check_packed_unique_pointers(df) def test_packed_dataframe_unique_pointers_categorical(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = DataFrame() df["keys"] = pd.Categorical( ["a", "a", "a", "b", "a", "b", "a", "b", "a", "c"] ) - df["vals"] = np.random.random(len(df)) + df["vals"] = rng.random(len(df)) check_packed_unique_pointers(df) def test_packed_dataframe_unique_pointers_list(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = DataFrame() df["keys"] = Series(list([i, i + 1, i + 2] for i in range(10))) - df["vals"] = np.random.random(len(df)) + df["vals"] = rng.random(len(df)) check_packed_unique_pointers(df) def test_packed_dataframe_unique_pointers_struct(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = DataFrame() df["keys"] = Series( list({"0": i, "1": i + 1, "2": i + 2} for i in range(10)) ) - df["vals"] = np.random.random(len(df)) + df["vals"] = rng.random(len(df)) check_packed_unique_pointers(df) @@ -208,46 +208,46 @@ def assert_packed_frame_picklable(df): def test_pickle_packed_dataframe_numeric(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = DataFrame() nelem = 10 df["keys"] = np.arange(nelem, dtype=np.float64) - df["vals"] = np.random.random(nelem) + df["vals"] = rng.random(nelem) check_packed_pickled_equality(df) def test_pickle_packed_dataframe_categorical(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = DataFrame() df["keys"] = pd.Categorical( ["a", "a", "a", "b", "a", "b", "a", "b", "a", "c"] ) - df["vals"] = np.random.random(len(df)) + df["vals"] = rng.random(len(df)) check_packed_pickled_equality(df) def test_pickle_packed_dataframe_list(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = DataFrame() df["keys"] = Series(list([i, i + 1, i + 2] for i in range(10))) - df["vals"] = np.random.random(len(df)) + df["vals"] = rng.random(len(df)) check_packed_pickled_equality(df) def test_pickle_packed_dataframe_struct(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = DataFrame() df["keys"] = Series( list({"0": i, "1": i + 1, "2": i + 2} for i in range(10)) ) - df["vals"] = np.random.random(len(df)) + df["vals"] = rng.random(len(df)) check_packed_pickled_equality(df) @@ -273,45 +273,45 @@ def assert_packed_frame_serializable(df): def test_serialize_packed_dataframe_numeric(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = DataFrame() nelem = 10 df["keys"] = np.arange(nelem, dtype=np.float64) - df["vals"] = np.random.random(nelem) + df["vals"] = rng.random(nelem) check_packed_serialized_equality(df) def test_serialize_packed_dataframe_categorical(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = DataFrame() df["keys"] = pd.Categorical( ["a", "a", "a", "b", "a", "b", "a", "b", "a", "c"] ) - df["vals"] = np.random.random(len(df)) + df["vals"] = rng.random(len(df)) 
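+    # (Hedged note: check_packed_serialized_equality and the
+    # assert_packed_frame_serializable helper above are assumed to pack df,
+    # round-trip the packed columns through serialize/deserialize, and
+    # assert equality with the original frame.)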
check_packed_serialized_equality(df) def test_serialize_packed_dataframe_list(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = DataFrame() df["keys"] = Series(list([i, i + 1, i + 2] for i in range(10))) - df["vals"] = np.random.random(len(df)) + df["vals"] = rng.random(len(df)) check_packed_serialized_equality(df) def test_serialize_packed_dataframe_struct(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = DataFrame() df["keys"] = Series( list({"0": i, "1": i + 1, "2": i + 2} for i in range(10)) ) - df["vals"] = np.random.random(len(df)) + df["vals"] = rng.random(len(df)) check_packed_serialized_equality(df) diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 7f1b0b1cd46..659d2ebd89a 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -53,6 +53,7 @@ def datadir(datadir): @pytest.fixture(params=[1, 5, 10, 100000]) def simple_pdf(request): + rng = np.random.default_rng(seed=0) types = [ "bool", "int8", @@ -72,7 +73,7 @@ def simple_pdf(request): # Create a pandas dataframe with random data of mixed types test_pdf = pd.DataFrame( { - f"col_{typ}": np.random.randint(0, nrows, nrows).astype(typ) + f"col_{typ}": rng.integers(0, nrows, nrows).astype(typ) for typ in types }, # Need to ensure that this index is not a RangeIndex to get the @@ -92,6 +93,7 @@ def simple_gdf(simple_pdf): def build_pdf(num_columns, day_resolution_timestamps): + rng = np.random.default_rng(seed=0) types = [ "bool", "int8", @@ -114,7 +116,7 @@ def build_pdf(num_columns, day_resolution_timestamps): # Create a pandas dataframe with random data of mixed types test_pdf = pd.DataFrame( { - f"col_{typ}": np.random.randint(0, nrows, nrows).astype(typ) + f"col_{typ}": rng.integers(0, nrows, nrows).astype(typ) for typ in types }, # Need to ensure that this index is not a RangeIndex to get the @@ -142,7 +144,7 @@ def build_pdf(num_columns, day_resolution_timestamps): }, ]: data = [ - np.random.randint(0, (0x7FFFFFFFFFFFFFFF / t["nsDivisor"])) + rng.integers(0, (0x7FFFFFFFFFFFFFFF / t["nsDivisor"])) for i in range(nrows) ] if day_resolution_timestamps: @@ -152,11 +154,11 @@ def build_pdf(num_columns, day_resolution_timestamps): ) # Create non-numeric categorical data otherwise parquet may typecast it - data = [ascii_letters[np.random.randint(0, 52)] for i in range(nrows)] + data = [ascii_letters[rng.integers(0, 52)] for i in range(nrows)] test_pdf["col_category"] = pd.Series(data, dtype="category") # Create non-numeric str data - data = [ascii_letters[np.random.randint(0, 52)] for i in range(nrows)] + data = [ascii_letters[rng.integers(0, 52)] for i in range(nrows)] test_pdf["col_str"] = pd.Series(data, dtype="str") return test_pdf @@ -191,11 +193,6 @@ def parquet_file(request, tmp_path_factory, pdf): return fname -@pytest.fixture(scope="module") -def rdg_seed(): - return int(os.environ.get("TEST_CUDF_RDG_SEED", "42")) - - def make_pdf(nrows, ncolumns=1, nvalids=0, dtype=np.int64): test_pdf = pd.DataFrame( [list(range(ncolumns * i, ncolumns * (i + 1))) for i in range(nrows)], @@ -403,14 +400,14 @@ def test_parquet_range_index_pandas_metadata(tmpdir, pandas_compat, as_bytes): assert_eq(expect, got) -def test_parquet_read_metadata(tmpdir, pdf): +def test_parquet_read_metadata(tmp_path, pdf): if len(pdf) > 100: pytest.skip("Skipping long setup test") def num_row_groups(rows, group_size): return max(1, (rows + (group_size - 1)) // group_size) - fname = tmpdir.join("metadata.parquet") + fname = tmp_path / 
"metadata.parquet" row_group_size = 5 pdf.to_parquet(fname, compression="snappy", row_group_size=row_group_size) @@ -429,7 +426,7 @@ def num_row_groups(rows, group_size): assert a == b -def test_parquet_read_filtered(tmpdir, rdg_seed): +def test_parquet_read_filtered(tmpdir): # Generate data fname = tmpdir.join("filtered.parquet") dg.generate( @@ -453,11 +450,13 @@ def test_parquet_read_filtered(tmpdir, rdg_seed): dg.ColumnParameters( 40, 0.2, - lambda: np.random.default_rng().integers(0, 100, size=40), + lambda: np.random.default_rng(seed=0).integers( + 0, 100, size=40 + ), True, ), ], - seed=rdg_seed, + seed=42, ), format={"name": "parquet", "row_group_size": 64}, ) @@ -1909,6 +1908,7 @@ def test_parquet_writer_dictionary_setting(use_dict, max_dict_size): @pytest.mark.parametrize("filename", ["myfile.parquet", None]) @pytest.mark.parametrize("cols", [["b"], ["c", "b"]]) def test_parquet_partitioned(tmpdir_factory, cols, filename): + rng = np.random.default_rng(seed=0) # Checks that write_to_dataset is wrapping to_parquet # as expected gdf_dir = str(tmpdir_factory.mktemp("gdf_dir")) @@ -1917,8 +1917,8 @@ def test_parquet_partitioned(tmpdir_factory, cols, filename): pdf = pd.DataFrame( { "a": np.arange(0, stop=size, dtype="int64"), - "b": np.random.choice(list("abcd"), size=size), - "c": np.random.choice(np.arange(4), size=size), + "b": rng.choice(list("abcd"), size=size), + "c": rng.choice(np.arange(4), size=size), } ) pdf.to_parquet(pdf_dir, index=False, partition_cols=cols) @@ -1954,6 +1954,7 @@ def test_parquet_partitioned(tmpdir_factory, cols, filename): @pytest.mark.parametrize("kwargs", [{"nrows": 1}, {"skip_rows": 1}]) def test_parquet_partitioned_notimplemented(tmpdir_factory, kwargs): + rng = np.random.default_rng(seed=0) # Checks that write_to_dataset is wrapping to_parquet # as expected pdf_dir = str(tmpdir_factory.mktemp("pdf_dir")) @@ -1961,8 +1962,8 @@ def test_parquet_partitioned_notimplemented(tmpdir_factory, kwargs): pdf = pd.DataFrame( { "a": np.arange(0, stop=size, dtype="int64"), - "b": np.random.choice(list("abcd"), size=size), - "c": np.random.choice(np.arange(4), size=size), + "b": rng.choice(list("abcd"), size=size), + "c": rng.choice(np.arange(4), size=size), } ) pdf.to_parquet(pdf_dir, index=False, partition_cols=["b"]) @@ -2127,6 +2128,7 @@ def test_parquet_writer_chunked_partitioned_context(tmpdir_factory): @pytest.mark.parametrize("cols", [None, ["b"]]) @pytest.mark.parametrize("store_schema", [True, False]) def test_parquet_write_to_dataset(tmpdir_factory, cols, store_schema): + rng = np.random.default_rng(seed=0) dir1 = tmpdir_factory.mktemp("dir1") dir2 = tmpdir_factory.mktemp("dir2") if cols is None: @@ -2139,7 +2141,7 @@ def test_parquet_write_to_dataset(tmpdir_factory, cols, store_schema): gdf = cudf.DataFrame( { "a": np.arange(0, stop=size), - "b": np.random.choice(np.arange(4), size=size), + "b": rng.choice(np.arange(4), size=size), } ) gdf.to_parquet(dir1, partition_cols=cols, store_schema=store_schema) @@ -3214,11 +3216,12 @@ def test_parquet_nested_struct_list(): def test_parquet_writer_zstd(): size = 12345 + rng = np.random.default_rng(seed=0) expected = cudf.DataFrame( { "a": np.arange(0, stop=size, dtype="float64"), - "b": np.random.choice(list("abcd"), size=size), - "c": np.random.choice(np.arange(4), size=size), + "b": rng.choice(list("abcd"), size=size), + "c": rng.choice(np.arange(4), size=size), } ) @@ -3768,10 +3771,10 @@ def test_parquet_chunked_reader( chunk_read_limit, pass_read_limit, use_pandas_metadata, row_groups ): df = 
pd.DataFrame( - {"a": [1, 2, 3, 4] * 1000000, "b": ["av", "qw", "hi", "xyz"] * 1000000} + {"a": [1, 2, 3, None] * 10000, "b": ["av", "qw", None, "xyz"] * 10000} ) buffer = BytesIO() - df.to_parquet(buffer) + df.to_parquet(buffer, row_group_size=10000) actual = read_parquet_chunked( [buffer], chunk_read_limit=chunk_read_limit, @@ -3785,6 +3788,108 @@ def test_parquet_chunked_reader( assert_eq(expected, actual) +@pytest.mark.parametrize("chunk_read_limit", [0, 240, 1024000000]) +@pytest.mark.parametrize("pass_read_limit", [0, 240, 1024000000]) +@pytest.mark.parametrize("num_rows", [997, 2997, None]) +def test_parquet_chunked_reader_structs( + chunk_read_limit, + pass_read_limit, + num_rows, +): + data = [ + { + "a": "g", + "b": { + "b_a": 10, + "b_b": {"b_b_b": None, "b_b_a": 2}, + }, + "c": None, + }, + {"a": None, "b": {"b_a": None, "b_b": None}, "c": [15, 16]}, + {"a": "j", "b": None, "c": [8, 10]}, + {"a": None, "b": {"b_a": None, "b_b": None}, "c": None}, + None, + { + "a": None, + "b": {"b_a": None, "b_b": {"b_b_b": 1}}, + "c": [18, 19], + }, + {"a": None, "b": None, "c": None}, + ] * 1000 + + pa_struct = pa.Table.from_pydict({"struct": data}) + df = cudf.DataFrame.from_arrow(pa_struct) + buffer = BytesIO() + df.to_parquet(buffer) + + # Number of rows to read + nrows = num_rows if num_rows is not None else len(df) + + actual = read_parquet_chunked( + [buffer], + chunk_read_limit=chunk_read_limit, + pass_read_limit=pass_read_limit, + nrows=nrows, + ) + expected = cudf.read_parquet( + buffer, + nrows=nrows, + ) + assert_eq(expected, actual) + + +@pytest.mark.parametrize("chunk_read_limit", [0, 240, 1024000000]) +@pytest.mark.parametrize("pass_read_limit", [0, 240, 1024000000]) +@pytest.mark.parametrize("num_rows", [4997, 9997, None]) +@pytest.mark.parametrize( + "str_encoding", + [ + "PLAIN", + "DELTA_BYTE_ARRAY", + "DELTA_LENGTH_BYTE_ARRAY", + ], +) +def test_parquet_chunked_reader_string_decoders( + chunk_read_limit, + pass_read_limit, + num_rows, + str_encoding, +): + df = pd.DataFrame( + { + "i64": [1, 2, 3, None] * 10000, + "str": ["av", "qw", "asd", "xyz"] * 10000, + "list": list( + [["ad", "cd"], ["asd", "fd"], None, ["asd", None]] * 10000 + ), + } + ) + buffer = BytesIO() + # Write 4 Parquet row groups with string column encoded + df.to_parquet( + buffer, + row_group_size=10000, + use_dictionary=False, + column_encoding={"str": str_encoding}, + ) + + # Number of rows to read + nrows = num_rows if num_rows is not None else len(df) + + # Check with num_rows specified + actual = read_parquet_chunked( + [buffer], + chunk_read_limit=chunk_read_limit, + pass_read_limit=pass_read_limit, + nrows=nrows, + ) + expected = cudf.read_parquet( + buffer, + nrows=nrows, + ) + assert_eq(expected, actual) + + @pytest.mark.parametrize( "nrows,skip_rows", [ diff --git a/python/cudf/cudf/tests/test_pickling.py b/python/cudf/cudf/tests/test_pickling.py index 0f13a9e173a..2f10a5dfd74 100644 --- a/python/cudf/cudf/tests/test_pickling.py +++ b/python/cudf/cudf/tests/test_pickling.py @@ -40,33 +40,33 @@ def assert_frame_picklable(df): def test_pickle_dataframe_numeric(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = DataFrame() nelem = 10 df["keys"] = np.arange(nelem, dtype=np.float64) - df["vals"] = np.random.random(nelem) + df["vals"] = rng.random(nelem) check_serialization(df) def test_pickle_dataframe_categorical(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = DataFrame() df["keys"] = pd.Categorical( ["a", "a", "a", "b", "a", "b", "a", "b", "a", "c"] ) - df["vals"] = 
np.random.random(len(df)) + df["vals"] = rng.random(len(df)) check_serialization(df) def test_memory_usage_dataframe(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = DataFrame() nelem = 1000 df["keys"] = hkeys = np.arange(nelem, dtype=np.float64) - df["vals"] = hvals = np.random.random(nelem) + df["vals"] = hvals = rng.random(nelem) nbytes = hkeys.nbytes + hvals.nbytes sizeof = df.memory_usage().sum() @@ -98,11 +98,11 @@ def test_pickle_buffer(): @pytest.mark.parametrize("named", [True, False]) def test_pickle_series(named): - np.random.seed(0) + rng = np.random.default_rng(seed=0) if named: - ser = Series(np.random.random(10), name="a") + ser = Series(rng.random(10), name="a") else: - ser = Series(np.random.random(10)) + ser = Series(rng.random(10)) pickled = pickle.dumps(ser) out = pickle.loads(pickled) diff --git a/python/cudf/cudf/tests/test_query.py b/python/cudf/cudf/tests/test_query.py index b12209fd3b9..7685d09203e 100644 --- a/python/cudf/cudf/tests/test_query.py +++ b/python/cudf/cudf/tests/test_query.py @@ -45,10 +45,10 @@ def test_query(data, fn, nulls): # prepare nelem, seed = data expect_fn, query_expr = fn - np.random.seed(seed) + rng = np.random.default_rng(seed=0) pdf = pd.DataFrame() pdf["a"] = np.arange(nelem) - pdf["b"] = np.random.random(nelem) * nelem + pdf["b"] = rng.random(nelem) * nelem if nulls: pdf.loc[::2, "a"] = None gdf = cudf.from_pandas(pdf) @@ -71,10 +71,10 @@ def test_query_ref_env(data, fn): # prepare nelem, seed = data expect_fn, query_expr = fn - np.random.seed(seed) + rng = np.random.default_rng(seed=0) df = DataFrame() df["a"] = aa = np.arange(nelem) - df["b"] = bb = np.random.random(nelem) * nelem + df["b"] = bb = rng.random(nelem) * nelem c = 2.3 d = 1.2 # udt @@ -121,9 +121,9 @@ def test_query_local_dict(): def test_query_splitted_combine(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = pd.DataFrame( - {"x": np.random.randint(0, 5, size=10), "y": np.random.normal(size=10)} + {"x": rng.integers(0, 5, size=10), "y": rng.normal(size=10)} ) gdf = DataFrame.from_pandas(df) diff --git a/python/cudf/cudf/tests/test_rank.py b/python/cudf/cudf/tests/test_rank.py index 4c1d8ce92ae..1d9c6690f14 100644 --- a/python/cudf/cudf/tests/test_rank.py +++ b/python/cudf/cudf/tests/test_rank.py @@ -125,32 +125,28 @@ def test_rank_error_arguments(pdf): ) -sort_group_args = [ - np.full((3,), np.nan), - 100 * np.random.random(10), - np.full((3,), np.inf), - np.full((3,), -np.inf), -] -sort_dtype_args = [np.int32, np.int64, np.float32, np.float64] - - @pytest.mark.filterwarnings("ignore:invalid value encountered in cast") @pytest.mark.parametrize( "elem,dtype", list( product( - combinations_with_replacement(sort_group_args, 4), - sort_dtype_args, + combinations_with_replacement( + [ + np.full((3,), np.nan), + 100 * np.random.default_rng(seed=0).random(10), + np.full((3,), np.inf), + np.full((3,), -np.inf), + ], + 4, + ), + [np.int32, np.int64, np.float32, np.float64], ) ), ) def test_series_rank_combinations(elem, dtype): - np.random.seed(0) aa = np.fromiter(chain.from_iterable(elem), np.float64).astype(dtype) - gdf = DataFrame() - df = pd.DataFrame() - gdf["a"] = aa - df["a"] = aa + gdf = DataFrame({"a": aa}) + df = pd.DataFrame({"a": aa}) ranked_gs = gdf["a"].rank(method="first") ranked_ps = df["a"].rank(method="first") # Check diff --git a/python/cudf/cudf/tests/test_reductions.py b/python/cudf/cudf/tests/test_reductions.py index f276f394cd0..e0bc8f32c9b 100644 --- a/python/cudf/cudf/tests/test_reductions.py +++ 
b/python/cudf/cudf/tests/test_reductions.py @@ -62,8 +62,7 @@ def test_sum_string(): ) @pytest.mark.parametrize("nelem", params_sizes) def test_sum_decimal(dtype, nelem): - np.random.seed(0) - data = [str(x) for x in gen_rand("int64", nelem) / 100] + data = [str(x) for x in gen_rand("int64", nelem, seed=0) / 100] expected = pd.Series([Decimal(x) for x in data]).sum() got = cudf.Series(data).astype(dtype).sum() @@ -73,15 +72,13 @@ def test_sum_decimal(dtype, nelem): @pytest.mark.parametrize("dtype,nelem", params) def test_product(dtype, nelem): - np.random.seed(0) + rng = np.random.default_rng(seed=0) dtype = cudf.dtype(dtype).type if cudf.dtype(dtype).kind in {"u", "i"}: data = np.ones(nelem, dtype=dtype) # Set at most 30 items to [0..2) to keep the value within 2^32 for _ in range(30): - data[np.random.randint(low=0, high=nelem, size=1)] = ( - np.random.uniform() * 2 - ) + data[rng.integers(low=0, high=nelem, size=1)] = rng.uniform() * 2 else: data = gen_rand(dtype, nelem) @@ -104,7 +101,6 @@ def test_product(dtype, nelem): ], ) def test_product_decimal(dtype): - np.random.seed(0) data = [str(x) for x in gen_rand("int8", 3) / 10] expected = pd.Series([Decimal(x) for x in data]).product() @@ -153,7 +149,6 @@ def test_sum_of_squares(dtype, nelem): ], ) def test_sum_of_squares_decimal(dtype): - np.random.seed(0) data = [str(x) for x in gen_rand("int8", 3) / 10] expected = pd.Series([Decimal(x) for x in data]).pow(2).sum() @@ -186,7 +181,6 @@ def test_min(dtype, nelem): ) @pytest.mark.parametrize("nelem", params_sizes) def test_min_decimal(dtype, nelem): - np.random.seed(0) data = [str(x) for x in gen_rand("int64", nelem) / 100] expected = pd.Series([Decimal(x) for x in data]).min() @@ -219,7 +213,6 @@ def test_max(dtype, nelem): ) @pytest.mark.parametrize("nelem", params_sizes) def test_max_decimal(dtype, nelem): - np.random.seed(0) data = [str(x) for x in gen_rand("int64", nelem) / 100] expected = pd.Series([Decimal(x) for x in data]).max() @@ -256,7 +249,8 @@ def test_sum_boolean(): def test_date_minmax(): - np_data = np.random.normal(size=10**3) + rng = np.random.default_rng(seed=0) + np_data = rng.normal(size=10**3) gdf_data = Series(np_data) np_casted = np_data.astype("datetime64[ms]") diff --git a/python/cudf/cudf/tests/test_replace.py b/python/cudf/cudf/tests/test_replace.py index 3a8928297c0..d9f4ceaf3f7 100644 --- a/python/cudf/cudf/tests/test_replace.py +++ b/python/cudf/cudf/tests/test_replace.py @@ -13,6 +13,7 @@ from cudf.core._compat import ( PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_GE_220, + PANDAS_GT_214, PANDAS_VERSION, ) from cudf.core.dtypes import Decimal32Dtype, Decimal64Dtype, Decimal128Dtype @@ -116,8 +117,10 @@ def test_series_replace(): sr6 = sr1.replace([0, 1], [5, 6]) assert_eq(a6, sr6.to_numpy()) - with pytest.raises(TypeError): - sr1.replace([0, 1], [5.5, 6.5]) + assert_eq( + sr1.replace([0, 1], [5.5, 6.5]), + sr1.to_pandas().replace([0, 1], [5.5, 6.5]), + ) # Series input a8 = np.array([5, 5, 5, 3, 4]) @@ -160,8 +163,10 @@ def test_series_replace_with_nulls(): assert_eq(a6, sr6.to_numpy()) sr1 = cudf.Series([0, 1, 2, 3, 4, None]) - with pytest.raises(TypeError): - sr1.replace([0, 1], [5.5, 6.5]).fillna(-10) + assert_eq( + sr1.replace([0, 1], [5.5, 6.5]).fillna(-10), + sr1.to_pandas().replace([0, 1], [5.5, 6.5]).fillna(-10), + ) # Series input a8 = np.array([-10, -10, -10, 3, 4, -10]) @@ -967,30 +972,37 @@ def test_series_multiple_times_with_nulls(): @pytest.mark.parametrize( "replacement", [128, 128.0, 128.5, 32769, 32769.0, 32769.5] ) -def 
test_numeric_series_replace_dtype(series_dtype, replacement): +def test_numeric_series_replace_dtype(request, series_dtype, replacement): + request.applymarker( + pytest.mark.xfail( + condition=PANDAS_GT_214 + and ( + ( + series_dtype == "int8" + and replacement in {128, 128.0, 32769, 32769.0} + ) + or ( + series_dtype == "int16" and replacement in {32769, 32769.0} + ) + ), + reason="Pandas throws an AssertionError for these " + "cases and asks us to log a bug, they are trying to " + "avoid a RecursionError which cudf will not run into", + ) + ) psr = pd.Series([0, 1, 2, 3, 4, 5], dtype=series_dtype) sr = cudf.from_pandas(psr) - numpy_replacement = np.array(replacement).astype(sr.dtype)[()] - can_replace = numpy_replacement == replacement + expect = psr.replace(1, replacement) + got = sr.replace(1, replacement) - # Both Scalar - if not can_replace: - with pytest.raises(TypeError): - sr.replace(1, replacement) - else: - expect = psr.replace(1, replacement).astype(psr.dtype) - got = sr.replace(1, replacement) - assert_eq(expect, got) + assert_eq(expect, got) # to_replace is a list, replacement is a scalar - if not can_replace: - with pytest.raises(TypeError): - sr.replace([2, 3], replacement) - else: - expect = psr.replace([2, 3], replacement).astype(psr.dtype) - got = sr.replace([2, 3], replacement) - assert_eq(expect, got) + expect = psr.replace([2, 3], replacement) + got = sr.replace([2, 3], replacement) + + assert_eq(expect, got) # If to_replace is a scalar and replacement is a list with pytest.raises(TypeError): @@ -1001,17 +1013,9 @@ def test_numeric_series_replace_dtype(series_dtype, replacement): sr.replace([0, 1], [replacement]) # Both lists of equal length - if ( - np.dtype(type(replacement)).kind == "f" and sr.dtype.kind in {"i", "u"} - ) or (not can_replace): - with pytest.raises(TypeError): - sr.replace([2, 3], [replacement, replacement]) - else: - expect = psr.replace([2, 3], [replacement, replacement]).astype( - psr.dtype - ) - got = sr.replace([2, 3], [replacement, replacement]) - assert_eq(expect, got) + expect = psr.replace([2, 3], [replacement, replacement]) + got = sr.replace([2, 3], [replacement, replacement]) + assert_eq(expect, got) @pytest.mark.parametrize( @@ -1392,3 +1396,52 @@ def test_replace_with_index_objects(): result = cudf.Series([1, 2]).replace(cudf.Index([1]), cudf.Index([2])) expected = pd.Series([1, 2]).replace(pd.Index([1]), pd.Index([2])) assert_eq(result, expected) + + +# Example test function for datetime series replace +def test_replace_datetime_series(): + # Create a pandas datetime series + pd_series = pd.Series(pd.date_range("20210101", periods=5)) + # Replace a specific datetime value + pd_result = pd_series.replace( + pd.Timestamp("2021-01-02"), pd.Timestamp("2021-01-10") + ) + + # Create a cudf datetime series + cudf_series = cudf.Series(pd.date_range("20210101", periods=5)) + # Replace a specific datetime value + cudf_result = cudf_series.replace( + pd.Timestamp("2021-01-02"), pd.Timestamp("2021-01-10") + ) + + assert_eq(pd_result, cudf_result) + + +# Example test function for timedelta series replace +def test_replace_timedelta_series(): + # Create a pandas timedelta series + pd_series = pd.Series(pd.timedelta_range("1 days", periods=5)) + # Replace a specific timedelta value + pd_result = pd_series.replace( + pd.Timedelta("2 days"), pd.Timedelta("10 days") + ) + + # Create a cudf timedelta series + cudf_series = cudf.Series(pd.timedelta_range("1 days", periods=5)) + # Replace a specific timedelta value + cudf_result = cudf_series.replace( + 
pd.Timedelta("2 days"), pd.Timedelta("10 days") + ) + + assert_eq(pd_result, cudf_result) + + +def test_replace_multiple_rows(datadir): + path = datadir / "parquet" / "replace_multiple_rows.parquet" + pdf = pd.read_parquet(path) + gdf = cudf.read_parquet(path) + + pdf.replace([np.inf, -np.inf], np.nan, inplace=True) + gdf.replace([np.inf, -np.inf], np.nan, inplace=True) + + assert_eq(pdf, gdf, check_dtype=False) diff --git a/python/cudf/cudf/tests/test_repr.py b/python/cudf/cudf/tests/test_repr.py index 95e19fae501..bf0c97adb00 100644 --- a/python/cudf/cudf/tests/test_repr.py +++ b/python/cudf/cudf/tests/test_repr.py @@ -25,9 +25,10 @@ @pytest.mark.parametrize("dtype", repr_categories) @pytest.mark.parametrize("nrows", [0, 5, 10]) def test_null_series(nrows, dtype): + rng = np.random.default_rng(seed=0) size = 5 - sr = cudf.Series(np.random.randint(1, 9, size)).astype(dtype) - sr[np.random.choice([False, True], size=size)] = None + sr = cudf.Series(rng.integers(1, 9, size)).astype(dtype) + sr[rng.choice([False, True], size=size)] = None if dtype != "category" and cudf.dtype(dtype).kind in {"u", "i"}: ps = pd.Series( sr._column.data_array_view(mode="read").copy_to_host(), @@ -60,11 +61,12 @@ def test_null_series(nrows, dtype): @pytest.mark.parametrize("ncols", [1, 2, 3, 4, 5, 10]) def test_null_dataframe(ncols): + rng = np.random.default_rng(seed=0) size = 20 gdf = cudf.DataFrame() for idx, dtype in enumerate(dtype_categories): - sr = cudf.Series(np.random.randint(0, 128, size)).astype(dtype) - sr[np.random.choice([False, True], size=size)] = None + sr = cudf.Series(rng.integers(0, 128, size)).astype(dtype) + sr[rng.choice([False, True], size=size)] = None gdf[dtype] = sr pdf = gdf.to_pandas() pd.options.display.max_columns = int(ncols) @@ -77,7 +79,8 @@ def test_null_dataframe(ncols): @pytest.mark.parametrize("nrows", [None, 0, 1, 2, 9, 10, 11, 19, 20, 21]) def test_full_series(nrows, dtype): size = 20 - ps = pd.Series(np.random.randint(0, 100, size)).astype(dtype) + rng = np.random.default_rng(seed=0) + ps = pd.Series(rng.integers(0, 100, size)).astype(dtype) sr = cudf.from_pandas(ps) pd.options.display.max_rows = nrows assert repr(ps) == repr(sr) @@ -89,8 +92,9 @@ def test_full_series(nrows, dtype): @pytest.mark.parametrize("size", [20, 21]) @pytest.mark.parametrize("dtype", repr_categories) def test_full_dataframe_20(dtype, size, nrows, ncols): + rng = np.random.default_rng(seed=0) pdf = pd.DataFrame( - {idx: np.random.randint(0, 100, size) for idx in range(size)} + {idx: rng.integers(0, 100, size) for idx in range(size)} ).astype(dtype) gdf = cudf.from_pandas(pdf) @@ -178,11 +182,12 @@ def test_mixed_series(mixed_pdf, mixed_gdf): def test_MI(): + rng = np.random.default_rng(seed=0) gdf = cudf.DataFrame( { - "a": np.random.randint(0, 4, 10), - "b": np.random.randint(0, 4, 10), - "c": np.random.randint(0, 4, 10), + "a": rng.integers(0, 4, 10), + "b": rng.integers(0, 4, 10), + "c": rng.integers(0, 4, 10), } ) levels = [["a", "b", "c", "d"], ["w", "x", "y", "z"], ["m", "n"]] @@ -223,9 +228,10 @@ def test_groupby_MI(nrows, ncols): @pytest.mark.parametrize("dtype", utils.NUMERIC_TYPES) @pytest.mark.parametrize("length", [0, 1, 10, 100, 1000]) def test_generic_index(length, dtype): + rng = np.random.default_rng(seed=0) psr = pd.Series( range(length), - index=np.random.randint(0, high=100, size=length).astype(dtype), + index=rng.integers(0, high=100, size=length).astype(dtype), dtype="float64" if length == 0 else None, ) gsr = cudf.Series.from_pandas(psr) diff --git 
a/python/cudf/cudf/tests/test_resampling.py b/python/cudf/cudf/tests/test_resampling.py index a61477981f8..5ff0098bcf4 100644 --- a/python/cudf/cudf/tests/test_resampling.py +++ b/python/cudf/cudf/tests/test_resampling.py @@ -50,8 +50,9 @@ def test_series_upsample_simple(): @pytest.mark.parametrize("rule", ["2s", "10s"]) def test_series_resample_ffill(rule): - rng = pd.date_range("1/1/2012", periods=10, freq="5s") - ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng) + date_idx = pd.date_range("1/1/2012", periods=10, freq="5s") + rng = np.random.default_rng(seed=0) + ts = pd.Series(rng.integers(0, 500, len(date_idx)), index=date_idx) gts = cudf.from_pandas(ts) assert_resample_results_equal( ts.resample(rule).ffill(), gts.resample(rule).ffill() @@ -60,8 +61,9 @@ def test_series_resample_ffill(rule): @pytest.mark.parametrize("rule", ["2s", "10s"]) def test_series_resample_bfill(rule): - rng = pd.date_range("1/1/2012", periods=10, freq="5s") - ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng) + date_idx = pd.date_range("1/1/2012", periods=10, freq="5s") + rng = np.random.default_rng(seed=0) + ts = pd.Series(rng.integers(0, 500, len(date_idx)), index=date_idx) gts = cudf.from_pandas(ts) assert_resample_results_equal( ts.resample(rule).bfill(), gts.resample(rule).bfill() @@ -70,8 +72,9 @@ def test_series_resample_bfill(rule): @pytest.mark.parametrize("rule", ["2s", "10s"]) def test_series_resample_asfreq(rule): - rng = pd.date_range("1/1/2012", periods=100, freq="5s") - ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng) + date_range = pd.date_range("1/1/2012", periods=100, freq="5s") + rng = np.random.default_rng(seed=0) + ts = pd.Series(rng.integers(0, 500, len(date_range)), index=date_range) gts = cudf.from_pandas(ts) assert_resample_results_equal( ts.resample(rule).asfreq(), gts.resample(rule).asfreq() @@ -79,8 +82,9 @@ def test_series_resample_asfreq(rule): def test_dataframe_resample_aggregation_simple(): + rng = np.random.default_rng(seed=0) pdf = pd.DataFrame( - np.random.randn(1000, 3), + rng.standard_normal(size=(1000, 3)), index=pd.date_range("1/1/2012", freq="s", periods=1000), columns=["A", "B", "C"], ) @@ -91,8 +95,9 @@ def test_dataframe_resample_aggregation_simple(): def test_dataframe_resample_multiagg(): + rng = np.random.default_rng(seed=0) pdf = pd.DataFrame( - np.random.randn(1000, 3), + rng.standard_normal(size=(1000, 3)), index=pd.date_range("1/1/2012", freq="s", periods=1000), columns=["A", "B", "C"], ) @@ -104,10 +109,11 @@ def test_dataframe_resample_multiagg(): def test_dataframe_resample_on(): + rng = np.random.default_rng(seed=0) # test resampling on a specified column pdf = pd.DataFrame( { - "x": np.random.randn(1000), + "x": rng.standard_normal(size=(1000)), "y": pd.date_range("1/1/2012", freq="s", periods=1000), } ) @@ -119,15 +125,16 @@ def test_dataframe_resample_on(): def test_dataframe_resample_level(): + rng = np.random.default_rng(seed=0) # test resampling on a specific level of a MultIndex pdf = pd.DataFrame( { - "x": np.random.randn(1000), + "x": rng.standard_normal(size=1000), "y": pd.date_range("1/1/2012", freq="s", periods=1000), } ) pdi = pd.MultiIndex.from_frame(pdf) - pdf = pd.DataFrame({"a": np.random.randn(1000)}, index=pdi) + pdf = pd.DataFrame({"a": rng.standard_normal(size=1000)}, index=pdi) gdf = cudf.from_pandas(pdf) assert_resample_results_equal( pdf.resample("3min", level="y").mean(), @@ -153,11 +160,12 @@ def test_dataframe_resample_level(): reason="Fails in older versions of pandas", ) def 
test_resampling_frequency_conversion(in_freq, sampling_freq, out_freq): + rng = np.random.default_rng(seed=0) # test that we cast to the appropriate frequency # when resampling: pdf = pd.DataFrame( { - "x": np.random.randn(100), + "x": rng.standard_normal(size=100), "y": pd.date_range("1/1/2012", freq=in_freq, periods=100), } ) diff --git a/python/cudf/cudf/tests/test_reshape.py b/python/cudf/cudf/tests/test_reshape.py index 4235affd4d1..53fe5f7f30d 100644 --- a/python/cudf/cudf/tests/test_reshape.py +++ b/python/cudf/cudf/tests/test_reshape.py @@ -46,13 +46,12 @@ def test_melt(nulls, num_id_vars, num_value_vars, num_rows, dtype): pdf = pd.DataFrame() id_vars = [] + rng = np.random.default_rng(seed=0) for i in range(num_id_vars): colname = "id" + str(i) - data = np.random.randint(0, 26, num_rows).astype(dtype) + data = rng.integers(0, 26, num_rows).astype(dtype) if nulls == "some": - idx = np.random.choice( - num_rows, size=int(num_rows / 2), replace=False - ) + idx = rng.choice(num_rows, size=int(num_rows / 2), replace=False) data[idx] = np.nan elif nulls == "all": data[:] = np.nan @@ -62,11 +61,9 @@ def test_melt(nulls, num_id_vars, num_value_vars, num_rows, dtype): value_vars = [] for i in range(num_value_vars): colname = "val" + str(i) - data = np.random.randint(0, 26, num_rows).astype(dtype) + data = rng.integers(0, 26, num_rows).astype(dtype) if nulls == "some": - idx = np.random.choice( - num_rows, size=int(num_rows / 2), replace=False - ) + idx = rng.choice(num_rows, size=int(num_rows / 2), replace=False) data[idx] = np.nan elif nulls == "all": data[:] = np.nan @@ -119,6 +116,15 @@ def test_melt_str_scalar_id_var(): assert_eq(result, expected) +def test_melt_falsy_var_name(): + df = cudf.DataFrame({"A": ["a", "b", "c"], "B": [1, 3, 5], "C": [2, 4, 6]}) + result = cudf.melt(df, id_vars=["A"], value_vars=["B"], var_name="") + expected = pd.melt( + df.to_pandas(), id_vars=["A"], value_vars=["B"], var_name="" + ) + assert_eq(result, expected) + + @pytest.mark.parametrize("num_cols", [1, 2, 10]) @pytest.mark.parametrize("num_rows", [1, 2, 1000]) @pytest.mark.parametrize( @@ -130,13 +136,12 @@ def test_df_stack(nulls, num_cols, num_rows, dtype): pytest.skip(reason="nulls not supported in dtype: " + dtype) pdf = pd.DataFrame() + rng = np.random.default_rng(seed=0) for i in range(num_cols): colname = str(i) - data = np.random.randint(0, 26, num_rows).astype(dtype) + data = rng.integers(0, 26, num_rows).astype(dtype) if nulls == "some": - idx = np.random.choice( - num_rows, size=int(num_rows / 2), replace=False - ) + idx = rng.choice(num_rows, size=int(num_rows / 2), replace=False) data[idx] = np.nan pdf[colname] = data @@ -271,8 +276,8 @@ def test_df_stack_multiindex_column_axis_pd_example(level): ], names=["exp", "animal", "hair_length"], ) - - df = pd.DataFrame(np.random.randn(4, 4), columns=columns) + rng = np.random.default_rng(seed=0) + df = pd.DataFrame(rng.standard_normal(size=(4, 4)), columns=columns) with expect_warning_if(PANDAS_GE_220, FutureWarning): expect = df.stack(level=level, future_stack=False) @@ -299,14 +304,13 @@ def test_interleave_columns(nulls, num_cols, num_rows, dtype): pytest.skip(reason="nulls not supported in dtype: " + dtype) pdf = pd.DataFrame(dtype=dtype) + rng = np.random.default_rng(seed=0) for i in range(num_cols): colname = str(i) - data = pd.Series(np.random.randint(0, 26, num_rows)).astype(dtype) + data = pd.Series(rng.integers(0, 26, num_rows)).astype(dtype) if nulls == "some": - idx = np.random.choice( - num_rows, size=int(num_rows / 2), 
replace=False - ) + idx = rng.choice(num_rows, size=int(num_rows / 2), replace=False) data[idx] = np.nan pdf[colname] = data @@ -335,16 +339,13 @@ def test_tile(nulls, num_cols, num_rows, dtype, count): pytest.skip(reason="nulls not supported in dtype: " + dtype) pdf = pd.DataFrame(dtype=dtype) + rng = np.random.default_rng(seed=0) for i in range(num_cols): colname = str(i) - data = pd.Series(np.random.randint(num_cols, 26, num_rows)).astype( - dtype - ) + data = pd.Series(rng.integers(num_cols, 26, num_rows)).astype(dtype) if nulls == "some": - idx = np.random.choice( - num_rows, size=int(num_rows / 2), replace=False - ) + idx = rng.choice(num_rows, size=int(num_rows / 2), replace=False) data[idx] = np.nan pdf[colname] = data @@ -715,23 +716,20 @@ def test_pivot_duplicate_error(): @pytest.mark.parametrize( - "data", - [ + "aggfunc", ["mean", "count", {"D": "sum", "E": "count"}] +) +@pytest.mark.parametrize("fill_value", [0]) +def test_pivot_table_simple(aggfunc, fill_value): + rng = np.random.default_rng(seed=0) + pdf = pd.DataFrame( { "A": ["one", "one", "two", "three"] * 6, "B": ["A", "B", "C"] * 8, "C": ["foo", "foo", "foo", "bar", "bar", "bar"] * 4, - "D": np.random.randn(24), - "E": np.random.randn(24), + "D": rng.standard_normal(size=24), + "E": rng.standard_normal(size=24), } - ], -) -@pytest.mark.parametrize( - "aggfunc", ["mean", "count", {"D": "sum", "E": "count"}] -) -@pytest.mark.parametrize("fill_value", [0]) -def test_pivot_table_simple(data, aggfunc, fill_value): - pdf = pd.DataFrame(data) + ) expected = pd.pivot_table( pdf, values=["D", "E"], @@ -740,7 +738,7 @@ def test_pivot_table_simple(data, aggfunc, fill_value): aggfunc=aggfunc, fill_value=fill_value, ) - cdf = cudf.DataFrame(data) + cdf = cudf.DataFrame.from_pandas(pdf) actual = cudf.pivot_table( cdf, values=["D", "E"], @@ -753,23 +751,20 @@ def test_pivot_table_simple(data, aggfunc, fill_value): @pytest.mark.parametrize( - "data", - [ + "aggfunc", ["mean", "count", {"D": "sum", "E": "count"}] +) +@pytest.mark.parametrize("fill_value", [0]) +def test_dataframe_pivot_table_simple(aggfunc, fill_value): + rng = np.random.default_rng(seed=0) + pdf = pd.DataFrame( { "A": ["one", "one", "two", "three"] * 6, "B": ["A", "B", "C"] * 8, "C": ["foo", "foo", "foo", "bar", "bar", "bar"] * 4, - "D": np.random.randn(24), - "E": np.random.randn(24), + "D": rng.standard_normal(size=24), + "E": rng.standard_normal(size=24), } - ], -) -@pytest.mark.parametrize( - "aggfunc", ["mean", "count", {"D": "sum", "E": "count"}] -) -@pytest.mark.parametrize("fill_value", [0]) -def test_dataframe_pivot_table_simple(data, aggfunc, fill_value): - pdf = pd.DataFrame(data) + ) expected = pdf.pivot_table( values=["D", "E"], index=["A", "B"], @@ -777,7 +772,7 @@ def test_dataframe_pivot_table_simple(data, aggfunc, fill_value): aggfunc=aggfunc, fill_value=fill_value, ) - cdf = cudf.DataFrame(data) + cdf = cudf.DataFrame.from_pandas(pdf) actual = cdf.pivot_table( values=["D", "E"], index=["A", "B"], @@ -840,3 +835,20 @@ def test_crosstab_simple(): expected = pd.crosstab(a, [b, c], rownames=["a"], colnames=["b", "c"]) actual = cudf.crosstab(a, [b, c], rownames=["a"], colnames=["b", "c"]) assert_eq(expected, actual, check_dtype=False) + + +@pytest.mark.parametrize("index", [["ix"], ["ix", "foo"]]) +@pytest.mark.parametrize("columns", [["col"], ["col", "baz"]]) +def test_pivot_list_like_index_columns(index, columns): + data = { + "bar": ["x", "y", "z", "w"], + "col": ["a", "b", "a", "b"], + "foo": [1, 2, 3, 4], + "ix": [1, 1, 2, 2], + "baz": [0, 0, 0, 0], + 
} + pd_df = pd.DataFrame(data) + cudf_df = cudf.DataFrame(data) + result = cudf_df.pivot(columns=columns, index=index) + expected = pd_df.pivot(columns=columns, index=index) + assert_eq(result, expected) diff --git a/python/cudf/cudf/tests/test_s3.py b/python/cudf/cudf/tests/test_s3.py index 0958b68084d..afb82f75bcf 100644 --- a/python/cudf/cudf/tests/test_s3.py +++ b/python/cudf/cudf/tests/test_s3.py @@ -69,6 +69,7 @@ def s3_base(endpoint_ip, endpoint_port): # with an S3 endpoint on localhost endpoint_uri = f"http://{endpoint_ip}:{endpoint_port}/" + os.environ["AWS_ENDPOINT_URL"] = endpoint_uri server = ThreadedMotoServer(ip_address=endpoint_ip, port=endpoint_port) server.start() @@ -105,6 +106,15 @@ def s3_context(s3_base, bucket, files=None): pass +@pytest.fixture( + params=[True, False], + ids=["kvikio=ON", "kvikio=OFF"], +) +def kvikio_remote_io(request): + with cudf.option_context("kvikio_remote_io", request.param): + yield request.param + + @pytest.fixture def pdf(scope="module"): df = pd.DataFrame() @@ -193,6 +203,7 @@ def test_write_csv(s3_base, s3so, pdf, chunksize): def test_read_parquet( s3_base, s3so, + kvikio_remote_io, pdf, bytes_per_thread, columns, diff --git a/python/cudf/cudf/tests/test_serialize.py b/python/cudf/cudf/tests/test_serialize.py index 0b892a51895..b50ed04427f 100644 --- a/python/cudf/cudf/tests/test_serialize.py +++ b/python/cudf/cudf/tests/test_serialize.py @@ -7,6 +7,7 @@ import numpy as np import pandas as pd import pytest +from packaging import version import cudf from cudf.testing import _utils as utils, assert_eq @@ -149,13 +150,19 @@ def test_serialize(df, to_host): def test_serialize_dtype_error_checking(): dtype = cudf.IntervalDtype("float", "right") - header, frames = dtype.serialize() - with pytest.raises(AssertionError): - # Invalid number of frames - type(dtype).deserialize(header, [None] * (header["frame_count"] + 1)) + # Must call device_serialize (not serialize) to ensure that the type metadata is + # encoded in the header. 
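+    # (Hedged note: device_serialize is assumed to wrap serialize() and also
+    # record per-frame "is-cuda" flags in the header; the assertions below
+    # rely on both the type metadata and those flags being present.)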
+ header, frames = dtype.device_serialize() with pytest.raises(AssertionError): # mismatching class cudf.StructDtype.deserialize(header, frames) + # The is-cuda flag list length must match the number of frames + header["is-cuda"] = [False] + with pytest.raises(AssertionError): + # Invalid number of frames + type(dtype).deserialize( + header, [np.zeros(1)] * (header["frame_count"] + 1) + ) def test_serialize_dataframe(): @@ -170,11 +177,15 @@ def test_serialize_dataframe(): def test_serialize_dataframe_with_index(): - df = cudf.DataFrame() - df["a"] = np.arange(100) - df["b"] = np.random.random(100) - df["c"] = pd.Categorical( - ["a", "b", "c", "_", "_"] * 20, categories=["a", "b", "c"] + rng = np.random.default_rng(seed=0) + df = cudf.DataFrame( + { + "a": np.arange(100), + "b": rng.random(100), + "c": pd.Categorical( + ["a", "b", "c", "_", "_"] * 20, categories=["a", "b", "c"] + ), + } ) df = df.sort_values("b") outdf = cudf.DataFrame.deserialize(*df.serialize()) @@ -200,11 +211,12 @@ def test_serialize_generic_index(): def test_serialize_multi_index(): + rng = np.random.default_rng(seed=0) pdf = pd.DataFrame( { "a": [4, 17, 4, 9, 5], "b": [1, 4, 4, 3, 2], - "x": np.random.normal(size=5), + "x": rng.normal(size=5), } ) gdf = cudf.DataFrame.from_pandas(pdf) @@ -218,7 +230,8 @@ def test_serialize_multi_index(): def test_serialize_masked_series(): nelem = 50 - data = np.random.random(nelem) + rng = np.random.default_rng(seed=0) + data = rng.random(nelem) mask = utils.random_bitmask(nelem) bitmask = utils.expand_bits_to_bytes(mask)[:nelem] null_count = utils.count_zero(bitmask) @@ -229,10 +242,14 @@ def test_serialize_masked_series(): def test_serialize_groupby_df(): - df = cudf.DataFrame() - df["key_1"] = np.random.randint(0, 20, 100) - df["key_2"] = np.random.randint(0, 20, 100) - df["val"] = np.arange(100, dtype=np.float32) + rng = np.random.default_rng(seed=0) + df = cudf.DataFrame( + { + "key_1": rng.integers(0, 20, 100), + "key_2": rng.integers(0, 20, 100), + "val": np.arange(100, dtype=np.float32), + } + ) gb = df.groupby(["key_1", "key_2"], sort=True) outgb = gb.deserialize(*gb.serialize()) expect = gb.mean() @@ -241,9 +258,9 @@ def test_serialize_groupby_df(): def test_serialize_groupby_external(): - df = cudf.DataFrame() - df["val"] = np.arange(100, dtype=np.float32) - gb = df.groupby(cudf.Series(np.random.randint(0, 20, 100))) + rng = np.random.default_rng(seed=0) + df = cudf.DataFrame({"val": np.arange(100, dtype=np.float32)}) + gb = df.groupby(cudf.Series(rng.integers(0, 20, 100))) outgb = gb.deserialize(*gb.serialize()) expect = gb.mean() got = outgb.mean() @@ -262,7 +279,8 @@ def test_serialize_groupby_level(): def test_serialize_groupby_sr(): - sr = cudf.Series(np.random.randint(0, 20, 100)) + rng = np.random.default_rng(seed=0) + sr = cudf.Series(rng.integers(0, 20, 100)) gb = sr.groupby(sr // 2) outgb = gb.deserialize(*gb.serialize()) got = gb.mean() @@ -271,9 +289,10 @@ def test_serialize_groupby_sr(): def test_serialize_datetime(): + rng = np.random.default_rng(seed=0) # Make frame with datetime column df = pd.DataFrame( - {"x": np.random.randint(0, 5, size=20), "y": np.random.normal(size=20)} + {"x": rng.integers(0, 5, size=20), "y": rng.normal(size=20)} ) ts = np.arange(0, len(df), dtype=np.dtype("datetime64[ms]")) df["timestamp"] = ts @@ -285,9 +304,10 @@ def test_serialize_datetime(): def test_serialize_string(): + rng = np.random.default_rng(seed=0) # Make frame with string column df = pd.DataFrame( - {"x": np.random.randint(0, 5, size=5), "y": 
np.random.normal(size=5)} + {"x": rng.integers(0, 5, size=5), "y": rng.normal(size=5)} ) str_data = ["a", "bc", "def", "ghij", "klmno"] df["timestamp"] = str_data @@ -369,6 +389,10 @@ def test_serialize_string_check_buffer_sizes(): assert expect == got +@pytest.mark.skipif( + version.parse(np.__version__) < version.parse("2.0.0"), + reason="The serialization of numpy 2.0 types is incompatible with numpy 1.x", +) def test_deserialize_cudf_23_12(datadir): fname = datadir / "pkl" / "stringColumnWithRangeIndex_cudf_23.12.pkl" diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index a24002dc38e..7f0a4902ed1 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -519,13 +519,13 @@ def test_series_factorize_sort(data, sort): @pytest.mark.parametrize("nulls", ["none", "some"]) def test_series_datetime_value_counts(data, nulls, normalize, dropna): psr = data.copy() - + rng = np.random.default_rng(seed=0) if len(data) > 0: if nulls == "one": - p = np.random.randint(0, len(data)) + p = rng.integers(0, len(data)) psr[p] = None elif nulls == "some": - p = np.random.randint(0, len(data), 2) + p = rng.integers(0, len(data), 2) psr[p] = None gsr = cudf.from_pandas(psr) @@ -546,10 +546,10 @@ def test_series_datetime_value_counts(data, nulls, normalize, dropna): @pytest.mark.parametrize("num_elements", [10, 100, 1000]) def test_categorical_value_counts(dropna, normalize, num_elements): # create categorical series - np.random.seed(12) + rng = np.random.default_rng(seed=12) pd_cat = pd.Categorical( pd.Series( - np.random.choice(list(ascii_letters + digits), num_elements), + rng.choice(list(ascii_letters + digits), num_elements), dtype="category", ) ) @@ -586,8 +586,9 @@ def test_categorical_value_counts(dropna, normalize, num_elements): @pytest.mark.parametrize("dropna", [True, False]) @pytest.mark.parametrize("normalize", [True, False]) def test_series_value_counts(dropna, normalize): + rng = np.random.default_rng(seed=0) for size in [10**x for x in range(5)]: - arr = np.random.randint(low=-1, high=10, size=size) + arr = rng.integers(low=-1, high=10, size=size) mask = arr != -1 sr = cudf.Series.from_masked_array( arr, cudf.Series(mask)._column.as_mask() @@ -714,8 +715,8 @@ def test_series_mode(gs, dropna): @pytest.mark.parametrize( "arr", [ - np.random.normal(-100, 100, 1000), - np.random.randint(-50, 50, 1000), + np.random.default_rng(seed=0).normal(-100, 100, 1000), + np.random.default_rng(seed=0).integers(-50, 50, 1000), np.zeros(100), np.repeat([-0.6459412758761901], 100), np.repeat(np.nan, 100), @@ -731,12 +732,12 @@ def test_series_round(arr, decimals): expected = pser.round(decimals) assert_eq(result, expected) - + rng = np.random.default_rng(seed=0) # with nulls, maintaining existing null mask arr = arr.astype("float64") # for pandas nulls - arr.ravel()[ - np.random.choice(arr.shape[0], arr.shape[0] // 2, replace=False) - ] = np.nan + arr.ravel()[rng.choice(arr.shape[0], arr.shape[0] // 2, replace=False)] = ( + np.nan + ) pser = pd.Series(arr) ser = cudf.Series(arr) @@ -1726,7 +1727,7 @@ def test_series_truncate_datetimeindex(): [], [0, 12, 14], [0, 14, 12, 12, 3, 10, 12, 14], - np.random.randint(-100, 100, 200), + np.random.default_rng(seed=0).integers(-100, 100, 200), pd.Series([0.0, 1.0, None, 10.0]), [None, None, None, None], [np.nan, None, -1, 2, 3], @@ -1735,7 +1736,7 @@ def test_series_truncate_datetimeindex(): @pytest.mark.parametrize( "values", [ - np.random.randint(-100, 100, 10), + 
np.random.default_rng(seed=0).integers(-100, 100, 10), [], [np.nan, None, -1, 2, 3], [1.0, 12.0, None, None, 120], @@ -1746,7 +1747,8 @@ def test_series_truncate_datetimeindex(): ], ) def test_isin_numeric(data, values): - index = np.random.randint(0, 100, len(data)) + rng = np.random.default_rng(seed=0) + index = rng.integers(0, 100, len(data)) psr = pd.Series(data, index=index) gsr = cudf.Series.from_pandas(psr, nan_as_null=False) @@ -1943,8 +1945,9 @@ def test_diff_many_dtypes(data): @pytest.mark.parametrize("dtype", NUMERIC_TYPES + ["bool"]) @pytest.mark.parametrize("series_bins", [True, False]) def test_series_digitize(num_rows, num_bins, right, dtype, series_bins): - data = np.random.randint(0, 100, num_rows).astype(dtype) - bins = np.unique(np.sort(np.random.randint(2, 95, num_bins).astype(dtype))) + rng = np.random.default_rng(seed=0) + data = rng.integers(0, 100, num_rows).astype(dtype) + bins = np.unique(np.sort(rng.integers(2, 95, num_bins).astype(dtype))) s = cudf.Series(data) if series_bins: s_bins = cudf.Series(bins) @@ -1957,7 +1960,8 @@ def test_series_digitize(num_rows, num_bins, right, dtype, series_bins): def test_series_digitize_invalid_bins(): - s = cudf.Series(np.random.randint(0, 30, 80), dtype="int32") + rng = np.random.default_rng(seed=0) + s = cudf.Series(rng.integers(0, 30, 80), dtype="int32") bins = cudf.Series([2, None, None, 50, 90], dtype="int32") with pytest.raises( @@ -2038,7 +2042,8 @@ def test_default_float_bitwidth_construction(default_float_bitwidth, data): def test_series_ordered_dedup(): # part of https://github.com/rapidsai/cudf/issues/11486 - sr = cudf.Series(np.random.randint(0, 100, 1000)) + rng = np.random.default_rng(seed=0) + sr = cudf.Series(rng.integers(0, 100, 1000)) # pandas unique() preserves order expect = pd.Series(sr.to_pandas().unique()) got = cudf.Series._from_column(sr._column.unique()) diff --git a/python/cudf/cudf/tests/test_seriesmap.py b/python/cudf/cudf/tests/test_seriesmap.py index 3d8b6a79d2a..db1de7d0cf4 100644 --- a/python/cudf/cudf/tests/test_seriesmap.py +++ b/python/cudf/cudf/tests/test_seriesmap.py @@ -47,8 +47,8 @@ def test_series_map_callable_numeric_basic(): @pytest.mark.parametrize("nelem", list(product([2, 10, 100, 1000]))) def test_series_map_callable_numeric_random(nelem): # Generate data - np.random.seed(0) - data = np.random.random(nelem) * 100 + rng = np.random.default_rng(seed=0) + data = rng.random(nelem) * 100 sr = Series(data) pdsr = pd.Series(data) diff --git a/python/cudf/cudf/tests/test_sorting.py b/python/cudf/cudf/tests/test_sorting.py index 2cf2259d9ec..7e5ce713c7e 100644 --- a/python/cudf/cudf/tests/test_sorting.py +++ b/python/cudf/cudf/tests/test_sorting.py @@ -34,10 +34,10 @@ "nelem,dtype", list(product(sort_nelem_args, sort_dtype_args)) ) def test_dataframe_sort_values(nelem, dtype): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = DataFrame() - df["a"] = aa = (100 * np.random.random(nelem)).astype(dtype) - df["b"] = bb = (100 * np.random.random(nelem)).astype(dtype) + df["a"] = aa = (100 * rng.random(nelem)).astype(dtype) + df["b"] = bb = (100 * rng.random(nelem)).astype(dtype) sorted_df = df.sort_values(by="a") # Check sorted_index = np.argsort(aa, kind="mergesort") @@ -85,9 +85,9 @@ def test_series_sort_values_ignore_index(ignore_index): "nelem,sliceobj", list(product([10, 100], sort_slice_args)) ) def test_dataframe_sort_values_sliced(nelem, sliceobj): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = pd.DataFrame() - df["a"] = np.random.random(nelem) + df["a"] = 
rng.random(nelem) expect = df[sliceobj]["a"].sort_values() gdf = DataFrame.from_pandas(df) @@ -100,8 +100,8 @@ def test_dataframe_sort_values_sliced(nelem, sliceobj): list(product(sort_nelem_args, sort_dtype_args, [True, False])), ) def test_series_argsort(nelem, dtype, asc): - np.random.seed(0) - sr = Series((100 * np.random.random(nelem)).astype(dtype)) + rng = np.random.default_rng(seed=0) + sr = Series((100 * rng.random(nelem)).astype(dtype)) res = sr.argsort(ascending=asc) if asc: @@ -116,8 +116,8 @@ def test_series_argsort(nelem, dtype, asc): "nelem,asc", list(product(sort_nelem_args, [True, False])) ) def test_series_sort_index(nelem, asc): - np.random.seed(0) - sr = Series(100 * np.random.random(nelem)) + rng = np.random.default_rng(seed=0) + sr = Series(100 * rng.random(nelem)) psr = sr.to_pandas() expected = psr.sort_index(ascending=asc) @@ -167,9 +167,9 @@ def test_series_nsmallest(data, n): @pytest.mark.parametrize("op", ["nsmallest", "nlargest"]) @pytest.mark.parametrize("columns", ["a", ["b", "a"]]) def test_dataframe_nlargest_nsmallest(nelem, n, op, columns): - np.random.seed(0) - aa = np.random.random(nelem) - bb = np.random.random(nelem) + rng = np.random.default_rng(seed=0) + aa = rng.random(nelem) + bb = rng.random(nelem) df = DataFrame({"a": aa, "b": bb}) pdf = df.to_pandas() @@ -181,10 +181,10 @@ def test_dataframe_nlargest_nsmallest(nelem, n, op, columns): ) def test_dataframe_nlargest_sliced(counts, sliceobj): nelem, n = counts - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = pd.DataFrame() - df["a"] = np.random.random(nelem) - df["b"] = np.random.random(nelem) + df["a"] = rng.random(nelem) + df["b"] = rng.random(nelem) expect = df[sliceobj].nlargest(n, "a") gdf = DataFrame.from_pandas(df) @@ -197,10 +197,10 @@ def test_dataframe_nlargest_sliced(counts, sliceobj): ) def test_dataframe_nsmallest_sliced(counts, sliceobj): nelem, n = counts - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = pd.DataFrame() - df["a"] = np.random.random(nelem) - df["b"] = np.random.random(nelem) + df["a"] = rng.random(nelem) + df["b"] = rng.random(nelem) expect = df[sliceobj].nsmallest(n, "a") gdf = DataFrame.from_pandas(df) @@ -216,13 +216,13 @@ def test_dataframe_nsmallest_sliced(counts, sliceobj): def test_dataframe_multi_column( num_cols, num_rows, dtype, ascending, na_position ): - np.random.seed(0) + rng = np.random.default_rng(seed=0) by = list(string.ascii_lowercase[:num_cols]) pdf = pd.DataFrame() for i in range(5): colname = string.ascii_lowercase[i] - data = np.random.randint(0, 26, num_rows).astype(dtype) + data = rng.integers(0, 26, num_rows).astype(dtype) pdf[colname] = data gdf = DataFrame.from_pandas(pdf) @@ -244,17 +244,17 @@ def test_dataframe_multi_column( def test_dataframe_multi_column_nulls( num_cols, num_rows, dtype, nulls, ascending, na_position ): - np.random.seed(0) + rng = np.random.default_rng(seed=0) by = list(string.ascii_lowercase[:num_cols]) pdf = pd.DataFrame() for i in range(3): colname = string.ascii_lowercase[i] - data = np.random.randint(0, 26, num_rows).astype(dtype) + data = rng.integers(0, 26, num_rows).astype(dtype) if nulls == "some": idx = np.array([], dtype="int64") if num_rows > 0: - idx = np.random.choice( + idx = rng.choice( num_rows, size=int(num_rows / 4), replace=False ) data[idx] = np.nan @@ -295,8 +295,8 @@ def test_dataframe_multi_column_nulls_multiple_ascending( @pytest.mark.parametrize("nelem", [1, 100]) def test_series_nlargest_nelem(nelem): - np.random.seed(0) - elems = np.random.random(nelem) + rng = 
np.random.default_rng(seed=0) + elems = rng.random(nelem) gds = Series(elems).nlargest(nelem) pds = pd.Series(elems).nlargest(nelem) @@ -308,11 +308,14 @@ def test_series_nlargest_nelem(nelem): @pytest.mark.parametrize("keep", [True, False]) def test_dataframe_scatter_by_map(map_size, nelem, keep): strlist = ["dog", "cat", "fish", "bird", "pig", "fox", "cow", "goat"] - np.random.seed(0) - df = DataFrame() - df["a"] = np.random.choice(strlist[:map_size], nelem) - df["b"] = np.random.uniform(low=0, high=map_size, size=nelem) - df["c"] = np.random.randint(map_size, size=nelem) + rng = np.random.default_rng(seed=0) + df = DataFrame( + { + "a": rng.choice(strlist[:map_size], nelem), + "b": rng.uniform(low=0, high=map_size, size=nelem), + "c": rng.integers(map_size, size=nelem), + } + ) df["d"] = df["a"].astype("category") def _check_scatter_by_map(dfs, col): @@ -381,10 +384,10 @@ def _check_scatter_by_map(dfs, col): "kind", ["quicksort", "mergesort", "heapsort", "stable"] ) def test_dataframe_sort_values_kind(nelem, dtype, kind): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = DataFrame() - df["a"] = aa = (100 * np.random.random(nelem)).astype(dtype) - df["b"] = bb = (100 * np.random.random(nelem)).astype(dtype) + df["a"] = aa = (100 * rng.random(nelem)).astype(dtype) + df["b"] = bb = (100 * rng.random(nelem)).astype(dtype) with expect_warning_if(kind != "quicksort", UserWarning): sorted_df = df.sort_values(by="a", kind=kind) # Check diff --git a/python/cudf/cudf/tests/test_sparse_df.py b/python/cudf/cudf/tests/test_sparse_df.py index 3248e7f72c0..8b68ae6480b 100644 --- a/python/cudf/cudf/tests/test_sparse_df.py +++ b/python/cudf/cudf/tests/test_sparse_df.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. 
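# A minimal sketch of the RNG migration applied throughout these test hunks,
# assuming NumPy >= 1.17 (where np.random.default_rng was introduced): each
# legacy module-level call on the left is replaced by the Generator method on
# the right, with the seed supplied at construction rather than via
# np.random.seed. Illustration only; the variable names are not from the diff.
import numpy as np

rng = np.random.default_rng(seed=0)       # replaces np.random.seed(0)
ints = rng.integers(0, 100, size=10)      # replaces np.random.randint(0, 100, 10)
floats = rng.random(10)                   # replaces np.random.random(10) / np.random.rand(10)
normals = rng.normal(-100, 100, size=10)  # replaces np.random.normal(-100, 100, 10)
picks = rng.choice(["a", "b", "c"], 5)    # replaces np.random.choice(["a", "b", "c"], 5)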
import numpy as np @@ -6,7 +6,8 @@ def test_to_dense_array(): - data = np.random.random(8) + rng = np.random.default_rng(seed=0) + data = rng.random(8) mask = np.asarray([0b11010110]).astype(np.byte) sr = Series.from_masked_array(data=data, mask=mask, null_count=3) diff --git a/python/cudf/cudf/tests/test_stats.py b/python/cudf/cudf/tests/test_stats.py index f952cea07f8..27de0ed42e5 100644 --- a/python/cudf/cudf/tests/test_stats.py +++ b/python/cudf/cudf/tests/test_stats.py @@ -24,8 +24,8 @@ @pytest.mark.parametrize("dtype", params_dtypes) @pytest.mark.parametrize("skipna", [True, False]) def test_series_reductions(method, dtype, skipna): - np.random.seed(0) - arr = np.random.random(100) + rng = np.random.default_rng(seed=0) + arr = rng.random(100) if np.issubdtype(dtype, np.integer): arr *= 100 mask = arr > 10 @@ -56,8 +56,8 @@ def call_test(sr, skipna): def test_series_reductions_concurrency(method): e = ThreadPoolExecutor(10) - np.random.seed(0) - srs = [cudf.Series(np.random.random(10000)) for _ in range(1)] + rng = np.random.default_rng(seed=0) + srs = [cudf.Series(rng.random(10000)) for _ in range(1)] def call_test(sr): fn = getattr(sr, method) @@ -74,8 +74,8 @@ def f(sr): @pytest.mark.parametrize("ddof", range(3)) def test_series_std(ddof): - np.random.seed(0) - arr = np.random.random(100) - 0.5 + rng = np.random.default_rng(seed=0) + arr = rng.random(100) - 0.5 sr = cudf.Series(arr) pd = sr.to_pandas() got = sr.std(ddof=ddof) @@ -84,8 +84,9 @@ def test_series_std(ddof): def test_series_unique(): + rng = np.random.default_rng(seed=0) for size in [10**x for x in range(5)]: - arr = np.random.randint(low=-1, high=10, size=size) + arr = rng.integers(low=-1, high=10, size=size) mask = arr != -1 sr = cudf.Series(arr) sr[~mask] = None @@ -129,7 +130,8 @@ def test_series_nunique(nan_as_null, dropna): def test_series_scale(): - arr = pd.Series(np.random.randint(low=-10, high=10, size=100)) + rng = np.random.default_rng(seed=0) + arr = pd.Series(rng.integers(low=-10, high=10, size=100)) sr = cudf.Series(arr) vmin = arr.min() @@ -229,8 +231,8 @@ def test_misc_quantiles(data, q): @pytest.mark.parametrize( "data", [ - {"data": np.random.normal(-100, 100, 1000)}, - {"data": np.random.randint(-50, 50, 1000)}, + {"data": np.random.default_rng(seed=0).normal(-100, 100, 1000)}, + {"data": np.random.default_rng(seed=0).integers(-50, 50, 1000)}, {"data": (np.zeros(100))}, {"data": np.repeat(np.nan, 100)}, {"data": np.array([1.123, 2.343, np.nan, 0.0])}, @@ -280,8 +282,8 @@ def test_kurt_skew_error(op): @pytest.mark.parametrize( "data", [ - cudf.Series(np.random.normal(-100, 100, 1000)), - cudf.Series(np.random.randint(-50, 50, 1000)), + cudf.Series(np.random.default_rng(seed=0).normal(-100, 100, 1000)), + cudf.Series(np.random.default_rng(seed=0).integers(-50, 50, 1000)), cudf.Series(np.zeros(100)), cudf.Series(np.repeat(np.nan, 100)), cudf.Series(np.array([1.123, 2.343, np.nan, 0.0])), @@ -311,8 +313,8 @@ def test_skew_series(data, null_flag, numeric_only): @pytest.mark.parametrize("dtype", params_dtypes) @pytest.mark.parametrize("num_na", [0, 1, 50, 99, 100]) def test_series_median(dtype, num_na): - np.random.seed(0) - arr = np.random.random(100) + rng = np.random.default_rng(seed=0) + arr = rng.random(100) if np.issubdtype(dtype, np.integer): arr *= 100 mask = np.arange(100) >= num_na @@ -344,8 +346,8 @@ def test_series_median(dtype, num_na): @pytest.mark.parametrize( "data", [ - np.random.normal(-100, 100, 1000), - np.random.randint(-50, 50, 1000), + np.random.default_rng(seed=0).normal(-100, 100, 
1000), + np.random.default_rng(seed=0).integers(-50, 50, 1000), np.zeros(100), np.array([1.123, 2.343, np.nan, 0.0]), np.array([-2, 3.75, 6, None, None, None, -8.5, None, 4.2]), @@ -379,8 +381,8 @@ def test_series_pct_change(data, periods, fill_method): @pytest.mark.parametrize( "data1", [ - np.random.normal(-100, 100, 1000), - np.random.randint(-50, 50, 1000), + np.random.default_rng(seed=0).normal(-100, 100, 1000), + np.random.default_rng(seed=0).integers(-50, 50, 1000), np.zeros(100), np.repeat(np.nan, 100), np.array([1.123, 2.343, np.nan, 0.0]), @@ -393,8 +395,8 @@ def test_series_pct_change(data, periods, fill_method): @pytest.mark.parametrize( "data2", [ - np.random.normal(-100, 100, 1000), - np.random.randint(-50, 50, 1000), + np.random.default_rng(seed=0).normal(-100, 100, 1000), + np.random.default_rng(seed=0).integers(-50, 50, 1000), np.zeros(100), np.repeat(np.nan, 100), np.array([1.123, 2.343, np.nan, 0.0]), @@ -423,8 +425,8 @@ def test_cov1d(data1, data2): @pytest.mark.parametrize( "data1", [ - np.random.normal(-100, 100, 1000), - np.random.randint(-50, 50, 1000), + np.random.default_rng(seed=0).normal(-100, 100, 1000), + np.random.default_rng(seed=0).integers(-50, 50, 1000), np.zeros(100), np.repeat(np.nan, 100), np.array([1.123, 2.343, np.nan, 0.0]), @@ -437,8 +439,8 @@ def test_cov1d(data1, data2): @pytest.mark.parametrize( "data2", [ - np.random.normal(-100, 100, 1000), - np.random.randint(-50, 50, 1000), + np.random.default_rng(seed=0).normal(-100, 100, 1000), + np.random.default_rng(seed=0).integers(-50, 50, 1000), np.zeros(100), np.repeat(np.nan, 100), np.array([1.123, 2.343, np.nan, 0.0]), diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py index cc88cc79769..e25f99d7bee 100644 --- a/python/cudf/cudf/tests/test_string.py +++ b/python/cudf/cudf/tests/test_string.py @@ -36,6 +36,7 @@ idx_list = [None, [10, 11, 12, 13, 14]] idx_id_list = ["None_index", "Set_index"] +rng = np.random.default_rng(seed=0) def raise_builder(flags, exceptions): @@ -132,9 +133,14 @@ def test_string_get_item(ps_gs, item): np.array([False] * 5), cupy.asarray(np.array([True] * 5)), cupy.asarray(np.array([False] * 5)), - np.random.randint(0, 2, 5).astype("bool").tolist(), - np.random.randint(0, 2, 5).astype("bool"), - cupy.asarray(np.random.randint(0, 2, 5).astype("bool")), + np.random.default_rng(seed=0) + .integers(0, 2, 5) + .astype("bool") + .tolist(), + np.random.default_rng(seed=0).integers(0, 2, 5).astype("bool"), + cupy.asarray( + np.random.default_rng(seed=0).integers(0, 2, 5).astype("bool") + ), ], ) def test_string_bool_mask(ps_gs, item): @@ -1078,7 +1084,8 @@ def test_string_set_scalar(scalar): def test_string_index(): - pdf = pd.DataFrame(np.random.rand(5, 5)) + rng = np.random.default_rng(seed=0) + pdf = pd.DataFrame(rng.random(size=(5, 5))) gdf = cudf.DataFrame.from_pandas(pdf) stringIndex = ["a", "b", "c", "d", "e"] pdf.index = stringIndex @@ -1899,6 +1906,26 @@ def test_string_findall(pat, flags): assert_eq(expected, actual) +@pytest.mark.parametrize( + "pat, flags, pos", + [ + ("Monkey", 0, [-1, 0, -1, -1]), + ("on", 0, [2, 1, -1, 1]), + ("bit", 0, [-1, -1, 3, -1]), + ("on$", 0, [2, -1, -1, -1]), + ("on$", re.MULTILINE, [2, -1, -1, 1]), + ("o.*k", re.DOTALL, [-1, 1, -1, 1]), + ], +) +def test_string_find_re(pat, flags, pos): + test_data = ["Lion", "Monkey", "Rabbit", "Don\nkey"] + gs = cudf.Series(test_data) + + expected = pd.Series(pos, dtype=np.int32) + actual = gs.str.find_re(pat, flags) + assert_eq(expected, actual) + + def 
test_string_replace_multi(): ps = pd.Series(["hello", "goodbye"]) gs = cudf.Series(["hello", "goodbye"]) diff --git a/python/cudf/cudf/tests/test_struct.py b/python/cudf/cudf/tests/test_struct.py index e91edc9eec6..b85943626a6 100644 --- a/python/cudf/cudf/tests/test_struct.py +++ b/python/cudf/cudf/tests/test_struct.py @@ -50,10 +50,14 @@ def test_struct_for_field(key, expect): assert_eq(expect, got) -@pytest.mark.parametrize("input_obj", [[{"a": 1, "b": cudf.NA, "c": 3}]]) -def test_series_construction_with_nulls(input_obj): - expect = pa.array(input_obj, from_pandas=True) - got = cudf.Series(input_obj).to_arrow() +def test_series_construction_with_nulls(): + fields = [ + pa.array([1], type=pa.int64()), + pa.array([None], type=pa.int64()), + pa.array([3], type=pa.int64()), + ] + expect = pa.StructArray.from_arrays(fields, ["a", "b", "c"]) + got = cudf.Series(expect).to_arrow() assert expect == got @@ -75,7 +79,7 @@ def test_series_construction_with_nulls(input_obj): ) def test_serialize_struct_dtype(fields): dtype = cudf.StructDtype(fields) - recreated = dtype.__class__.deserialize(*dtype.serialize()) + recreated = dtype.__class__.device_deserialize(*dtype.device_serialize()) assert recreated == dtype diff --git a/python/cudf/cudf/tests/test_transform.py b/python/cudf/cudf/tests/test_transform.py index 88938457545..1305022d7fa 100644 --- a/python/cudf/cudf/tests/test_transform.py +++ b/python/cudf/cudf/tests/test_transform.py @@ -24,8 +24,8 @@ def _generic_function(a): ) def test_apply_python_lambda(dtype, udf, testfunc): size = 500 - - lhs_arr = np.random.random(size).astype(dtype) + rng = np.random.default_rng(seed=0) + lhs_arr = rng.random(size).astype(dtype) lhs_ser = Series(lhs_arr) out_ser = lhs_ser.apply(udf) diff --git a/python/cudf/cudf/tests/test_unaops.py b/python/cudf/cudf/tests/test_unaops.py index 5f5d79c1dce..b714beb0069 100644 --- a/python/cudf/cudf/tests/test_unaops.py +++ b/python/cudf/cudf/tests/test_unaops.py @@ -17,7 +17,8 @@ @pytest.mark.parametrize("dtype", utils.NUMERIC_TYPES) def test_series_abs(dtype): - arr = (np.random.random(1000) * 100).astype(dtype) + rng = np.random.default_rng(seed=0) + arr = (rng.random(1000) * 100).astype(dtype) sr = Series(arr) np.testing.assert_equal(sr.abs().to_numpy(), np.abs(arr)) np.testing.assert_equal(abs(sr).to_numpy(), abs(arr)) @@ -25,22 +26,24 @@ def test_series_abs(dtype): @pytest.mark.parametrize("dtype", utils.INTEGER_TYPES) def test_series_invert(dtype): - arr = (np.random.random(1000) * 100).astype(dtype) + rng = np.random.default_rng(seed=0) + arr = (rng.random(1000) * 100).astype(dtype) sr = Series(arr) np.testing.assert_equal((~sr).to_numpy(), np.invert(arr)) np.testing.assert_equal((~sr).to_numpy(), ~arr) def test_series_neg(): - arr = np.random.random(100) * 100 + rng = np.random.default_rng(seed=0) + arr = rng.random(100) * 100 sr = Series(arr) np.testing.assert_equal((-sr).to_numpy(), -arr) @pytest.mark.parametrize("mth", ["min", "max", "sum", "product"]) def test_series_pandas_methods(mth): - np.random.seed(0) - arr = (1 + np.random.random(5) * 100).astype(np.int64) + rng = np.random.default_rng(seed=0) + arr = (1 + rng.random(5) * 100).astype(np.int64) sr = Series(arr) psr = pd.Series(arr) np.testing.assert_equal(getattr(sr, mth)(), getattr(psr, mth)()) diff --git a/python/cudf/cudf/tests/test_unique.py b/python/cudf/cudf/tests/test_unique.py index 699b3340521..9a1c3b213b8 100644 --- a/python/cudf/cudf/tests/test_unique.py +++ b/python/cudf/cudf/tests/test_unique.py @@ -12,9 +12,9 @@ @pytest.fixture def df(): 
df = cudf.DataFrame() - np.random.seed(0) + rng = np.random.default_rng(seed=0) - arr = np.random.randint(2, size=10, dtype=np.int64) + arr = rng.integers(2, size=10, dtype=np.int64) df["foo"] = arr df["bar"] = cudf.Series([pd.Timestamp(x) for x in arr]) diff --git a/python/cudf/cudf/tests/text/test_text_methods.py b/python/cudf/cudf/tests/text/test_text_methods.py index 997ca357986..47e541fdcef 100644 --- a/python/cudf/cudf/tests/text/test_text_methods.py +++ b/python/cudf/cudf/tests/text/test_text_methods.py @@ -882,68 +882,48 @@ def test_is_vowel_consonant(): assert_eq(expected, actual) -def test_minhash(): +def test_minhash_permuted(): strings = cudf.Series(["this is my", "favorite book", None, ""]) + params = cudf.Series([1, 2, 3], dtype=np.uint32) expected = cudf.Series( [ - cudf.Series([21141582], dtype=np.uint32), - cudf.Series([962346254], dtype=np.uint32), - None, - cudf.Series([0], dtype=np.uint32), - ] - ) - actual = strings.str.minhash() - assert_eq(expected, actual) - seeds = cudf.Series([0, 1, 2], dtype=np.uint32) - expected = cudf.Series( - [ - cudf.Series([1305480167, 668155704, 34311509], dtype=np.uint32), - cudf.Series([32665384, 3470118, 363147162], dtype=np.uint32), + cudf.Series([1305480168, 462824406, 74608229], dtype=np.uint32), + cudf.Series([32665385, 65330770, 97996155], dtype=np.uint32), None, cudf.Series([0, 0, 0], dtype=np.uint32), ] ) - actual = strings.str.minhash(seeds=seeds, width=5) + actual = strings.str.minhash_permuted(0, a=params, b=params, width=5) assert_eq(expected, actual) - expected = cudf.Series( - [ - cudf.Series([3232308021562742685], dtype=np.uint64), - cudf.Series([23008204270530356], dtype=np.uint64), - None, - cudf.Series([0], dtype=np.uint64), - ] - ) - actual = strings.str.minhash64() - assert_eq(expected, actual) - seeds = cudf.Series([0, 1, 2], dtype=np.uint64) + params = cudf.Series([1, 2, 3], dtype=np.uint64) expected = cudf.Series( [ cudf.Series( - [7082801294247314046, 185949556058924788, 167570629329462454], + [105531920695060180, 172452388517576009, 316595762085180524], dtype=np.uint64, ), cudf.Series( - [382665377781028452, 86243762733551437, 7688750597953083512], + [35713768479063122, 71427536958126236, 58787297728258212], dtype=np.uint64, ), None, cudf.Series([0, 0, 0], dtype=np.uint64), ] ) - actual = strings.str.minhash64(seeds=seeds, width=5) + actual = strings.str.minhash64_permuted(0, a=params, b=params, width=5) assert_eq(expected, actual) # test wrong seed types with pytest.raises(ValueError): - strings.str.minhash(seeds="a") + strings.str.minhash_permuted(1, a="a", b="b", width=7) with pytest.raises(ValueError): - seeds = cudf.Series([0, 1, 2], dtype=np.int32) - strings.str.minhash(seeds=seeds) + params = cudf.Series([0, 1, 2], dtype=np.int32) + strings.str.minhash_permuted(1, a=params, b=params, width=6) with pytest.raises(ValueError): - seeds = cudf.Series([0, 1, 2], dtype=np.uint32) - strings.str.minhash64(seeds=seeds) + params = cudf.Series([0, 1, 2], dtype=np.uint32) + strings.str.minhash64_permuted(1, a=params, b=params, width=8) def test_word_minhash(): diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index b0788bcc0fc..57bf08e6eec 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -364,13 +364,19 @@ def min_column_type(x, expected_type): if x.null_count == len(x): return x.dtype - if x.dtype.kind == "f": - return get_min_float_dtype(x) - - elif cudf.dtype(expected_type).kind in "iu": - max_bound_dtype = np.min_scalar_type(x.max()) - 
min_bound_dtype = np.min_scalar_type(x.min()) + min_value, max_value = x.min(), x.max() + either_is_inf = np.isinf(min_value) or np.isinf(max_value) + expected_type = cudf.dtype(expected_type) + if not either_is_inf and expected_type.kind in "i": + max_bound_dtype = min_signed_type(max_value) + min_bound_dtype = min_signed_type(min_value) result_type = np.promote_types(max_bound_dtype, min_bound_dtype) + elif not either_is_inf and expected_type.kind in "u": + max_bound_dtype = min_unsigned_type(max_value) + min_bound_dtype = min_unsigned_type(min_value) + result_type = np.promote_types(max_bound_dtype, min_bound_dtype) + elif x.dtype.kind == "f": + return get_min_float_dtype(x) else: result_type = x.dtype diff --git a/python/cudf/cudf/utils/hash_vocab_utils.py b/python/cudf/cudf/utils/hash_vocab_utils.py index babe4be2715..896a3809c67 100644 --- a/python/cudf/cudf/utils/hash_vocab_utils.py +++ b/python/cudf/cudf/utils/hash_vocab_utils.py @@ -69,10 +69,10 @@ def _get_space_util(bins, init_bins): return sum(_new_bin_length(len(b)) for b in bins) + 2 * init_bins -def _pick_initial_a_b(data, max_constant, init_bins): +def _pick_initial_a_b(data, max_constant, init_bins, rng): while True: - a = np.random.randint(2**12, 2**15) - b = np.random.randint(2**12, 2**15) + a = rng.integers(2**12, 2**15) + b = rng.integers(2**12, 2**15) bins = _make_bins(data, init_bins, a, b) score = _get_space_util(bins, init_bins) / len(data) @@ -86,18 +86,18 @@ def _pick_initial_a_b(data, max_constant, init_bins): return bins, a, b -def _find_hash_for_internal(hash_bin): +def _find_hash_for_internal(hash_bin, rng): if not hash_bin: return [[], 0, 0] new_length = _new_bin_length(len(hash_bin)) while True: - a = np.random.randint( + a = rng.integers( A_LBOUND_SECOND_LEVEL_HASH, A_HBOUND_SECOND_LEVEL_HASH, ) - b = np.random.randint( + b = rng.integers( B_LBOUND_SECOND_LEVEL_HASH, B_HBOUND_SECOND_LEVEL_HASH ) bins = _make_bins(hash_bin, new_length, a, b) @@ -108,11 +108,11 @@ def _find_hash_for_internal(hash_bin): return bins, a, b -def _perfect_hash(integers, max_constant): +def _perfect_hash(integers, max_constant, rng): num_top_level_bins = len(integers) // 4 init_bins, init_a, init_b = _pick_initial_a_b( - integers, max_constant, num_top_level_bins + integers, max_constant, num_top_level_bins, rng ) flattened_bins = [] @@ -127,7 +127,7 @@ def _perfect_hash(integers, max_constant): for i, b in enumerate(init_bins): if i % 500 == 0: print(f"Processing bin {i} / {len(init_bins)} of size = {len(b)}") - internal_table, coeff_a, coeff_b = _find_hash_for_internal(b) + internal_table, coeff_a, coeff_b = _find_hash_for_internal(b, rng) bin_length = len(internal_table) max_bin_length = max(bin_length, max_bin_length) internal_table_coeffs[i] = ( @@ -245,7 +245,7 @@ def hash_vocab( """ Write the vocab vocabulary hashtable to the output_path """ - np.random.seed(1243342) + rng = np.random.default_rng(seed=1243342) vocab = _load_vocab_dict(vocab_path) keys = list(map(_sdbm_hash, vocab.keys())) @@ -264,7 +264,7 @@ def hash_vocab( hash_table, inner_table_coeffs, offsets_into_ht, - ) = _perfect_hash(keys, 10) + ) = _perfect_hash(keys, 10, rng) _pack_keys_and_values(hash_table, hashed_vocab) _store_func( diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index d636f36f282..86ed749772f 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -16,6 +16,7 @@ import pandas as pd from fsspec.core import expand_paths_if_needed, get_fs_token_paths +import cudf from 
cudf.api.types import is_list_like from cudf.core._compat import PANDAS_LT_300 from cudf.utils.docutils import docfmt_partial @@ -209,6 +210,11 @@ ----- {remote_data_sources} +- Setting the cudf option `io.parquet.low_memory=True` will result in the chunked + low memory parquet reader being used. This can make it easier to read large + parquet datasets on systems with limited GPU memory. See all `available options + `_. + Examples -------- >>> import cudf @@ -757,9 +763,14 @@ Notes ----- -When `engine='auto'`, and `line=False`, the `pandas` json -reader will be used. To override the selection, please -use `engine='cudf'`. +- When `engine='auto'`, and `line=False`, the `pandas` json + reader will be used. To override the selection, please + use `engine='cudf'`. + +- Setting the cudf option `io.json.low_memory=True` will result in the chunked + low memory json reader being used. This can make it easier to read large + json datasets on systems with limited GPU memory. See all `available options + `_. See Also -------- @@ -1624,6 +1635,16 @@ def _maybe_expand_directories(paths, glob_pattern, fs): return expanded_paths +def _use_kvikio_remote_io(fs) -> bool: + """Whether `kvikio_remote_io` is enabled and `fs` refers to an S3 file""" + + try: + from s3fs.core import S3FileSystem + except ImportError: + return False + return cudf.get_option("kvikio_remote_io") and isinstance(fs, S3FileSystem) + + @doc_get_reader_filepath_or_buffer() def get_reader_filepath_or_buffer( path_or_data, @@ -1649,17 +1670,17 @@ def get_reader_filepath_or_buffer( ) ] if not input_sources: - raise ValueError("Empty input source list: {input_sources}.") + raise ValueError(f"Empty input source list: {input_sources}.") filepaths_or_buffers = [] string_paths = [isinstance(source, str) for source in input_sources] if any(string_paths): - # Sources are all strings. Thes strings are typically + # Sources are all strings. The strings are typically # file paths, but they may also be raw text strings. # Don't allow a mix of source types if not all(string_paths): - raise ValueError("Invalid input source list: {input_sources}.") + raise ValueError(f"Invalid input source list: {input_sources}.") # Make sure we define a filesystem (if possible) paths = input_sources @@ -1712,11 +1733,17 @@ def get_reader_filepath_or_buffer( raise FileNotFoundError( f"{input_sources} could not be resolved to any files" ) - filepaths_or_buffers = _prefetch_remote_buffers( - paths, - fs, - **(prefetch_options or {}), - ) + + # If `kvikio_remote_io` is enabled and `fs` refers to an S3 file, + # we create S3 URLs and let them pass through to libcudf.
+ if _use_kvikio_remote_io(fs): + filepaths_or_buffers = [f"s3://{fpath}" for fpath in paths] + else: + filepaths_or_buffers = _prefetch_remote_buffers( + paths, + fs, + **(prefetch_options or {}), + ) else: raw_text_input = True diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py index 7347ec7866a..294253cd119 100644 --- a/python/cudf/cudf/utils/utils.py +++ b/python/cudf/cudf/utils/utils.py @@ -6,6 +6,7 @@ import os import traceback import warnings +from typing import Any import numpy as np import pandas as pd @@ -403,3 +404,40 @@ def _all_bools_with_nulls(lhs, rhs, bool_fill_value): if result_mask is not None: result_col = result_col.set_mask(result_mask.as_mask()) return result_col + + +def _datetime_timedelta_find_and_replace( + original_column: "cudf.core.column.DatetimeColumn" + | "cudf.core.column.TimeDeltaColumn", + to_replace: Any, + replacement: Any, + all_nan: bool = False, +) -> "cudf.core.column.DatetimeColumn" | "cudf.core.column.TimeDeltaColumn": + """ + This is an internal utility to find and replace values in a datetime or + timedelta column. It is used by the `find_and_replace` method of + `DatetimeColumn` and `TimeDeltaColumn`. Centralizing the code in a single place + as opposed to duplicating it in both classes. + """ + original_col_class = type(original_column) + if not isinstance(to_replace, original_col_class): + to_replace = cudf.core.column.as_column(to_replace) + if to_replace.can_cast_safely(original_column.dtype): + to_replace = to_replace.astype(original_column.dtype) + if not isinstance(replacement, original_col_class): + replacement = cudf.core.column.as_column(replacement) + if replacement.can_cast_safely(original_column.dtype): + replacement = replacement.astype(original_column.dtype) + if isinstance(to_replace, original_col_class): + to_replace = to_replace.as_numerical_column(dtype=np.dtype("int64")) + if isinstance(replacement, original_col_class): + replacement = replacement.as_numerical_column(dtype=np.dtype("int64")) + try: + result_col = ( + original_column.as_numerical_column(dtype=np.dtype("int64")) + .find_and_replace(to_replace, replacement, all_nan) + .astype(original_column.dtype) + ) + except TypeError: + result_col = original_column.copy(deep=True) + return result_col # type: ignore diff --git a/python/cudf/cudf_pandas_tests/data/repr_slow_down_test.ipynb b/python/cudf/cudf_pandas_tests/data/repr_slow_down_test.ipynb index c7d39b78810..94904fd83d4 100644 --- a/python/cudf/cudf_pandas_tests/data/repr_slow_down_test.ipynb +++ b/python/cudf/cudf_pandas_tests/data/repr_slow_down_test.ipynb @@ -18,13 +18,13 @@ "import numpy as np\n", "import pandas as pd\n", "\n", - "np.random.seed(0)\n", + "rng = np.random.default_rng(seed=0)\n", "\n", "num_rows = 25_000_000\n", "num_columns = 12\n", "\n", "# Create a DataFrame with random data\n", - "df = pd.DataFrame(np.random.randint(0, 100, size=(num_rows, num_columns)),\n", + "df = pd.DataFrame(rng.integers(0, 100, size=(num_rows, num_columns)),\n", " columns=[f'Column_{i}' for i in range(1, num_columns + 1)])" ] }, diff --git a/python/cudf/cudf_pandas_tests/pytest.ini b/python/cudf/cudf_pandas_tests/pytest.ini new file mode 100644 index 00000000000..46e2448ea24 --- /dev/null +++ b/python/cudf/cudf_pandas_tests/pytest.ini @@ -0,0 +1,9 @@ +# Copyright (c) 2024, NVIDIA CORPORATION.
+ +# Note, this config file overrides the default "cudf" test config in +# ../pyproject.toml. We do so deliberately because we have different +# treatment of markers and warnings +[pytest] +addopts = --tb=native --strict-config --strict-markers +empty_parameter_set_mark = fail_at_collect +xfail_strict = true diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py index 2bbed40e34e..4473a0e6f12 100644 --- a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py +++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py @@ -3,6 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 import collections +import contextlib import copy import datetime import operator @@ -11,6 +12,7 @@ import pickle import subprocess import tempfile +import time import types from io import BytesIO, StringIO @@ -21,13 +23,24 @@ import pyarrow as pa import pytest from nbconvert.preprocessors import ExecutePreprocessor -from numba import NumbaDeprecationWarning, vectorize +from numba import ( + NumbaDeprecationWarning, + __version__ as numba_version, + vectorize, +) +from packaging import version from pytz import utc -from cudf.core._compat import PANDAS_GE_220 +from rmm import RMMError + +from cudf.core._compat import PANDAS_GE_210, PANDAS_GE_220, PANDAS_VERSION from cudf.pandas import LOADED, Profiler from cudf.pandas.fast_slow_proxy import ( - ProxyFallbackError, + AttributeFallbackError, + FallbackError, + NotImplementedFallbackError, + OOMFallbackError, + TypeFallbackError, _Unusable, is_proxy_object, ) @@ -52,8 +65,6 @@ get_calendar, ) -from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION - # Accelerated pandas has the real pandas and cudf modules as attributes pd = xpd._fsproxy_slow cudf = xpd._fsproxy_fast @@ -622,10 +633,6 @@ def test_array_function_series_fallback(series): tm.assert_equal(expect, got) -@pytest.mark.xfail( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Fails in older versions of pandas", -) def test_timedeltaproperties(series): psr, sr = series psr, sr = psr.astype("timedelta64[ns]"), sr.astype("timedelta64[ns]") @@ -685,10 +692,6 @@ def test_maintain_container_subclasses(multiindex): assert isinstance(got, xpd.core.indexes.frozen.FrozenList) -@pytest.mark.xfail( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Fails in older versions of pandas due to unsupported boxcar window type", -) def test_rolling_win_type(): pdf = pd.DataFrame(range(5)) df = xpd.DataFrame(range(5)) @@ -697,8 +700,14 @@ def test_rolling_win_type(): tm.assert_equal(result, expected) -@pytest.mark.skip( - reason="Requires Numba 0.59 to fix segfaults on ARM. See https://github.com/numba/llvmlite/pull/1009" +@pytest.mark.skipif( + version.parse(numba_version) < version.parse("0.59"), + reason="Requires Numba 0.59 to fix segfaults on ARM. 
See https://github.com/numba/llvmlite/pull/1009", +) +@pytest.mark.xfail( + version.parse(numba_version) >= version.parse("0.59") + and PANDAS_VERSION < version.parse("2.1"), + reason="numba.generated_jit removed in 0.59, requires pandas >= 2.1", ) def test_rolling_apply_numba_engine(): def weighted_mean(x): @@ -709,7 +718,12 @@ def weighted_mean(x): pdf = pd.DataFrame([[1, 2, 0.6], [2, 3, 0.4], [3, 4, 0.2], [4, 5, 0.7]]) df = xpd.DataFrame([[1, 2, 0.6], [2, 3, 0.4], [3, 4, 0.2], [4, 5, 0.7]]) - with pytest.warns(NumbaDeprecationWarning): + ctx = ( + contextlib.nullcontext() + if PANDAS_GE_210 + else pytest.warns(NumbaDeprecationWarning) + ) + with ctx: expect = pdf.rolling(2, method="table", min_periods=0).apply( weighted_mean, raw=True, engine="numba" ) @@ -1135,8 +1149,8 @@ def test_private_method_result_wrapped(): def test_numpy_var(): - np.random.seed(42) - data = np.random.rand(1000) + rng = np.random.default_rng(seed=42) + data = rng.random(1000) psr = pd.Series(data) sr = xpd.Series(data) @@ -1305,7 +1319,7 @@ def max_times_two(self): @pytest.mark.xfail( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + PANDAS_VERSION < version.parse("2.1"), reason="DatetimeArray.__floordiv__ missing in pandas-2.0.0", ) def test_floordiv_array_vs_df(): @@ -1580,7 +1594,7 @@ def test_numpy_cupy_flatiter(series): @pytest.mark.xfail( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + PANDAS_VERSION < version.parse("2.1"), reason="pyarrow_numpy storage type was not supported in pandas-2.0.0", ) def test_arrow_string_arrays(): @@ -1750,5 +1764,124 @@ def add_one_ufunc(a): def test_fallback_raises_error(monkeypatch): with monkeypatch.context() as monkeycontext: monkeycontext.setenv("CUDF_PANDAS_FAIL_ON_FALLBACK", "True") - with pytest.raises(ProxyFallbackError): + with pytest.raises(FallbackError): pd.Series(range(2)).astype(object) + + +def mock_mean_memory_error(self, *args, **kwargs): + raise MemoryError() + + +def mock_mean_rmm_error(self, *args, **kwargs): + raise RMMError(1, "error") + + +def mock_mean_not_impl_error(self, *args, **kwargs): + raise NotImplementedError() + + +def mock_mean_attr_error(self, *args, **kwargs): + raise AttributeError() + + +def mock_mean_type_error(self, *args, **kwargs): + raise TypeError() + + +@pytest.mark.parametrize( + "mock_mean, err", + [ + ( + mock_mean_memory_error, + OOMFallbackError, + ), + ( + mock_mean_rmm_error, + OOMFallbackError, + ), + ( + mock_mean_not_impl_error, + NotImplementedFallbackError, + ), + ( + mock_mean_attr_error, + AttributeFallbackError, + ), + ( + mock_mean_type_error, + TypeFallbackError, + ), + ], +) +def test_fallback_raises_specific_error( + monkeypatch, + mock_mean, + err, +): + with monkeypatch.context() as monkeycontext: + monkeypatch.setattr(xpd.Series.mean, "_fsproxy_fast", mock_mean) + monkeycontext.setenv("CUDF_PANDAS_FAIL_ON_FALLBACK", "True") + s = xpd.Series([1, 2]) + with pytest.raises(err, match="Falling back to the slow path"): + assert s.mean() == 1.5 + + # Must explicitly undo the patch. Proxy dispatch doesn't work with monkeypatch contexts. 
+ monkeypatch.setattr(xpd.Series.mean, "_fsproxy_fast", cudf.Series.mean) + + +@pytest.mark.parametrize( + "attrs", + [ + "_exceptions", + "version", + "_print_versions", + "capitalize_first_letter", + "_validators", + "_decorators", + ], +) +def test_cudf_pandas_util_version(attrs): + if not PANDAS_GE_220 and attrs == "capitalize_first_letter": + assert not hasattr(pd.util, attrs) + else: + assert hasattr(pd.util, attrs) + + +def test_iteration_over_dataframe_dtypes_produces_proxy_objects(dataframe): + _, xdf = dataframe + xdf["b"] = xpd.IntervalIndex.from_arrays(xdf["a"], xdf["b"]) + xdf["a"] = xpd.Series([1, 1, 1, 2, 3], dtype="category") + dtype_series = xdf.dtypes + assert all(is_proxy_object(x) for x in dtype_series) + assert isinstance(dtype_series.iloc[0], xpd.CategoricalDtype) + assert isinstance(dtype_series.iloc[1], xpd.IntervalDtype) + + +def test_iter_doesnot_raise(monkeypatch): + s = xpd.Series([1, 2, 3]) + with monkeypatch.context() as monkeycontext: + monkeycontext.setenv("CUDF_PANDAS_FAIL_ON_FALLBACK", "True") + for _ in s: + pass + + +def test_dataframe_setitem_slowdown(): + # We are explicitly testing the slowdown of the setitem operation + df = xpd.DataFrame( + {"a": [1, 2, 3] * 100000, "b": [1, 2, 3] * 100000} + ).astype("float64") + df = xpd.DataFrame({"a": df["a"].repeat(1000), "b": df["b"].repeat(1000)}) + new_df = df + 1 + start_time = time.time() + df[df.columns] = new_df + end_time = time.time() + delta = int(end_time - start_time) + if delta > 5: + pytest.fail(f"Test took too long to run, runtime: {delta}") + + +def test_dataframe_setitem(): + df = xpd.DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]}).astype("float64") + new_df = df + 1 + df[df.columns] = new_df + tm.assert_equal(df, new_df) diff --git a/python/cudf/cudf_pandas_tests/test_fast_slow_proxy.py b/python/cudf/cudf_pandas_tests/test_fast_slow_proxy.py index a75a20a4681..63fd9601fc1 100644 --- a/python/cudf/cudf_pandas_tests/test_fast_slow_proxy.py +++ b/python/cudf/cudf_pandas_tests/test_fast_slow_proxy.py @@ -387,7 +387,8 @@ def test_dir_bound_method( ): """This test will fail because dir for bound methods is currently incorrect, but we have no way to fix it without materializing the slow - type, which is unnecessarily expensive.""" + type, which is unnecessarily expensive. 
+ """ Fast, FastIntermediate = fast_and_intermediate_with_doc Slow, SlowIntermediate = slow_and_intermediate_with_doc diff --git a/python/cudf/cudf_pandas_tests/test_profiler.py b/python/cudf/cudf_pandas_tests/test_profiler.py index 5b7bde06d1d..a5c29bd93a2 100644 --- a/python/cudf/cudf_pandas_tests/test_profiler.py +++ b/python/cudf/cudf_pandas_tests/test_profiler.py @@ -23,12 +23,12 @@ reason="function names change across versions of pandas, so making sure it only runs on latest version of pandas", ) def test_profiler(): - np.random.seed(42) + rng = np.random.default_rng(seed=42) with Profiler() as profiler: df = pd.DataFrame( { - "idx": np.random.randint(0, 10, 1000), - "data": np.random.rand(1000), + "idx": rng.integers(0, 10, 1000), + "data": rng.random(1000), } ) sums = df.groupby("idx").sum() @@ -58,7 +58,7 @@ def test_profiler(): calls = [ "pd.DataFrame", "", - "np.random.randint", + "rng.integers", "np.random.rand", 'df.groupby("idx").sum', 'df.sum()["data"]', diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml b/python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml index e4030ecf43d..2c7330d5ee6 100644 --- a/python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml +++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml @@ -76,6 +76,13 @@ files: - py_version - test_base - test_xgboost + test_catboost: + output: none + includes: + - cuda_version + - py_version + - test_base + - test_catboost test_cuml: output: none includes: @@ -182,7 +189,7 @@ dependencies: common: - output_types: conda packages: - - cudf==24.10.* + - cudf==24.12.*,>=0.0.0a0 - pandas - pytest - pytest-xdist @@ -244,17 +251,25 @@ dependencies: - pip - pip: - xgboost>=2.0.1 + test_catboost: + common: + - output_types: conda + packages: + - numpy + - scipy + - scikit-learn + - catboost test_cuml: common: - output_types: conda packages: - - cuml==24.10.* + - cuml==24.12.*,>=0.0.0a0 - scikit-learn test_cugraph: common: - output_types: conda packages: - - cugraph==24.10.* + - cugraph==24.12.*,>=0.0.0a0 - networkx test_ibis: common: diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_catboost.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_catboost.py new file mode 100644 index 00000000000..04cc69231fe --- /dev/null +++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_catboost.py @@ -0,0 +1,129 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +import numpy as np +import pandas as pd +import pytest +from catboost import CatBoostClassifier, CatBoostRegressor, Pool +from sklearn.datasets import make_classification, make_regression + +rng = np.random.default_rng(seed=42) + + +def assert_catboost_equal(expect, got, rtol=1e-7, atol=0.0): + if isinstance(expect, (tuple, list)): + assert len(expect) == len(got) + for e, g in zip(expect, got): + assert_catboost_equal(e, g, rtol, atol) + elif isinstance(expect, np.ndarray): + np.testing.assert_allclose(expect, got, rtol=rtol, atol=atol) + elif isinstance(expect, pd.DataFrame): + pd.testing.assert_frame_equal(expect, got) + elif isinstance(expect, pd.Series): + pd.testing.assert_series_equal(expect, got) + else: + assert expect == got + + +pytestmark = pytest.mark.assert_eq(fn=assert_catboost_equal) + + +@pytest.fixture +def regression_data(): + X, y = make_regression(n_samples=100, n_features=10, random_state=42) + return pd.DataFrame(X), pd.Series(y) + + +@pytest.fixture +def classification_data(): + X, y = make_classification( + n_samples=100, n_features=10, n_classes=2, random_state=42 + ) + return pd.DataFrame(X), pd.Series(y) + + +def test_catboost_regressor_with_dataframe(regression_data): + X, y = regression_data + model = CatBoostRegressor(iterations=10, verbose=0) + model.fit(X, y) + predictions = model.predict(X) + return predictions + + +def test_catboost_regressor_with_numpy(regression_data): + X, y = regression_data + model = CatBoostRegressor(iterations=10, verbose=0) + model.fit(X.values, y.values) + predictions = model.predict(X.values) + return predictions + + +def test_catboost_classifier_with_dataframe(classification_data): + X, y = classification_data + model = CatBoostClassifier(iterations=10, verbose=0) + model.fit(X, y) + predictions = model.predict(X) + return predictions + + +def test_catboost_classifier_with_numpy(classification_data): + X, y = classification_data + model = CatBoostClassifier(iterations=10, verbose=0) + model.fit(X.values, y.values) + predictions = model.predict(X.values) + return predictions + + +def test_catboost_with_pool_and_dataframe(regression_data): + X, y = regression_data + train_pool = Pool(X, y) + model = CatBoostRegressor(iterations=10, verbose=0) + model.fit(train_pool) + predictions = model.predict(X) + return predictions + + +def test_catboost_with_pool_and_numpy(regression_data): + X, y = regression_data + train_pool = Pool(X.values, y.values) + model = CatBoostRegressor(iterations=10, verbose=0) + model.fit(train_pool) + predictions = model.predict(X.values) + return predictions + + +def test_catboost_with_categorical_features(): + data = { + "numerical_feature": rng.standard_normal(100), + "categorical_feature": rng.choice(["A", "B", "C"], size=100), + "target": rng.integers(0, 2, size=100), + } + df = pd.DataFrame(data) + X = df[["numerical_feature", "categorical_feature"]] + y = df["target"] + cat_features = ["categorical_feature"] + model = CatBoostClassifier( + iterations=10, verbose=0, cat_features=cat_features + ) + model.fit(X, y) + predictions = model.predict(X) + return predictions + + +@pytest.mark.parametrize( + "X, y", + [ + ( + pd.DataFrame(rng.standard_normal((100, 5))), + pd.Series(rng.standard_normal(100)), + ), + (rng.standard_normal((100, 5)), rng.standard_normal(100)), + ], +) +def test_catboost_train_test_split(X, y): + from sklearn.model_selection import train_test_split + + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) + model = CatBoostRegressor(iterations=10, verbose=0) + 
model.fit(X_train, y_train) + predictions = model.predict(X_test) + return len(X_train), len(X_test), len(y_train), len(y_test), predictions diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_cuml.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_cuml.py index 892d0886596..27eaff87ba0 100644 --- a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_cuml.py +++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_cuml.py @@ -102,7 +102,7 @@ def test_random_forest(binary_classification_data): def test_clustering(): rng = np.random.default_rng(42) nsamps = 300 - X = rng.random((nsamps, 2)) + X = rng.random(size=(nsamps, 2)) data = pd.DataFrame(X, columns=["x", "y"]) kmeans = KMeans(n_clusters=3, random_state=42) diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_stumpy_distributed.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_stumpy_distributed.py index 37e3cc34856..0777d982ac2 100644 --- a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_stumpy_distributed.py +++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_stumpy_distributed.py @@ -31,17 +31,17 @@ def dask_client(): def test_1d_distributed(dask_client): - np.random.seed(42) - ts = pd.Series(np.random.rand(100)) + rng = np.random.default_rng(seed=42) + ts = pd.Series(rng.random(100)) m = 10 return stumpy.stumped(dask_client, ts, m) def test_multidimensional_distributed_timeseries(dask_client): - np.random.seed(42) + rng = np.random.default_rng(seed=42) # Each row represents data from a different dimension while each column represents # data from the same dimension - your_time_series = np.random.rand(3, 1000) + your_time_series = rng.random(size=(3, 1000)) # Approximately, how many data points might be found in a pattern window_size = 50 diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index 442922eb895..280dd52bb22 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -20,20 +20,21 @@ requires-python = ">=3.10" dependencies = [ "cachetools", "cubinlinker", - "cuda-python>=11.7.1,<12.0a0", + "cuda-python>=11.7.1,<12.0a0,<=11.8.3", "cupy-cuda11x>=12.0.0", "fsspec>=0.6.0", - "libcudf==24.10.*", - "numba>=0.57", + "libcudf==24.12.*,>=0.0.0a0", + "numba-cuda>=0.0.13,<0.0.18", "numpy>=1.23,<3.0a0", "nvtx>=0.2.1", "packaging", - "pandas>=2.0,<2.2.3dev0", + "pandas>=2.0,<2.2.4dev0", "ptxcompiler", - "pyarrow>=14.0.0,<18.0.0a0", - "pylibcudf==24.10.*", + "pyarrow>=14.0.0,<19.0.0a0,!=17.0.0; platform_machine=='aarch64'", + "pyarrow>=14.0.0,<19.0.0a0; platform_machine=='x86_64'", + "pylibcudf==24.12.*,>=0.0.0a0", "rich", - "rmm==24.10.*", + "rmm==24.12.*,>=0.0.0a0", "typing_extensions>=4.0.0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ @@ -52,6 +53,7 @@ test = [ "cramjam", "fastavro>=0.22.9", "hypothesis", + "mmh3", "msgpack", "pytest-benchmark", "pytest-cases>=3.8.2", @@ -62,6 +64,7 @@ test = [ "tokenizers==0.15.2", "transformers==4.39.3", "tzdata", + "xxhash", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
pandas-tests = [ "ipython", @@ -80,49 +83,36 @@ cudf-pandas-tests = [ Homepage = "https://github.com/rapidsai/cudf" Documentation = "https://docs.rapids.ai/api/cudf/stable/" -[tool.isort] -line_length = 79 -multi_line_output = 3 -include_trailing_comma = true -force_grid_wrap = 0 -combine_as_imports = true -order_by_type = true -known_dask = [ - "dask", - "distributed", - "dask_cuda", +[tool.pydistcheck] +select = [ + "distro-too-large-compressed", ] -known_rapids = [ - "rmm", - "pylibcudf" -] -known_first_party = [ - "cudf", -] -default_section = "THIRDPARTY" -sections = [ - "FUTURE", - "STDLIB", - "THIRDPARTY", - "DASK", - "RAPIDS", - "FIRSTPARTY", - "LOCALFOLDER", + +# PyPI limit is 100 MiB, fail CI before we get too close to that +max_allowed_size_compressed = '75M' + +[tool.pytest.ini_options] +addopts = "--tb=native --strict-config --strict-markers" +empty_parameter_set_mark = "fail_at_collect" +filterwarnings = [ + "error", + "ignore:::.*xdist.*", + "ignore:::.*pytest.*", + # https://github.com/rapidsai/build-planning/issues/116 + "ignore:.*cuda..* module is deprecated.*:DeprecationWarning", + # some third-party dependencies (e.g. 'boto3') still using datetime.datetime.utcnow() + "ignore:.*datetime.*utcnow.*scheduled for removal.*:DeprecationWarning:botocore", + # Deprecation warning from Pyarrow Table.to_pandas() with pandas-2.2+ + "ignore:Passing a BlockManager to DataFrame is deprecated:DeprecationWarning", + # PerformanceWarning from cupy warming up the JIT cache + "ignore:Jitify is performing a one-time only warm-up to populate the persistent cache:cupy._util.PerformanceWarning", + # Ignore numba PEP 456 warning specific to arm machines + "ignore:FNV hashing is not implemented in Numba.*:UserWarning" ] -skip = [ - "thirdparty", - ".eggs", - ".git", - ".hg", - ".mypy_cache", - ".tox", - ".venv", - "_build", - "buck-out", - "build", - "dist", - "__init__.py", +markers = [ + "spilling: mark benchmark a good candidate to run with `CUDF_SPILL=ON`" ] +xfail_strict = true [tool.rapids-build-backend] build-backend = "scikit_build_core.build" @@ -131,11 +121,11 @@ matrix-entry = "cuda_suffixed=true" requires = [ "cmake>=3.26.4,!=3.30.0", "cython>=3.0.3", - "libcudf==24.10.*", - "librmm==24.10.*", + "libcudf==24.12.*,>=0.0.0a0", + "librmm==24.12.*,>=0.0.0a0", "ninja", - "pylibcudf==24.10.*", - "rmm==24.10.*", + "pylibcudf==24.12.*,>=0.0.0a0", + "rmm==24.12.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
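# A hedged illustration of the pin style adopted in the dependency lists above:
# appending ">=0.0.0a0" to an "==24.12.*" pin lets resolvers consider
# pre-release (nightly) builds that the bare wildcard pin would exclude under
# default pre-release handling. Demonstrated with the `packaging` library:
from packaging.specifiers import SpecifierSet

spec = SpecifierSet("==24.12.*,>=0.0.0a0")
assert "24.12.0a123" in spec   # nightly alpha builds now match
assert "24.12.1" in spec       # final releases still match
assert "24.10.0" not in spec   # other release branches remain excluded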
[tool.scikit-build] @@ -152,3 +142,18 @@ wheel.packages = ["cudf"] provider = "scikit_build_core.metadata.regex" input = "cudf/VERSION" regex = "(?P.*)" + +[tool.ruff] +extend = "../../pyproject.toml" + +[tool.ruff.lint.isort] +combine-as-imports = true +known-first-party = ["cudf"] +section-order = ["future", "standard-library", "third-party", "dask", "rapids", "first-party", "local-folder"] + +[tool.ruff.lint.isort.sections] +dask = ["dask", "distributed", "dask_cuda"] +rapids = ["rmm", "pylibcudf"] + +[tool.ruff.lint.per-file-ignores] +"__init__.py" = ["E402", "F401"] diff --git a/python/cudf_kafka/pyproject.toml b/python/cudf_kafka/pyproject.toml index f0e4ca6eb62..b2ea3f06e48 100644 --- a/python/cudf_kafka/pyproject.toml +++ b/python/cudf_kafka/pyproject.toml @@ -18,7 +18,7 @@ authors = [ license = { text = "Apache 2.0" } requires-python = ">=3.10" dependencies = [ - "cudf==24.10.*", + "cudf==24.12.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. [project.optional-dependencies] @@ -32,56 +32,38 @@ test = [ Homepage = "https://github.com/rapidsai/cudf" Documentation = "https://docs.rapids.ai/api/cudf/stable/" -[tool.isort] -line_length = 79 -multi_line_output = 3 -include_trailing_comma = true -force_grid_wrap = 0 -combine_as_imports = true -order_by_type = true -known_dask = [ - "dask", - "distributed", - "dask_cuda", - "streamz", -] -known_rapids = [ - "rmm", - "cudf", - "dask_cudf", -] -known_first_party = [ - "cudf_kafka", -] -default_section = "THIRDPARTY" -sections = [ - "FUTURE", - "STDLIB", - "THIRDPARTY", - "DASK", - "RAPIDS", - "FIRSTPARTY", - "LOCALFOLDER", -] -skip = [ - "thirdparty", - ".eggs", - ".git", - ".hg", - ".mypy_cache", - ".tox", - ".venv", - "_build", - "buck-out", - "build", - "dist", - "__init__.py", +[tool.ruff] +extend = "../../pyproject.toml" + +[tool.ruff.lint.isort] +combine-as-imports = true +known-first-party = ["cudf_kafka"] +section-order = ["future", "standard-library", "third-party", "dask", "rapids", "first-party", "local-folder"] + +[tool.ruff.lint.isort.sections] +dask = ["dask", "distributed", "dask_cuda", "streamz"] +rapids = ["rmm", "cudf", "dask_cudf"] + +[tool.ruff.lint.per-file-ignores] +"__init__.py" = ["E402", "F401"] + +[tool.pydistcheck] +select = [ + "distro-too-large-compressed", ] +# PyPI limit is 100 MiB, fail CI before we get too close to that +max_allowed_size_compressed = '75M' + [tool.pytest.ini_options] +addopts = "--tb=native --strict-config --strict-markers" +empty_parameter_set_mark = "fail_at_collect" filterwarnings = [ - "error" + "error", + # https://github.com/rapidsai/build-planning/issues/116 + "ignore:.*cuda..* module is deprecated.*:DeprecationWarning", ] +xfail_strict = true [tool.scikit-build] build-dir = "build/{wheel_tag}" diff --git a/python/cudf_polars/cudf_polars/__init__.py b/python/cudf_polars/cudf_polars/__init__.py index 66c15f694ee..ba4858c5619 100644 --- a/python/cudf_polars/cudf_polars/__init__.py +++ b/python/cudf_polars/cudf_polars/__init__.py @@ -12,7 +12,7 @@ from cudf_polars._version import __git_commit__, __version__ from cudf_polars.callback import execute_with_cudf -from cudf_polars.dsl.translate import translate_ir +from cudf_polars.dsl.translate import Translator # Check we have a supported polars version from cudf_polars.utils.versions import _ensure_polars_version @@ -22,7 +22,7 @@ __all__: list[str] = [ "execute_with_cudf", - "translate_ir", + "Translator", 
"__git_commit__", "__version__", ] diff --git a/python/cudf_polars/cudf_polars/callback.py b/python/cudf_polars/cudf_polars/callback.py index 76816ee0a61..8dc5715195d 100644 --- a/python/cudf_polars/cudf_polars/callback.py +++ b/python/cudf_polars/cudf_polars/callback.py @@ -15,10 +15,11 @@ from polars.exceptions import ComputeError, PerformanceWarning +import pylibcudf import rmm from rmm._cuda import gpu -from cudf_polars.dsl.translate import translate_ir +from cudf_polars.dsl.translate import Translator if TYPE_CHECKING: from collections.abc import Generator @@ -32,8 +33,26 @@ __all__: list[str] = ["execute_with_cudf"] +_SUPPORTED_PREFETCHES = { + "column_view::get_data", + "mutable_column_view::get_data", + "gather", + "hash_join", +} + + +def _env_get_int(name, default): + try: + return int(os.getenv(name, default)) + except (ValueError, TypeError): # pragma: no cover + return default # pragma: no cover + + @cache -def default_memory_resource(device: int) -> rmm.mr.DeviceMemoryResource: +def default_memory_resource( + device: int, + cuda_managed_memory: bool, # noqa: FBT001 +) -> rmm.mr.DeviceMemoryResource: """ Return the default memory resource for cudf-polars. @@ -42,15 +61,35 @@ def default_memory_resource(device: int) -> rmm.mr.DeviceMemoryResource: device Disambiguating device id when selecting the device. Must be the active device when this function is called. + cuda_managed_memory + Whether to use managed memory or not. Returns ------- rmm.mr.DeviceMemoryResource The default memory resource that cudf-polars uses. Currently - an async pool resource. + a managed memory resource, if `cuda_managed_memory` is `True`. + else, an async pool resource is returned. """ try: - return rmm.mr.CudaAsyncMemoryResource() + if ( + cuda_managed_memory + and pylibcudf.utils._is_concurrent_managed_access_supported() + ): + # Allocating 80% of the available memory for the pool. + # Leaving a 20% headroom to avoid OOM errors. + free_memory, _ = rmm.mr.available_device_memory() + free_memory = int(round(float(free_memory) * 0.80 / 256) * 256) + for key in _SUPPORTED_PREFETCHES: + pylibcudf.experimental.enable_prefetching(key) + mr = rmm.mr.PrefetchResourceAdaptor( + rmm.mr.PoolMemoryResource( + rmm.mr.ManagedMemoryResource(), + initial_pool_size=free_memory, + ) + ) + else: + mr = rmm.mr.CudaAsyncMemoryResource() except RuntimeError as e: # pragma: no cover msg, *_ = e.args if ( @@ -64,6 +103,8 @@ def default_memory_resource(device: int) -> rmm.mr.DeviceMemoryResource: ) from None else: raise + else: + return mr @contextlib.contextmanager @@ -89,10 +130,15 @@ def set_memory_resource( at entry. If a memory resource is provided, it must be valid to use with the currently active device. """ + previous = rmm.mr.get_current_device_resource() if mr is None: device: int = gpu.getDevice() - mr = default_memory_resource(device) - previous = rmm.mr.get_current_device_resource() + mr = default_memory_resource( + device=device, + cuda_managed_memory=bool( + _env_get_int("POLARS_GPU_ENABLE_CUDA_MANAGED_MEMORY", default=1) != 0 + ), + ) rmm.mr.set_current_device_resource(mr) try: yield mr @@ -148,12 +194,30 @@ def _callback( return ir.evaluate(cache={}).to_polars() -def execute_with_cudf( - nt: NodeTraverser, - *, - config: GPUEngine, - exception: type[Exception] | tuple[type[Exception], ...] = Exception, -) -> None: +def validate_config_options(config: dict) -> None: + """ + Validate the configuration options for the GPU engine. + + Parameters + ---------- + config + Configuration options to validate. 
+ + Raises + ------ + ValueError + If the configuration contains unsupported options. + """ + if unsupported := (config.keys() - {"raise_on_fail", "parquet_options"}): + raise ValueError( + f"Engine configuration contains unsupported settings: {unsupported}" + ) + assert {"chunked", "chunk_read_limit", "pass_read_limit"}.issuperset( + config.get("parquet_options", {}) + ) + + +def execute_with_cudf(nt: NodeTraverser, *, config: GPUEngine) -> None: """ A post optimization callback that attempts to execute the plan with cudf. @@ -165,35 +229,48 @@ def execute_with_cudf( config GPUEngine configuration object - exception - Optional exception, or tuple of exceptions, to catch during - translation. Defaults to ``Exception``. + Raises + ------ + ValueError + If the config contains unsupported keys. + NotImplementedError + If translation of the plan is unsupported. + Notes + ----- The NodeTraverser is mutated if the libcudf executor can handle the plan. """ device = config.device memory_resource = config.memory_resource raise_on_fail = config.config.get("raise_on_fail", False) - if unsupported := (config.config.keys() - {"raise_on_fail"}): - raise ValueError( - f"Engine configuration contains unsupported settings {unsupported}" - ) - try: - with nvtx.annotate(message="ConvertIR", domain="cudf_polars"): + validate_config_options(config.config) + + with nvtx.annotate(message="ConvertIR", domain="cudf_polars"): + translator = Translator(nt, config) + ir = translator.translate_ir() + ir_translation_errors = translator.errors + if len(ir_translation_errors): + # TODO: Display these errors in user-friendly way. + # tracked in https://github.com/rapidsai/cudf/issues/17051 + unique_errors = sorted(set(ir_translation_errors), key=str) + formatted_errors = "\n".join( + f"- {e.__class__.__name__}: {e}" for e in unique_errors + ) + error_message = ( + "Query execution with GPU not possible: unsupported operations." 
+ f"\nThe errors were:\n{formatted_errors}" + ) + exception = NotImplementedError(error_message, unique_errors) + if bool(int(os.environ.get("POLARS_VERBOSE", 0))): + warnings.warn(error_message, PerformanceWarning, stacklevel=2) + if raise_on_fail: + raise exception + else: nt.set_udf( partial( _callback, - translate_ir(nt), + ir, device=device, memory_resource=memory_resource, ) ) - except exception as e: - if bool(int(os.environ.get("POLARS_VERBOSE", 0))): - warnings.warn( - f"Query execution with GPU not supported, reason: {type(e)}: {e}", - PerformanceWarning, - stacklevel=2, - ) - if raise_on_fail: - raise diff --git a/python/cudf_polars/cudf_polars/containers/__init__.py b/python/cudf_polars/cudf_polars/containers/__init__.py index 06bb08953f1..3b1eff4a0d0 100644 --- a/python/cudf_polars/cudf_polars/containers/__init__.py +++ b/python/cudf_polars/cudf_polars/containers/__init__.py @@ -5,7 +5,7 @@ from __future__ import annotations -__all__: list[str] = ["DataFrame", "Column", "NamedColumn"] +__all__: list[str] = ["DataFrame", "Column"] -from cudf_polars.containers.column import Column, NamedColumn +from cudf_polars.containers.column import Column from cudf_polars.containers.dataframe import DataFrame diff --git a/python/cudf_polars/cudf_polars/containers/column.py b/python/cudf_polars/cudf_polars/containers/column.py index 3fe3e5557cb..93d95346a37 100644 --- a/python/cudf_polars/cudf_polars/containers/column.py +++ b/python/cudf_polars/cudf_polars/containers/column.py @@ -8,14 +8,25 @@ import functools from typing import TYPE_CHECKING +from polars.exceptions import InvalidOperationError + import pylibcudf as plc +from pylibcudf.strings.convert.convert_floats import from_floats, is_float, to_floats +from pylibcudf.strings.convert.convert_integers import ( + from_integers, + is_integer, + to_integers, +) +from pylibcudf.traits import is_floating_point + +from cudf_polars.utils.dtypes import is_order_preserving_cast if TYPE_CHECKING: from typing_extensions import Self import polars as pl -__all__: list[str] = ["Column", "NamedColumn"] +__all__: list[str] = ["Column"] class Column: @@ -26,6 +37,9 @@ class Column: order: plc.types.Order null_order: plc.types.NullOrder is_scalar: bool + # Optional name, only ever set by evaluation of NamedExpr nodes + # The internal evaluation should not care about the name. + name: str | None def __init__( self, @@ -34,14 +48,12 @@ def __init__( is_sorted: plc.types.Sorted = plc.types.Sorted.NO, order: plc.types.Order = plc.types.Order.ASCENDING, null_order: plc.types.NullOrder = plc.types.NullOrder.BEFORE, + name: str | None = None, ): self.obj = column self.is_scalar = self.obj.size() == 1 - if self.obj.size() <= 1: - is_sorted = plc.types.Sorted.YES - self.is_sorted = is_sorted - self.order = order - self.null_order = null_order + self.name = name + self.set_sorted(is_sorted=is_sorted, order=order, null_order=null_order) @functools.cached_property def obj_scalar(self) -> plc.Scalar: @@ -63,9 +75,26 @@ def obj_scalar(self) -> plc.Scalar: ) return plc.copying.get_element(self.obj, 0) + def rename(self, name: str | None, /) -> Self: + """ + Return a shallow copy with a new name. + + Parameters + ---------- + name + New name + + Returns + ------- + Shallow copy of self with new name set. + """ + new = self.copy() + new.name = name + return new + def sorted_like(self, like: Column, /) -> Self: """ - Copy sortedness properties from a column onto self. + Return a shallow copy with sortedness from like. 
Parameters ---------- @@ -74,20 +103,23 @@ def sorted_like(self, like: Column, /) -> Self: Returns ------- - Self with metadata set. + Shallow copy of self with metadata set. See Also -------- set_sorted, copy_metadata """ - return self.set_sorted( - is_sorted=like.is_sorted, order=like.order, null_order=like.null_order + return type(self)( + self.obj, + name=self.name, + is_sorted=like.is_sorted, + order=like.order, + null_order=like.null_order, ) - # TODO: Return Column once #16272 is fixed. - def astype(self, dtype: plc.DataType) -> plc.Column: + def astype(self, dtype: plc.DataType) -> Column: """ - Return the backing column as the requested dtype. + Cast the column to the requested dtype. Parameters ---------- @@ -108,9 +140,46 @@ def astype(self, dtype: plc.DataType) -> plc.Column: This only produces a copy if the requested dtype doesn't match the current one. """ - if self.obj.type() != dtype: - return plc.unary.cast(self.obj, dtype) - return self.obj + if self.obj.type() == dtype: + return self + + if dtype.id() == plc.TypeId.STRING or self.obj.type().id() == plc.TypeId.STRING: + return Column(self._handle_string_cast(dtype)) + else: + result = Column(plc.unary.cast(self.obj, dtype)) + if is_order_preserving_cast(self.obj.type(), dtype): + return result.sorted_like(self) + return result + + def _handle_string_cast(self, dtype: plc.DataType) -> plc.Column: + if dtype.id() == plc.TypeId.STRING: + if is_floating_point(self.obj.type()): + return from_floats(self.obj) + else: + return from_integers(self.obj) + else: + # Casting str -> numeric requires every row to parse as the target + # type; otherwise raise InvalidOperationError (matching polars). + if is_floating_point(dtype): + floats = is_float(self.obj) + if not plc.interop.to_arrow( + plc.reduce.reduce( + floats, + plc.aggregation.all(), + plc.DataType(plc.TypeId.BOOL8), + ) + ).as_py(): + raise InvalidOperationError("Conversion from `str` failed.") + return to_floats(self.obj, dtype) + else: + integers = is_integer(self.obj) + if not plc.interop.to_arrow( + plc.reduce.reduce( + integers, + plc.aggregation.all(), + plc.DataType(plc.TypeId.BOOL8), + ) + ).as_py(): + raise InvalidOperationError("Conversion from `str` failed.") + return to_integers(self.obj, dtype) def copy_metadata(self, from_: pl.Series, /) -> Self: """ @@ -129,6 +198,7 @@ def copy_metadata(self, from_: pl.Series, /) -> Self: -------- set_sorted, sorted_like """ + self.name = from_.name if len(from_) <= 1: return self ascending = from_.flags["SORTED_ASC"] @@ -192,6 +262,7 @@ def copy(self) -> Self: is_sorted=self.is_sorted, order=self.order, null_order=self.null_order, + name=self.name, ) def mask_nans(self) -> Self: @@ -217,58 +288,3 @@ def nan_count(self) -> int: ) ).as_py() return 0 - - -class NamedColumn(Column): - """A column with a name.""" - - name: str - - def __init__( - self, - column: plc.Column, - name: str, - *, - is_sorted: plc.types.Sorted = plc.types.Sorted.NO, - order: plc.types.Order = plc.types.Order.ASCENDING, - null_order: plc.types.NullOrder = plc.types.NullOrder.BEFORE, - ) -> None: - super().__init__( - column, is_sorted=is_sorted, order=order, null_order=null_order ) - self.name = name - - def copy(self, *, new_name: str | None = None) -> Self: - """ - A shallow copy of the column. - - Parameters - ---------- - new_name - Optional new name for the copied column. - - Returns - ------- - New column sharing data with self.
- """ - return type(self)( - self.obj, - self.name if new_name is None else new_name, - is_sorted=self.is_sorted, - order=self.order, - null_order=self.null_order, - ) - - def mask_nans(self) -> Self: - """Return a shallow copy of self with nans masked out.""" - # Annoying, the inheritance is not right (can't call the - # super-type mask_nans), but will sort that by refactoring - # later. - if plc.traits.is_floating_point(self.obj.type()): - old_count = self.obj.null_count() - mask, new_count = plc.transform.nans_to_nulls(self.obj) - result = type(self)(self.obj.with_mask(mask, new_count), self.name) - if old_count == new_count: - return result.sorted_like(self) - return result - return self.copy() diff --git a/python/cudf_polars/cudf_polars/containers/dataframe.py b/python/cudf_polars/cudf_polars/containers/dataframe.py index f3e3862d0cc..36e0fbe370e 100644 --- a/python/cudf_polars/cudf_polars/containers/dataframe.py +++ b/python/cudf_polars/cudf_polars/containers/dataframe.py @@ -5,43 +5,52 @@ from __future__ import annotations -import itertools +import pickle from functools import cached_property -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Any, cast import pyarrow as pa -import pylibcudf as plc import polars as pl -from cudf_polars.containers.column import NamedColumn +import pylibcudf as plc + +from cudf_polars.containers import Column from cudf_polars.utils import dtypes if TYPE_CHECKING: - from collections.abc import Mapping, Sequence, Set + from collections.abc import Iterable, Mapping, Sequence, Set from typing_extensions import Self - from cudf_polars.containers import Column - __all__: list[str] = ["DataFrame"] +# Pacify the type checker. DataFrame init asserts that all the columns +# have a string name, so let's narrow the type. +class NamedColumn(Column): + name: str + + class DataFrame: """A representation of a dataframe.""" - columns: list[NamedColumn] + column_map: dict[str, Column] table: plc.Table + columns: list[NamedColumn] - def __init__(self, columns: Sequence[NamedColumn]) -> None: - self.columns = list(columns) - self._column_map = {c.name: c for c in self.columns} - self.table = plc.Table([c.obj for c in columns]) + def __init__(self, columns: Iterable[Column]) -> None: + columns = list(columns) + if any(c.name is None for c in columns): + raise ValueError("All columns must have a name") + self.columns = [cast(NamedColumn, c) for c in columns] + self.column_map = {c.name: c for c in self.columns} + self.table = plc.Table([c.obj for c in self.columns]) def copy(self) -> Self: """Return a shallow copy of self.""" - return type(self)([c.copy() for c in self.columns]) + return type(self)(c.copy() for c in self.columns) def to_polars(self) -> pl.DataFrame: """Convert to a polars DataFrame.""" @@ -51,42 +60,38 @@ def to_polars(self) -> pl.DataFrame: # https://github.com/pola-rs/polars/issues/11632 # To guarantee we produce correct names, we therefore # serialise with names we control and rename with that map. 
- name_map = {f"column_{i}": c.name for i, c in enumerate(self.columns)} - table: pa.Table = plc.interop.to_arrow( + name_map = {f"column_{i}": name for i, name in enumerate(self.column_map)} + table = plc.interop.to_arrow( self.table, [plc.interop.ColumnMetadata(name=name) for name in name_map], ) df: pl.DataFrame = pl.from_arrow(table) return df.rename(name_map).with_columns( - *( - pl.col(c.name).set_sorted( - descending=c.order == plc.types.Order.DESCENDING - ) - if c.is_sorted - else pl.col(c.name) - for c in self.columns - ) + pl.col(c.name).set_sorted(descending=c.order == plc.types.Order.DESCENDING) + if c.is_sorted + else pl.col(c.name) + for c in self.columns ) @cached_property def column_names_set(self) -> frozenset[str]: """Return the column names as a set.""" - return frozenset(c.name for c in self.columns) + return frozenset(self.column_map) @cached_property def column_names(self) -> list[str]: """Return a list of the column names.""" - return [c.name for c in self.columns] + return list(self.column_map) @cached_property def num_columns(self) -> int: """Number of columns.""" - return len(self.columns) + return len(self.column_map) @cached_property def num_rows(self) -> int: """Number of rows.""" - return 0 if len(self.columns) == 0 else self.table.num_rows() + return self.table.num_rows() if self.column_map else 0 @classmethod def from_polars(cls, df: pl.DataFrame) -> Self: @@ -111,12 +116,8 @@ def from_polars(cls, df: pl.DataFrame) -> Self: # No-op if the schema is unchanged. d_table = plc.interop.from_arrow(table.cast(schema)) return cls( - [ - NamedColumn(column, h_col.name).copy_metadata(h_col) - for column, h_col in zip( - d_table.columns(), df.iter_columns(), strict=True - ) - ] + Column(column).copy_metadata(h_col) + for column, h_col in zip(d_table.columns(), df.iter_columns(), strict=True) ) @classmethod @@ -144,17 +145,83 @@ def from_table(cls, table: plc.Table, names: Sequence[str]) -> Self: if table.num_columns() != len(names): raise ValueError("Mismatching name and table length.") return cls( - [ - NamedColumn(c, name) - for c, name in zip(table.columns(), names, strict=True) - ] + Column(c, name=name) for c, name in zip(table.columns(), names, strict=True) ) + @classmethod + def deserialize( + cls, header: Mapping[str, Any], frames: tuple[memoryview, plc.gpumemoryview] + ) -> Self: + """ + Create a DataFrame from a serialized representation returned by `.serialize()`. + + Parameters + ---------- + header + The (unpickled) metadata required to reconstruct the object. + frames + Two-tuple of frames (a memoryview and a gpumemoryview). + + Returns + ------- + DataFrame + The deserialized DataFrame. + """ + packed_metadata, packed_gpu_data = frames + table = plc.contiguous_split.unpack_from_memoryviews( + packed_metadata, packed_gpu_data + ) + return cls( + Column(c, **kw) + for c, kw in zip(table.columns(), header["columns_kwargs"], strict=True) + ) + + def serialize( + self, + ) -> tuple[Mapping[str, Any], tuple[memoryview, plc.gpumemoryview]]: + """ + Serialize the table into header and frames. + + Follows the Dask serialization scheme with a picklable header (dict) and + a tuple of frames (in this case a contiguous host and device buffer). + + To enable dask support, dask serializers must be registered + + >>> from cudf_polars.experimental.dask_serialize import register + >>> register() + + Returns + ------- + header + A dict containing any picklable metadata required to reconstruct the object. 
+ frames + Two-tuple of frames suitable for passing to `unpack_from_memoryviews`. + """ + packed = plc.contiguous_split.pack(self.table) + + # Keyword arguments for `Column.__init__`. + columns_kwargs = [ + { + "is_sorted": col.is_sorted, + "order": col.order, + "null_order": col.null_order, + "name": col.name, + } + for col in self.columns + ] + header = { + "columns_kwargs": columns_kwargs, + # Dask Distributed uses "type-serialized" to dispatch deserialization + "type-serialized": pickle.dumps(type(self)), + "frame_count": 2, + } + return header, packed.release() + def sorted_like( self, like: DataFrame, /, *, subset: Set[str] | None = None ) -> Self: """ - Copy sortedness from a dataframe onto self. + Return a shallow copy with sortedness copied from like. Parameters ---------- @@ -165,7 +232,7 @@ def sorted_like( Returns ------- - Self with metadata set. + Shallow copy of self with metadata set. Raises ------ @@ -175,13 +242,12 @@ def sorted_like( if like.column_names != self.column_names: raise ValueError("Can only copy from identically named frame") subset = self.column_names_set if subset is None else subset - self.columns = [ + return type(self)( c.sorted_like(other) if c.name in subset else c for c, other in zip(self.columns, like.columns, strict=True) - ] - return self + ) - def with_columns(self, columns: Sequence[NamedColumn]) -> Self: + def with_columns(self, columns: Iterable[Column], *, replace_only: bool = False) -> Self: """ Return a new dataframe with extra columns. @@ -189,6 +255,8 @@ def with_columns(self, columns: Sequence[NamedColumn]) -> Self: ---------- columns Columns to add + replace_only + If true, then only replacements are allowed (matching by name). Returns ------- @@ -196,36 +264,30 @@ def with_columns(self, columns: Sequence[NamedColumn]) -> Self: Notes ----- - If column names overlap, newer names replace older ones. + If column names overlap, newer names replace older ones, and + appear in the same order as the original frame.
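+
+ Examples
+ --------
+ An illustrative sketch only: ``df`` and ``new_col`` are hypothetical
+ stand-ins for an existing DataFrame and a same-named replacement Column.
+
+ >>> df2 = df.with_columns([new_col], replace_only=True)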
""" - columns = list( - {c.name: c for c in itertools.chain(self.columns, columns)}.values() - ) - return type(self)(columns) + new = {c.name: c for c in columns} + if replace_only and not self.column_names_set.issuperset(new.keys()): + raise ValueError("Cannot replace with non-existing names") + return type(self)((self.column_map | new).values()) def discard_columns(self, names: Set[str]) -> Self: """Drop columns by name.""" - return type(self)([c for c in self.columns if c.name not in names]) + return type(self)(column for column in self.columns if column.name not in names) def select(self, names: Sequence[str]) -> Self: """Select columns by name returning DataFrame.""" - want = set(names) - if not want.issubset(self.column_names_set): - raise ValueError("Can't select missing names") - return type(self)([self._column_map[name] for name in names]) - - def replace_columns(self, *columns: NamedColumn) -> Self: - """Return a new dataframe with columns replaced by name.""" - new = {c.name: c for c in columns} - if not set(new).issubset(self.column_names_set): - raise ValueError("Cannot replace with non-existing names") - return type(self)([new.get(c.name, c) for c in self.columns]) + try: + return type(self)(self.column_map[name] for name in names) + except KeyError as e: + raise ValueError("Can't select missing names") from e def rename_columns(self, mapping: Mapping[str, str]) -> Self: """Rename some columns.""" - return type(self)([c.copy(new_name=mapping.get(c.name)) for c in self.columns]) + return type(self)(c.rename(mapping.get(c.name, c.name)) for c in self.columns) - def select_columns(self, names: Set[str]) -> list[NamedColumn]: + def select_columns(self, names: Set[str]) -> list[Column]: """Select columns by name.""" return [c for c in self.columns if c.name in names] diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index c401e5a2f17..326d6b65cbe 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -15,34 +15,35 @@ from __future__ import annotations -import enum -from enum import IntEnum -from functools import partial, reduce -from typing import TYPE_CHECKING, Any, ClassVar, NamedTuple - -import pyarrow as pa -import pyarrow.compute as pc -import pylibcudf as plc - -from polars.exceptions import InvalidOperationError -from polars.polars import _expr_nodes as pl_expr - -from cudf_polars.containers import Column, NamedColumn -from cudf_polars.utils import dtypes, sorting - -if TYPE_CHECKING: - from collections.abc import Mapping, Sequence - - import polars as pl - import polars.type_aliases as pl_types - - from cudf_polars.containers import DataFrame +from cudf_polars.dsl.expressions.aggregation import Agg +from cudf_polars.dsl.expressions.base import ( + AggInfo, + Col, + ColRef, + ErrorExpr, + Expr, + NamedExpr, +) +from cudf_polars.dsl.expressions.binaryop import BinOp +from cudf_polars.dsl.expressions.boolean import BooleanFunction +from cudf_polars.dsl.expressions.datetime import TemporalFunction +from cudf_polars.dsl.expressions.literal import Literal, LiteralColumn +from cudf_polars.dsl.expressions.rolling import GroupedRollingWindow, RollingWindow +from cudf_polars.dsl.expressions.selection import Filter, Gather +from cudf_polars.dsl.expressions.sorting import Sort, SortBy +from cudf_polars.dsl.expressions.string import StringFunction +from cudf_polars.dsl.expressions.ternary import Ternary +from cudf_polars.dsl.expressions.unary import Cast, Len, UnaryFunction __all__ = 
[ "Expr", + "ErrorExpr", "NamedExpr", "Literal", + "LiteralColumn", + "Len", "Col", + "ColRef", "BooleanFunction", "StringFunction", "TemporalFunction", @@ -54,1782 +55,8 @@ "GroupedRollingWindow", "Cast", "Agg", + "AggInfo", "Ternary", "BinOp", + "UnaryFunction", ] - - -class ExecutionContext(IntEnum): - FRAME = enum.auto() - GROUPBY = enum.auto() - ROLLING = enum.auto() - - -class AggInfo(NamedTuple): - requests: list[tuple[Expr | None, plc.aggregation.Aggregation, Expr]] - - -class Expr: - """ - An abstract expression object. - - This contains a (potentially empty) tuple of child expressions, - along with non-child data. For uniform reconstruction and - implementation of hashing and equality schemes, child classes need - to provide a certain amount of metadata when they are defined. - Specifically, the ``_non_child`` attribute must list, in-order, - the names of the slots that are passed to the constructor. The - constructor must take arguments in the order ``(*_non_child, - *children).`` - """ - - __slots__ = ("dtype", "_hash_value", "_repr_value") - dtype: plc.DataType - """Data type of the expression.""" - _hash_value: int - """Caching slot for the hash of the expression.""" - _repr_value: str - """Caching slot for repr of the expression.""" - children: tuple[Expr, ...] = () - """Children of the expression.""" - _non_child: ClassVar[tuple[str, ...]] = ("dtype",) - """Names of non-child data (not Exprs) for reconstruction.""" - - # Constructor must take arguments in order (*_non_child, *children) - def __init__(self, dtype: plc.DataType) -> None: - self.dtype = dtype - - def _ctor_arguments(self, children: Sequence[Expr]) -> Sequence: - return (*(getattr(self, attr) for attr in self._non_child), *children) - - def get_hash(self) -> int: - """ - Return the hash of this expr. - - Override this in subclasses, rather than __hash__. - - Returns - ------- - The integer hash value. - """ - return hash((type(self), self._ctor_arguments(self.children))) - - def __hash__(self) -> int: - """Hash of an expression with caching.""" - try: - return self._hash_value - except AttributeError: - self._hash_value = self.get_hash() - return self._hash_value - - def is_equal(self, other: Any) -> bool: - """ - Equality of two expressions. - - Override this in subclasses, rather than __eq__. - - Parameter - --------- - other - object to compare to - - Returns - ------- - True if the two expressions are equal, false otherwise. - """ - if type(self) is not type(other): - return False # pragma: no cover; __eq__ trips first - return self._ctor_arguments(self.children) == other._ctor_arguments( - other.children - ) - - def __eq__(self, other: Any) -> bool: - """Equality of expressions.""" - if type(self) is not type(other) or hash(self) != hash(other): - return False - else: - return self.is_equal(other) - - def __ne__(self, other: Any) -> bool: - """Inequality of expressions.""" - return not self.__eq__(other) - - def __repr__(self) -> str: - """String representation of an expression with caching.""" - try: - return self._repr_value - except AttributeError: - args = ", ".join(f"{arg!r}" for arg in self._ctor_arguments(self.children)) - self._repr_value = f"{type(self).__name__}({args})" - return self._repr_value - - def do_evaluate( - self, - df: DataFrame, - *, - context: ExecutionContext = ExecutionContext.FRAME, - mapping: Mapping[Expr, Column] | None = None, - ) -> Column: - """ - Evaluate this expression given a dataframe for context. - - Parameters - ---------- - df - DataFrame that will provide columns. 
- context - What context are we performing this evaluation in? - mapping - Substitution mapping from expressions to Columns, used to - override the evaluation of a given expression if we're - performing a simple rewritten evaluation. - - Notes - ----- - Do not call this function directly, but rather - :meth:`evaluate` which handles the mapping lookups. - - Returns - ------- - Column representing the evaluation of the expression. - - Raises - ------ - NotImplementedError - If we couldn't evaluate the expression. Ideally all these - are returned during translation to the IR, but for now we - are not perfect. - """ - raise NotImplementedError( - f"Evaluation of expression {type(self).__name__}" - ) # pragma: no cover; translation of unimplemented nodes trips first - - def evaluate( - self, - df: DataFrame, - *, - context: ExecutionContext = ExecutionContext.FRAME, - mapping: Mapping[Expr, Column] | None = None, - ) -> Column: - """ - Evaluate this expression given a dataframe for context. - - Parameters - ---------- - df - DataFrame that will provide columns. - context - What context are we performing this evaluation in? - mapping - Substitution mapping from expressions to Columns, used to - override the evaluation of a given expression if we're - performing a simple rewritten evaluation. - - Notes - ----- - Individual subclasses should implement :meth:`do_evaluate`, - this method provides logic to handle lookups in the - substitution mapping. - - Returns - ------- - Column representing the evaluation of the expression. - - Raises - ------ - NotImplementedError - If we couldn't evaluate the expression. Ideally all these - are returned during translation to the IR, but for now we - are not perfect. - """ - if mapping is None: - return self.do_evaluate(df, context=context, mapping=mapping) - try: - return mapping[self] - except KeyError: - return self.do_evaluate(df, context=context, mapping=mapping) - - def collect_agg(self, *, depth: int) -> AggInfo: - """ - Collect information about aggregations in groupbys. - - Parameters - ---------- - depth - The depth of aggregating (reduction or sampling) - expressions we are currently at. - - Returns - ------- - Aggregation info describing the expression to aggregate in the - groupby. - - Raises - ------ - NotImplementedError - If we can't currently perform the aggregation request, for - example nested aggregations like ``a.max().min()``. - """ - raise NotImplementedError( - f"Collecting aggregation info for {type(self).__name__}" - ) # pragma: no cover; check_agg trips first - - -class NamedExpr: - # NamedExpr does not inherit from Expr since it does not appear - # when evaluating expressions themselves, only when constructing - # named return values in dataframe (IR) nodes. 
- __slots__ = ("name", "value") - value: Expr - name: str - - def __init__(self, name: str, value: Expr) -> None: - self.name = name - self.value = value - - def __hash__(self) -> int: - """Hash of the expression.""" - return hash((type(self), self.name, self.value)) - - def __repr__(self) -> str: - """Repr of the expression.""" - return f"NamedExpr({self.name}, {self.value})" - - def __eq__(self, other: Any) -> bool: - """Equality of two expressions.""" - return ( - type(self) is type(other) - and self.name == other.name - and self.value == other.value - ) - - def __ne__(self, other: Any) -> bool: - """Inequality of expressions.""" - return not self.__eq__(other) - - def evaluate( - self, - df: DataFrame, - *, - context: ExecutionContext = ExecutionContext.FRAME, - mapping: Mapping[Expr, Column] | None = None, - ) -> NamedColumn: - """ - Evaluate this expression given a dataframe for context. - - Parameters - ---------- - df - DataFrame providing context - context - Execution context - mapping - Substitution mapping - - Returns - ------- - NamedColumn attaching a name to an evaluated Column - - See Also - -------- - :meth:`Expr.evaluate` for details, this function just adds the - name to a column produced from an expression. - """ - obj = self.value.evaluate(df, context=context, mapping=mapping) - return NamedColumn( - obj.obj, - self.name, - is_sorted=obj.is_sorted, - order=obj.order, - null_order=obj.null_order, - ) - - def collect_agg(self, *, depth: int) -> AggInfo: - """Collect information about aggregations in groupbys.""" - return self.value.collect_agg(depth=depth) - - -class Literal(Expr): - __slots__ = ("value",) - _non_child = ("dtype", "value") - value: pa.Scalar[Any] - children: tuple[()] - - def __init__(self, dtype: plc.DataType, value: pa.Scalar[Any]) -> None: - super().__init__(dtype) - assert value.type == plc.interop.to_arrow(dtype) - self.value = value - - def do_evaluate( - self, - df: DataFrame, - *, - context: ExecutionContext = ExecutionContext.FRAME, - mapping: Mapping[Expr, Column] | None = None, - ) -> Column: - """Evaluate this expression given a dataframe for context.""" - # datatype of pyarrow scalar is correct by construction. - return Column(plc.Column.from_scalar(plc.interop.from_arrow(self.value), 1)) - - def collect_agg(self, *, depth: int) -> AggInfo: - """Collect information about aggregations in groupbys.""" - return AggInfo([]) - - -class LiteralColumn(Expr): - __slots__ = ("value",) - _non_child = ("dtype", "value") - value: pa.Array[Any, Any] - children: tuple[()] - - def __init__(self, dtype: plc.DataType, value: pl.Series) -> None: - super().__init__(dtype) - data = value.to_arrow() - self.value = data.cast(dtypes.downcast_arrow_lists(data.type)) - - def get_hash(self) -> int: - """Compute a hash of the column.""" - # This is stricter than necessary, but we only need this hash - # for identity in groupby replacements so it's OK. And this - # way we avoid doing potentially expensive compute. - return hash((type(self), self.dtype, id(self.value))) - - def do_evaluate( - self, - df: DataFrame, - *, - context: ExecutionContext = ExecutionContext.FRAME, - mapping: Mapping[Expr, Column] | None = None, - ) -> Column: - """Evaluate this expression given a dataframe for context.""" - # datatype of pyarrow array is correct by construction. 
- return Column(plc.interop.from_arrow(self.value)) - - def collect_agg(self, *, depth: int) -> AggInfo: - """Collect information about aggregations in groupbys.""" - return AggInfo([]) - - -class Col(Expr): - __slots__ = ("name",) - _non_child = ("dtype", "name") - name: str - children: tuple[()] - - def __init__(self, dtype: plc.DataType, name: str) -> None: - self.dtype = dtype - self.name = name - - def do_evaluate( - self, - df: DataFrame, - *, - context: ExecutionContext = ExecutionContext.FRAME, - mapping: Mapping[Expr, Column] | None = None, - ) -> Column: - """Evaluate this expression given a dataframe for context.""" - return df._column_map[self.name] - - def collect_agg(self, *, depth: int) -> AggInfo: - """Collect information about aggregations in groupbys.""" - return AggInfo([(self, plc.aggregation.collect_list(), self)]) - - -class Len(Expr): - children: tuple[()] - - def do_evaluate( - self, - df: DataFrame, - *, - context: ExecutionContext = ExecutionContext.FRAME, - mapping: Mapping[Expr, Column] | None = None, - ) -> Column: - """Evaluate this expression given a dataframe for context.""" - return Column( - plc.Column.from_scalar( - plc.interop.from_arrow( - pa.scalar(df.num_rows, type=plc.interop.to_arrow(self.dtype)) - ), - 1, - ) - ) - - def collect_agg(self, *, depth: int) -> AggInfo: - """Collect information about aggregations in groupbys.""" - # TODO: polars returns a uint, not an int for count - return AggInfo( - [(None, plc.aggregation.count(plc.types.NullPolicy.INCLUDE), self)] - ) - - -class BooleanFunction(Expr): - __slots__ = ("name", "options", "children") - _non_child = ("dtype", "name", "options") - children: tuple[Expr, ...] - - def __init__( - self, - dtype: plc.DataType, - name: pl_expr.BooleanFunction, - options: tuple[Any, ...], - *children: Expr, - ) -> None: - super().__init__(dtype) - self.options = options - self.name = name - self.children = children - if self.name == pl_expr.BooleanFunction.IsIn and not all( - c.dtype == self.children[0].dtype for c in self.children - ): - # TODO: If polars IR doesn't put the casts in, we need to - # mimic the supertype promotion rules. 
- raise NotImplementedError("IsIn doesn't support supertype casting") - - @staticmethod - def _distinct( - column: Column, - *, - keep: plc.stream_compaction.DuplicateKeepOption, - source_value: plc.Scalar, - target_value: plc.Scalar, - ) -> Column: - table = plc.Table([column.obj]) - indices = plc.stream_compaction.distinct_indices( - table, - keep, - # TODO: polars doesn't expose options for these - plc.types.NullEquality.EQUAL, - plc.types.NanEquality.ALL_EQUAL, - ) - return Column( - plc.copying.scatter( - [source_value], - indices, - plc.Table([plc.Column.from_scalar(target_value, table.num_rows())]), - ).columns()[0] - ) - - _BETWEEN_OPS: ClassVar[ - dict[ - pl_types.ClosedInterval, - tuple[plc.binaryop.BinaryOperator, plc.binaryop.BinaryOperator], - ] - ] = { - "none": ( - plc.binaryop.BinaryOperator.GREATER, - plc.binaryop.BinaryOperator.LESS, - ), - "left": ( - plc.binaryop.BinaryOperator.GREATER_EQUAL, - plc.binaryop.BinaryOperator.LESS, - ), - "right": ( - plc.binaryop.BinaryOperator.GREATER, - plc.binaryop.BinaryOperator.LESS_EQUAL, - ), - "both": ( - plc.binaryop.BinaryOperator.GREATER_EQUAL, - plc.binaryop.BinaryOperator.LESS_EQUAL, - ), - } - - def do_evaluate( - self, - df: DataFrame, - *, - context: ExecutionContext = ExecutionContext.FRAME, - mapping: Mapping[Expr, Column] | None = None, - ) -> Column: - """Evaluate this expression given a dataframe for context.""" - if self.name in ( - pl_expr.BooleanFunction.IsFinite, - pl_expr.BooleanFunction.IsInfinite, - ): - # Avoid evaluating the child if the dtype tells us it's unnecessary. - (child,) = self.children - is_finite = self.name == pl_expr.BooleanFunction.IsFinite - if child.dtype.id() not in (plc.TypeId.FLOAT32, plc.TypeId.FLOAT64): - value = plc.interop.from_arrow( - pa.scalar(value=is_finite, type=plc.interop.to_arrow(self.dtype)) - ) - return Column(plc.Column.from_scalar(value, df.num_rows)) - needles = child.evaluate(df, context=context, mapping=mapping) - to_search = [-float("inf"), float("inf")] - if is_finite: - # NaN is neither finite not infinite - to_search.append(float("nan")) - haystack = plc.interop.from_arrow( - pa.array( - to_search, - type=plc.interop.to_arrow(needles.obj.type()), - ) - ) - result = plc.search.contains(haystack, needles.obj) - if is_finite: - result = plc.unary.unary_operation(result, plc.unary.UnaryOperator.NOT) - return Column(result) - columns = [ - child.evaluate(df, context=context, mapping=mapping) - for child in self.children - ] - # Kleene logic for Any (OR) and All (AND) if ignore_nulls is - # False - if self.name in (pl_expr.BooleanFunction.Any, pl_expr.BooleanFunction.All): - (ignore_nulls,) = self.options - (column,) = columns - is_any = self.name == pl_expr.BooleanFunction.Any - agg = plc.aggregation.any() if is_any else plc.aggregation.all() - result = plc.reduce.reduce(column.obj, agg, self.dtype) - if not ignore_nulls and column.obj.null_count() > 0: - # Truth tables - # Any All - # | F U T | F U T - # --+------ --+------ - # F | F U T F | F F F - # U | U U T U | F U U - # T | T T T T | F U T - # - # If the input null count was non-zero, we must - # post-process the result to insert the correct value. 
- h_result = plc.interop.to_arrow(result).as_py() - if is_any and not h_result or not is_any and h_result: - # Any All - # False || Null => Null True && Null => Null - return Column(plc.Column.all_null_like(column.obj, 1)) - return Column(plc.Column.from_scalar(result, 1)) - if self.name == pl_expr.BooleanFunction.IsNull: - (column,) = columns - return Column(plc.unary.is_null(column.obj)) - elif self.name == pl_expr.BooleanFunction.IsNotNull: - (column,) = columns - return Column(plc.unary.is_valid(column.obj)) - elif self.name == pl_expr.BooleanFunction.IsNan: - (column,) = columns - return Column( - plc.unary.is_nan(column.obj).with_mask( - column.obj.null_mask(), column.obj.null_count() - ) - ) - elif self.name == pl_expr.BooleanFunction.IsNotNan: - (column,) = columns - return Column( - plc.unary.is_not_nan(column.obj).with_mask( - column.obj.null_mask(), column.obj.null_count() - ) - ) - elif self.name == pl_expr.BooleanFunction.IsFirstDistinct: - (column,) = columns - return self._distinct( - column, - keep=plc.stream_compaction.DuplicateKeepOption.KEEP_FIRST, - source_value=plc.interop.from_arrow( - pa.scalar(value=True, type=plc.interop.to_arrow(self.dtype)) - ), - target_value=plc.interop.from_arrow( - pa.scalar(value=False, type=plc.interop.to_arrow(self.dtype)) - ), - ) - elif self.name == pl_expr.BooleanFunction.IsLastDistinct: - (column,) = columns - return self._distinct( - column, - keep=plc.stream_compaction.DuplicateKeepOption.KEEP_LAST, - source_value=plc.interop.from_arrow( - pa.scalar(value=True, type=plc.interop.to_arrow(self.dtype)) - ), - target_value=plc.interop.from_arrow( - pa.scalar(value=False, type=plc.interop.to_arrow(self.dtype)) - ), - ) - elif self.name == pl_expr.BooleanFunction.IsUnique: - (column,) = columns - return self._distinct( - column, - keep=plc.stream_compaction.DuplicateKeepOption.KEEP_NONE, - source_value=plc.interop.from_arrow( - pa.scalar(value=True, type=plc.interop.to_arrow(self.dtype)) - ), - target_value=plc.interop.from_arrow( - pa.scalar(value=False, type=plc.interop.to_arrow(self.dtype)) - ), - ) - elif self.name == pl_expr.BooleanFunction.IsDuplicated: - (column,) = columns - return self._distinct( - column, - keep=plc.stream_compaction.DuplicateKeepOption.KEEP_NONE, - source_value=plc.interop.from_arrow( - pa.scalar(value=False, type=plc.interop.to_arrow(self.dtype)) - ), - target_value=plc.interop.from_arrow( - pa.scalar(value=True, type=plc.interop.to_arrow(self.dtype)) - ), - ) - elif self.name == pl_expr.BooleanFunction.AllHorizontal: - return Column( - reduce( - partial( - plc.binaryop.binary_operation, - op=plc.binaryop.BinaryOperator.NULL_LOGICAL_AND, - output_type=self.dtype, - ), - (c.obj for c in columns), - ) - ) - elif self.name == pl_expr.BooleanFunction.AnyHorizontal: - return Column( - reduce( - partial( - plc.binaryop.binary_operation, - op=plc.binaryop.BinaryOperator.NULL_LOGICAL_OR, - output_type=self.dtype, - ), - (c.obj for c in columns), - ) - ) - elif self.name == pl_expr.BooleanFunction.IsIn: - needles, haystack = columns - return Column(plc.search.contains(haystack.obj, needles.obj)) - elif self.name == pl_expr.BooleanFunction.Not: - (column,) = columns - return Column( - plc.unary.unary_operation(column.obj, plc.unary.UnaryOperator.NOT) - ) - else: - raise NotImplementedError( - f"BooleanFunction {self.name}" - ) # pragma: no cover; handled by init raising - - -class StringFunction(Expr): - __slots__ = ("name", "options", "children", "_regex_program") - _non_child = ("dtype", "name", "options") - children: 
tuple[Expr, ...] - - def __init__( - self, - dtype: plc.DataType, - name: pl_expr.StringFunction, - options: tuple[Any, ...], - *children: Expr, - ) -> None: - super().__init__(dtype) - self.options = options - self.name = name - self.children = children - self._validate_input() - - def _validate_input(self): - if self.name not in ( - pl_expr.StringFunction.Contains, - pl_expr.StringFunction.EndsWith, - pl_expr.StringFunction.Lowercase, - pl_expr.StringFunction.Replace, - pl_expr.StringFunction.ReplaceMany, - pl_expr.StringFunction.Slice, - pl_expr.StringFunction.Strptime, - pl_expr.StringFunction.StartsWith, - pl_expr.StringFunction.StripChars, - pl_expr.StringFunction.StripCharsStart, - pl_expr.StringFunction.StripCharsEnd, - pl_expr.StringFunction.Uppercase, - ): - raise NotImplementedError(f"String function {self.name}") - if self.name == pl_expr.StringFunction.Contains: - literal, strict = self.options - if not literal: - if not strict: - raise NotImplementedError( - "f{strict=} is not supported for regex contains" - ) - if not isinstance(self.children[1], Literal): - raise NotImplementedError( - "Regex contains only supports a scalar pattern" - ) - pattern = self.children[1].value.as_py() - try: - self._regex_program = plc.strings.regex_program.RegexProgram.create( - pattern, - flags=plc.strings.regex_flags.RegexFlags.DEFAULT, - ) - except RuntimeError as e: - raise NotImplementedError( - f"Unsupported regex {pattern} for GPU engine." - ) from e - elif self.name == pl_expr.StringFunction.Replace: - _, literal = self.options - if not literal: - raise NotImplementedError("literal=False is not supported for replace") - if not all(isinstance(expr, Literal) for expr in self.children[1:]): - raise NotImplementedError("replace only supports scalar target") - target = self.children[1] - if target.value == pa.scalar("", type=pa.string()): - raise NotImplementedError( - "libcudf replace does not support empty strings" - ) - elif self.name == pl_expr.StringFunction.ReplaceMany: - (ascii_case_insensitive,) = self.options - if ascii_case_insensitive: - raise NotImplementedError( - "ascii_case_insensitive not implemented for replace_many" - ) - if not all( - isinstance(expr, (LiteralColumn, Literal)) for expr in self.children[1:] - ): - raise NotImplementedError("replace_many only supports literal inputs") - target = self.children[1] - if pc.any(pc.equal(target.value, "")).as_py(): - raise NotImplementedError( - "libcudf replace_many is implemented differently from polars " - "for empty strings" - ) - elif self.name == pl_expr.StringFunction.Slice: - if not all(isinstance(child, Literal) for child in self.children[1:]): - raise NotImplementedError( - "Slice only supports literal start and stop values" - ) - elif self.name == pl_expr.StringFunction.Strptime: - format, _, exact, cache = self.options - if cache: - raise NotImplementedError("Strptime cache is a CPU feature") - if format is None: - raise NotImplementedError("Strptime format is required") - if not exact: - raise NotImplementedError("Strptime does not support exact=False") - elif self.name in { - pl_expr.StringFunction.StripChars, - pl_expr.StringFunction.StripCharsStart, - pl_expr.StringFunction.StripCharsEnd, - }: - if not isinstance(self.children[1], Literal): - raise NotImplementedError( - "strip operations only support scalar patterns" - ) - - def do_evaluate( - self, - df: DataFrame, - *, - context: ExecutionContext = ExecutionContext.FRAME, - mapping: Mapping[Expr, Column] | None = None, - ) -> Column: - """Evaluate this 
expression given a dataframe for context.""" - if self.name == pl_expr.StringFunction.Contains: - child, arg = self.children - column = child.evaluate(df, context=context, mapping=mapping) - - literal, _ = self.options - if literal: - pat = arg.evaluate(df, context=context, mapping=mapping) - pattern = ( - pat.obj_scalar - if pat.is_scalar and pat.obj.size() != column.obj.size() - else pat.obj - ) - return Column(plc.strings.find.contains(column.obj, pattern)) - else: - return Column( - plc.strings.contains.contains_re(column.obj, self._regex_program) - ) - elif self.name == pl_expr.StringFunction.Slice: - child, expr_offset, expr_length = self.children - assert isinstance(expr_offset, Literal) - assert isinstance(expr_length, Literal) - - column = child.evaluate(df, context=context, mapping=mapping) - # libcudf slices via [start,stop). - # polars slices with offset + length where start == offset - # stop = start + length. Negative values for start look backward - # from the last element of the string. If the end index would be - # below zero, an empty string is returned. - # Do this maths on the host - start = expr_offset.value.as_py() - length = expr_length.value.as_py() - - if length == 0: - stop = start - else: - # No length indicates a scan to the end - # The libcudf equivalent is a null stop - stop = start + length if length else None - if length and start < 0 and length >= -start: - stop = None - return Column( - plc.strings.slice.slice_strings( - column.obj, - plc.interop.from_arrow(pa.scalar(start, type=pa.int32())), - plc.interop.from_arrow(pa.scalar(stop, type=pa.int32())), - ) - ) - elif self.name in { - pl_expr.StringFunction.StripChars, - pl_expr.StringFunction.StripCharsStart, - pl_expr.StringFunction.StripCharsEnd, - }: - column, chars = ( - c.evaluate(df, context=context, mapping=mapping) for c in self.children - ) - if self.name == pl_expr.StringFunction.StripCharsStart: - side = plc.strings.SideType.LEFT - elif self.name == pl_expr.StringFunction.StripCharsEnd: - side = plc.strings.SideType.RIGHT - else: - side = plc.strings.SideType.BOTH - return Column(plc.strings.strip.strip(column.obj, side, chars.obj_scalar)) - - columns = [ - child.evaluate(df, context=context, mapping=mapping) - for child in self.children - ] - if self.name == pl_expr.StringFunction.Lowercase: - (column,) = columns - return Column(plc.strings.case.to_lower(column.obj)) - elif self.name == pl_expr.StringFunction.Uppercase: - (column,) = columns - return Column(plc.strings.case.to_upper(column.obj)) - elif self.name == pl_expr.StringFunction.EndsWith: - column, suffix = columns - return Column( - plc.strings.find.ends_with( - column.obj, - suffix.obj_scalar - if column.obj.size() != suffix.obj.size() and suffix.is_scalar - else suffix.obj, - ) - ) - elif self.name == pl_expr.StringFunction.StartsWith: - column, prefix = columns - return Column( - plc.strings.find.starts_with( - column.obj, - prefix.obj_scalar - if column.obj.size() != prefix.obj.size() and prefix.is_scalar - else prefix.obj, - ) - ) - elif self.name == pl_expr.StringFunction.Strptime: - # TODO: ignores ambiguous - format, strict, exact, cache = self.options - col = self.children[0].evaluate(df, context=context, mapping=mapping) - - is_timestamps = plc.strings.convert.convert_datetime.is_timestamp( - col.obj, format.encode() - ) - - if strict: - if not plc.interop.to_arrow( - plc.reduce.reduce( - is_timestamps, - plc.aggregation.all(), - plc.DataType(plc.TypeId.BOOL8), - ) - ).as_py(): - raise InvalidOperationError("conversion from 
`str` failed.") - else: - not_timestamps = plc.unary.unary_operation( - is_timestamps, plc.unary.UnaryOperator.NOT - ) - - null = plc.interop.from_arrow(pa.scalar(None, type=pa.string())) - res = plc.copying.boolean_mask_scatter( - [null], plc.Table([col.obj]), not_timestamps - ) - return Column( - plc.strings.convert.convert_datetime.to_timestamps( - res.columns()[0], self.dtype, format.encode() - ) - ) - elif self.name == pl_expr.StringFunction.Replace: - column, target, repl = columns - n, _ = self.options - return Column( - plc.strings.replace.replace( - column.obj, target.obj_scalar, repl.obj_scalar, maxrepl=n - ) - ) - elif self.name == pl_expr.StringFunction.ReplaceMany: - column, target, repl = columns - return Column( - plc.strings.replace.replace_multiple(column.obj, target.obj, repl.obj) - ) - raise NotImplementedError( - f"StringFunction {self.name}" - ) # pragma: no cover; handled by init raising - - -class TemporalFunction(Expr): - __slots__ = ("name", "options", "children") - _COMPONENT_MAP: ClassVar[dict[pl_expr.TemporalFunction, str]] = { - pl_expr.TemporalFunction.Year: "year", - pl_expr.TemporalFunction.Month: "month", - pl_expr.TemporalFunction.Day: "day", - pl_expr.TemporalFunction.WeekDay: "weekday", - pl_expr.TemporalFunction.Hour: "hour", - pl_expr.TemporalFunction.Minute: "minute", - pl_expr.TemporalFunction.Second: "second", - pl_expr.TemporalFunction.Millisecond: "millisecond", - pl_expr.TemporalFunction.Microsecond: "microsecond", - pl_expr.TemporalFunction.Nanosecond: "nanosecond", - } - _non_child = ("dtype", "name", "options") - children: tuple[Expr, ...] - - def __init__( - self, - dtype: plc.DataType, - name: pl_expr.TemporalFunction, - options: tuple[Any, ...], - *children: Expr, - ) -> None: - super().__init__(dtype) - self.options = options - self.name = name - self.children = children - if self.name not in self._COMPONENT_MAP: - raise NotImplementedError(f"Temporal function {self.name}") - - def do_evaluate( - self, - df: DataFrame, - *, - context: ExecutionContext = ExecutionContext.FRAME, - mapping: Mapping[Expr, Column] | None = None, - ) -> Column: - """Evaluate this expression given a dataframe for context.""" - columns = [ - child.evaluate(df, context=context, mapping=mapping) - for child in self.children - ] - (column,) = columns - if self.name == pl_expr.TemporalFunction.Microsecond: - millis = plc.datetime.extract_datetime_component(column.obj, "millisecond") - micros = plc.datetime.extract_datetime_component(column.obj, "microsecond") - millis_as_micros = plc.binaryop.binary_operation( - millis, - plc.interop.from_arrow(pa.scalar(1_000, type=pa.int32())), - plc.binaryop.BinaryOperator.MUL, - plc.DataType(plc.TypeId.INT32), - ) - total_micros = plc.binaryop.binary_operation( - micros, - millis_as_micros, - plc.binaryop.BinaryOperator.ADD, - plc.types.DataType(plc.types.TypeId.INT32), - ) - return Column(total_micros) - elif self.name == pl_expr.TemporalFunction.Nanosecond: - millis = plc.datetime.extract_datetime_component(column.obj, "millisecond") - micros = plc.datetime.extract_datetime_component(column.obj, "microsecond") - nanos = plc.datetime.extract_datetime_component(column.obj, "nanosecond") - millis_as_nanos = plc.binaryop.binary_operation( - millis, - plc.interop.from_arrow(pa.scalar(1_000_000, type=pa.int32())), - plc.binaryop.BinaryOperator.MUL, - plc.types.DataType(plc.types.TypeId.INT32), - ) - micros_as_nanos = plc.binaryop.binary_operation( - micros, - plc.interop.from_arrow(pa.scalar(1_000, type=pa.int32())), - 
plc.binaryop.BinaryOperator.MUL, - plc.types.DataType(plc.types.TypeId.INT32), - ) - total_nanos = plc.binaryop.binary_operation( - nanos, - millis_as_nanos, - plc.binaryop.BinaryOperator.ADD, - plc.types.DataType(plc.types.TypeId.INT32), - ) - total_nanos = plc.binaryop.binary_operation( - total_nanos, - micros_as_nanos, - plc.binaryop.BinaryOperator.ADD, - plc.types.DataType(plc.types.TypeId.INT32), - ) - return Column(total_nanos) - - return Column( - plc.datetime.extract_datetime_component( - column.obj, - self._COMPONENT_MAP[self.name], - ) - ) - - -class UnaryFunction(Expr): - __slots__ = ("name", "options", "children") - _non_child = ("dtype", "name", "options") - children: tuple[Expr, ...] - - # Note: log, and pow are handled via translation to binops - _OP_MAPPING: ClassVar[dict[str, plc.unary.UnaryOperator]] = { - "sin": plc.unary.UnaryOperator.SIN, - "cos": plc.unary.UnaryOperator.COS, - "tan": plc.unary.UnaryOperator.TAN, - "arcsin": plc.unary.UnaryOperator.ARCSIN, - "arccos": plc.unary.UnaryOperator.ARCCOS, - "arctan": plc.unary.UnaryOperator.ARCTAN, - "sinh": plc.unary.UnaryOperator.SINH, - "cosh": plc.unary.UnaryOperator.COSH, - "tanh": plc.unary.UnaryOperator.TANH, - "arcsinh": plc.unary.UnaryOperator.ARCSINH, - "arccosh": plc.unary.UnaryOperator.ARCCOSH, - "arctanh": plc.unary.UnaryOperator.ARCTANH, - "exp": plc.unary.UnaryOperator.EXP, - "sqrt": plc.unary.UnaryOperator.SQRT, - "cbrt": plc.unary.UnaryOperator.CBRT, - "ceil": plc.unary.UnaryOperator.CEIL, - "floor": plc.unary.UnaryOperator.FLOOR, - "abs": plc.unary.UnaryOperator.ABS, - "bit_invert": plc.unary.UnaryOperator.BIT_INVERT, - "not": plc.unary.UnaryOperator.NOT, - } - _supported_misc_fns = frozenset( - { - "drop_nulls", - "fill_null", - "mask_nans", - "round", - "set_sorted", - "unique", - } - ) - _supported_cum_aggs = frozenset( - { - "cum_min", - "cum_max", - "cum_prod", - "cum_sum", - } - ) - _supported_fns = frozenset().union( - _supported_misc_fns, _supported_cum_aggs, _OP_MAPPING.keys() - ) - - def __init__( - self, dtype: plc.DataType, name: str, options: tuple[Any, ...], *children: Expr - ) -> None: - super().__init__(dtype) - self.name = name - self.options = options - self.children = children - - if self.name not in UnaryFunction._supported_fns: - raise NotImplementedError(f"Unary function {name=}") - if self.name in UnaryFunction._supported_cum_aggs: - (reverse,) = self.options - if reverse: - raise NotImplementedError( - "reverse=True is not supported for cumulative aggregations" - ) - - def do_evaluate( - self, - df: DataFrame, - *, - context: ExecutionContext = ExecutionContext.FRAME, - mapping: Mapping[Expr, Column] | None = None, - ) -> Column: - """Evaluate this expression given a dataframe for context.""" - if self.name == "mask_nans": - (child,) = self.children - return child.evaluate(df, context=context, mapping=mapping).mask_nans() - if self.name == "round": - (decimal_places,) = self.options - (values,) = ( - child.evaluate(df, context=context, mapping=mapping) - for child in self.children - ) - return Column( - plc.round.round( - values.obj, decimal_places, plc.round.RoundingMethod.HALF_UP - ) - ).sorted_like(values) - elif self.name == "unique": - (maintain_order,) = self.options - (values,) = ( - child.evaluate(df, context=context, mapping=mapping) - for child in self.children - ) - # Only one column, so keep_any is the same as keep_first - # for stable distinct - keep = plc.stream_compaction.DuplicateKeepOption.KEEP_ANY - if values.is_sorted: - maintain_order = True - result = 
plc.stream_compaction.unique( - plc.Table([values.obj]), - [0], - keep, - plc.types.NullEquality.EQUAL, - ) - else: - distinct = ( - plc.stream_compaction.stable_distinct - if maintain_order - else plc.stream_compaction.distinct - ) - result = distinct( - plc.Table([values.obj]), - [0], - keep, - plc.types.NullEquality.EQUAL, - plc.types.NanEquality.ALL_EQUAL, - ) - (column,) = result.columns() - if maintain_order: - return Column(column).sorted_like(values) - return Column(column) - elif self.name == "set_sorted": - (column,) = ( - child.evaluate(df, context=context, mapping=mapping) - for child in self.children - ) - (asc,) = self.options - order = ( - plc.types.Order.ASCENDING - if asc == "ascending" - else plc.types.Order.DESCENDING - ) - null_order = plc.types.NullOrder.BEFORE - if column.obj.null_count() > 0 and (n := column.obj.size()) > 1: - # PERF: This invokes four stream synchronisations! - has_nulls_first = not plc.copying.get_element(column.obj, 0).is_valid() - has_nulls_last = not plc.copying.get_element( - column.obj, n - 1 - ).is_valid() - if (order == plc.types.Order.DESCENDING and has_nulls_first) or ( - order == plc.types.Order.ASCENDING and has_nulls_last - ): - null_order = plc.types.NullOrder.AFTER - return column.set_sorted( - is_sorted=plc.types.Sorted.YES, - order=order, - null_order=null_order, - ) - elif self.name == "drop_nulls": - (column,) = ( - child.evaluate(df, context=context, mapping=mapping) - for child in self.children - ) - return Column( - plc.stream_compaction.drop_nulls( - plc.Table([column.obj]), [0], 1 - ).columns()[0] - ) - elif self.name == "fill_null": - column = self.children[0].evaluate(df, context=context, mapping=mapping) - if isinstance(self.children[1], Literal): - arg = plc.interop.from_arrow(self.children[1].value) - else: - evaluated = self.children[1].evaluate( - df, context=context, mapping=mapping - ) - arg = evaluated.obj_scalar if evaluated.is_scalar else evaluated.obj - return Column(plc.replace.replace_nulls(column.obj, arg)) - elif self.name in self._OP_MAPPING: - column = self.children[0].evaluate(df, context=context, mapping=mapping) - if column.obj.type().id() != self.dtype.id(): - arg = plc.unary.cast(column.obj, self.dtype) - else: - arg = column.obj - return Column(plc.unary.unary_operation(arg, self._OP_MAPPING[self.name])) - elif self.name in UnaryFunction._supported_cum_aggs: - column = self.children[0].evaluate(df, context=context, mapping=mapping) - plc_col = column.obj - col_type = column.obj.type() - # cum_sum casts - # Int8, UInt8, Int16, UInt16 -> Int64 for overflow prevention - # Bool -> UInt32 - # cum_prod casts integer dtypes < int64 and bool to int64 - # See: - # https://github.com/pola-rs/polars/blob/main/crates/polars-ops/src/series/ops/cum_agg.rs - if ( - self.name == "cum_sum" - and col_type.id() - in { - plc.types.TypeId.INT8, - plc.types.TypeId.UINT8, - plc.types.TypeId.INT16, - plc.types.TypeId.UINT16, - } - ) or ( - self.name == "cum_prod" - and plc.traits.is_integral(col_type) - and plc.types.size_of(col_type) <= 4 - ): - plc_col = plc.unary.cast( - plc_col, plc.types.DataType(plc.types.TypeId.INT64) - ) - elif ( - self.name == "cum_sum" - and column.obj.type().id() == plc.types.TypeId.BOOL8 - ): - plc_col = plc.unary.cast( - plc_col, plc.types.DataType(plc.types.TypeId.UINT32) - ) - if self.name == "cum_sum": - agg = plc.aggregation.sum() - elif self.name == "cum_prod": - agg = plc.aggregation.product() - elif self.name == "cum_min": - agg = plc.aggregation.min() - elif self.name == "cum_max": - agg 
= plc.aggregation.max() - - return Column(plc.reduce.scan(plc_col, agg, plc.reduce.ScanType.INCLUSIVE)) - raise NotImplementedError( - f"Unimplemented unary function {self.name=}" - ) # pragma: no cover; init trips first - - def collect_agg(self, *, depth: int) -> AggInfo: - """Collect information about aggregations in groupbys.""" - if self.name in {"unique", "drop_nulls"} | self._supported_cum_aggs: - raise NotImplementedError(f"{self.name} in groupby") - if depth == 1: - # inside aggregation, need to pre-evaluate, groupby - # construction has checked that we don't have nested aggs, - # so stop the recursion and return ourselves for pre-eval - return AggInfo([(self, plc.aggregation.collect_list(), self)]) - else: - (child,) = self.children - return child.collect_agg(depth=depth) - - -class Sort(Expr): - __slots__ = ("options", "children") - _non_child = ("dtype", "options") - children: tuple[Expr] - - def __init__( - self, dtype: plc.DataType, options: tuple[bool, bool, bool], column: Expr - ) -> None: - super().__init__(dtype) - self.options = options - self.children = (column,) - - def do_evaluate( - self, - df: DataFrame, - *, - context: ExecutionContext = ExecutionContext.FRAME, - mapping: Mapping[Expr, Column] | None = None, - ) -> Column: - """Evaluate this expression given a dataframe for context.""" - (child,) = self.children - column = child.evaluate(df, context=context, mapping=mapping) - (stable, nulls_last, descending) = self.options - order, null_order = sorting.sort_order( - [descending], nulls_last=[nulls_last], num_keys=1 - ) - do_sort = plc.sorting.stable_sort if stable else plc.sorting.sort - table = do_sort(plc.Table([column.obj]), order, null_order) - return Column( - table.columns()[0], - is_sorted=plc.types.Sorted.YES, - order=order[0], - null_order=null_order[0], - ) - - -class SortBy(Expr): - __slots__ = ("options", "children") - _non_child = ("dtype", "options") - children: tuple[Expr, ...] 
- - def __init__( - self, - dtype: plc.DataType, - options: tuple[bool, tuple[bool], tuple[bool]], - column: Expr, - *by: Expr, - ) -> None: - super().__init__(dtype) - self.options = options - self.children = (column, *by) - - def do_evaluate( - self, - df: DataFrame, - *, - context: ExecutionContext = ExecutionContext.FRAME, - mapping: Mapping[Expr, Column] | None = None, - ) -> Column: - """Evaluate this expression given a dataframe for context.""" - column, *by = ( - child.evaluate(df, context=context, mapping=mapping) - for child in self.children - ) - (stable, nulls_last, descending) = self.options - order, null_order = sorting.sort_order( - descending, nulls_last=nulls_last, num_keys=len(by) - ) - do_sort = plc.sorting.stable_sort_by_key if stable else plc.sorting.sort_by_key - table = do_sort( - plc.Table([column.obj]), plc.Table([c.obj for c in by]), order, null_order - ) - return Column(table.columns()[0]) - - -class Gather(Expr): - __slots__ = ("children",) - _non_child = ("dtype",) - children: tuple[Expr, Expr] - - def __init__(self, dtype: plc.DataType, values: Expr, indices: Expr) -> None: - super().__init__(dtype) - self.children = (values, indices) - - def do_evaluate( - self, - df: DataFrame, - *, - context: ExecutionContext = ExecutionContext.FRAME, - mapping: Mapping[Expr, Column] | None = None, - ) -> Column: - """Evaluate this expression given a dataframe for context.""" - values, indices = ( - child.evaluate(df, context=context, mapping=mapping) - for child in self.children - ) - lo, hi = plc.reduce.minmax(indices.obj) - lo = plc.interop.to_arrow(lo).as_py() - hi = plc.interop.to_arrow(hi).as_py() - n = df.num_rows - if hi >= n or lo < -n: - raise ValueError("gather indices are out of bounds") - if indices.obj.null_count(): - bounds_policy = plc.copying.OutOfBoundsPolicy.NULLIFY - obj = plc.replace.replace_nulls( - indices.obj, - plc.interop.from_arrow( - pa.scalar(n, type=plc.interop.to_arrow(indices.obj.type())) - ), - ) - else: - bounds_policy = plc.copying.OutOfBoundsPolicy.DONT_CHECK - obj = indices.obj - table = plc.copying.gather(plc.Table([values.obj]), obj, bounds_policy) - return Column(table.columns()[0]) - - -class Filter(Expr): - __slots__ = ("children",) - _non_child = ("dtype",) - children: tuple[Expr, Expr] - - def __init__(self, dtype: plc.DataType, values: Expr, indices: Expr): - super().__init__(dtype) - self.children = (values, indices) - - def do_evaluate( - self, - df: DataFrame, - *, - context: ExecutionContext = ExecutionContext.FRAME, - mapping: Mapping[Expr, Column] | None = None, - ) -> Column: - """Evaluate this expression given a dataframe for context.""" - values, mask = ( - child.evaluate(df, context=context, mapping=mapping) - for child in self.children - ) - table = plc.stream_compaction.apply_boolean_mask( - plc.Table([values.obj]), mask.obj - ) - return Column(table.columns()[0]).sorted_like(values) - - -class RollingWindow(Expr): - __slots__ = ("options", "children") - _non_child = ("dtype", "options") - children: tuple[Expr] - - def __init__(self, dtype: plc.DataType, options: Any, agg: Expr) -> None: - super().__init__(dtype) - self.options = options - self.children = (agg,) - raise NotImplementedError("Rolling window not implemented") - - -class GroupedRollingWindow(Expr): - __slots__ = ("options", "children") - _non_child = ("dtype", "options") - children: tuple[Expr, ...] 
- - def __init__(self, dtype: plc.DataType, options: Any, agg: Expr, *by: Expr) -> None: - super().__init__(dtype) - self.options = options - self.children = (agg, *by) - raise NotImplementedError("Grouped rolling window not implemented") - - -class Cast(Expr): - __slots__ = ("children",) - _non_child = ("dtype",) - children: tuple[Expr] - - def __init__(self, dtype: plc.DataType, value: Expr) -> None: - super().__init__(dtype) - self.children = (value,) - if not dtypes.can_cast(value.dtype, self.dtype): - raise NotImplementedError( - f"Can't cast {self.dtype.id().name} to {value.dtype.id().name}" - ) - - def do_evaluate( - self, - df: DataFrame, - *, - context: ExecutionContext = ExecutionContext.FRAME, - mapping: Mapping[Expr, Column] | None = None, - ) -> Column: - """Evaluate this expression given a dataframe for context.""" - (child,) = self.children - column = child.evaluate(df, context=context, mapping=mapping) - return Column(plc.unary.cast(column.obj, self.dtype)).sorted_like(column) - - def collect_agg(self, *, depth: int) -> AggInfo: - """Collect information about aggregations in groupbys.""" - # TODO: Could do with sort-based groupby and segmented filter - (child,) = self.children - return child.collect_agg(depth=depth) - - -class Agg(Expr): - __slots__ = ("name", "options", "op", "request", "children") - _non_child = ("dtype", "name", "options") - children: tuple[Expr, ...] - - def __init__( - self, dtype: plc.DataType, name: str, options: Any, *children: Expr - ) -> None: - super().__init__(dtype) - self.name = name - self.options = options - self.children = children - if name not in Agg._SUPPORTED: - raise NotImplementedError( - f"Unsupported aggregation {name=}" - ) # pragma: no cover; all valid aggs are supported - # TODO: nan handling in groupby case - if name == "min": - req = plc.aggregation.min() - elif name == "max": - req = plc.aggregation.max() - elif name == "median": - req = plc.aggregation.median() - elif name == "n_unique": - # TODO: datatype of result - req = plc.aggregation.nunique(null_handling=plc.types.NullPolicy.INCLUDE) - elif name == "first" or name == "last": - req = None - elif name == "mean": - req = plc.aggregation.mean() - elif name == "sum": - req = plc.aggregation.sum() - elif name == "std": - # TODO: handle nans - req = plc.aggregation.std(ddof=options) - elif name == "var": - # TODO: handle nans - req = plc.aggregation.variance(ddof=options) - elif name == "count": - req = plc.aggregation.count(null_handling=plc.types.NullPolicy.EXCLUDE) - elif name == "quantile": - _, quantile = self.children - if not isinstance(quantile, Literal): - raise NotImplementedError("Only support literal quantile values") - req = plc.aggregation.quantile( - quantiles=[quantile.value.as_py()], interp=Agg.interp_mapping[options] - ) - else: - raise NotImplementedError( - f"Unreachable, {name=} is incorrectly listed in _SUPPORTED" - ) # pragma: no cover - self.request = req - op = getattr(self, f"_{name}", None) - if op is None: - op = partial(self._reduce, request=req) - elif name in {"min", "max"}: - op = partial(op, propagate_nans=options) - elif name in {"count", "first", "last"}: - pass - else: - raise NotImplementedError( - f"Unreachable, supported agg {name=} has no implementation" - ) # pragma: no cover - self.op = op - - _SUPPORTED: ClassVar[frozenset[str]] = frozenset( - [ - "min", - "max", - "median", - "n_unique", - "first", - "last", - "mean", - "sum", - "count", - "std", - "var", - "quantile", - ] - ) - - interp_mapping: ClassVar[dict[str, 
plc.types.Interpolation]] = { - "nearest": plc.types.Interpolation.NEAREST, - "higher": plc.types.Interpolation.HIGHER, - "lower": plc.types.Interpolation.LOWER, - "midpoint": plc.types.Interpolation.MIDPOINT, - "linear": plc.types.Interpolation.LINEAR, - } - - def collect_agg(self, *, depth: int) -> AggInfo: - """Collect information about aggregations in groupbys.""" - if depth >= 1: - raise NotImplementedError( - "Nested aggregations in groupby" - ) # pragma: no cover; check_agg trips first - if (isminmax := self.name in {"min", "max"}) and self.options: - raise NotImplementedError("Nan propagation in groupby for min/max") - (child,) = self.children - ((expr, _, _),) = child.collect_agg(depth=depth + 1).requests - request = self.request - # These are handled specially here because we don't set up the - # request for the whole-frame agg because we can avoid a - # reduce for these. - if self.name == "first": - request = plc.aggregation.nth_element( - 0, null_handling=plc.types.NullPolicy.INCLUDE - ) - elif self.name == "last": - request = plc.aggregation.nth_element( - -1, null_handling=plc.types.NullPolicy.INCLUDE - ) - if request is None: - raise NotImplementedError( - f"Aggregation {self.name} in groupby" - ) # pragma: no cover; __init__ trips first - if isminmax and plc.traits.is_floating_point(self.dtype): - assert expr is not None - # Ignore nans in these groupby aggs, do this by masking - # nans in the input - expr = UnaryFunction(self.dtype, "mask_nans", (), expr) - return AggInfo([(expr, request, self)]) - - def _reduce( - self, column: Column, *, request: plc.aggregation.Aggregation - ) -> Column: - return Column( - plc.Column.from_scalar( - plc.reduce.reduce(column.obj, request, self.dtype), - 1, - ) - ) - - def _count(self, column: Column) -> Column: - return Column( - plc.Column.from_scalar( - plc.interop.from_arrow( - pa.scalar( - column.obj.size() - column.obj.null_count(), - type=plc.interop.to_arrow(self.dtype), - ), - ), - 1, - ) - ) - - def _min(self, column: Column, *, propagate_nans: bool) -> Column: - if propagate_nans and column.nan_count > 0: - return Column( - plc.Column.from_scalar( - plc.interop.from_arrow( - pa.scalar(float("nan"), type=plc.interop.to_arrow(self.dtype)) - ), - 1, - ) - ) - if column.nan_count > 0: - column = column.mask_nans() - return self._reduce(column, request=plc.aggregation.min()) - - def _max(self, column: Column, *, propagate_nans: bool) -> Column: - if propagate_nans and column.nan_count > 0: - return Column( - plc.Column.from_scalar( - plc.interop.from_arrow( - pa.scalar(float("nan"), type=plc.interop.to_arrow(self.dtype)) - ), - 1, - ) - ) - if column.nan_count > 0: - column = column.mask_nans() - return self._reduce(column, request=plc.aggregation.max()) - - def _first(self, column: Column) -> Column: - return Column(plc.copying.slice(column.obj, [0, 1])[0]) - - def _last(self, column: Column) -> Column: - n = column.obj.size() - return Column(plc.copying.slice(column.obj, [n - 1, n])[0]) - - def do_evaluate( - self, - df: DataFrame, - *, - context: ExecutionContext = ExecutionContext.FRAME, - mapping: Mapping[Expr, Column] | None = None, - ) -> Column: - """Evaluate this expression given a dataframe for context.""" - if context is not ExecutionContext.FRAME: - raise NotImplementedError( - f"Agg in context {context}" - ) # pragma: no cover; unreachable - - # Aggregations like quantiles may have additional children that were - # preprocessed into pylibcudf requests. 
- child = self.children[0] - return self.op(child.evaluate(df, context=context, mapping=mapping)) - - -class Ternary(Expr): - __slots__ = ("children",) - _non_child = ("dtype",) - children: tuple[Expr, Expr, Expr] - - def __init__( - self, dtype: plc.DataType, when: Expr, then: Expr, otherwise: Expr - ) -> None: - super().__init__(dtype) - self.children = (when, then, otherwise) - - def do_evaluate( - self, - df: DataFrame, - *, - context: ExecutionContext = ExecutionContext.FRAME, - mapping: Mapping[Expr, Column] | None = None, - ) -> Column: - """Evaluate this expression given a dataframe for context.""" - when, then, otherwise = ( - child.evaluate(df, context=context, mapping=mapping) - for child in self.children - ) - then_obj = then.obj_scalar if then.is_scalar else then.obj - otherwise_obj = otherwise.obj_scalar if otherwise.is_scalar else otherwise.obj - return Column(plc.copying.copy_if_else(then_obj, otherwise_obj, when.obj)) - - -class BinOp(Expr): - __slots__ = ("op", "children") - _non_child = ("dtype", "op") - children: tuple[Expr, Expr] - - def __init__( - self, - dtype: plc.DataType, - op: plc.binaryop.BinaryOperator, - left: Expr, - right: Expr, - ) -> None: - super().__init__(dtype) - if plc.traits.is_boolean(self.dtype): - # For boolean output types, bitand and bitor implement - # boolean logic, so translate. bitxor also does, but the - # default behaviour is correct. - op = BinOp._BOOL_KLEENE_MAPPING.get(op, op) - self.op = op - self.children = (left, right) - if not plc.binaryop.is_supported_operation( - self.dtype, left.dtype, right.dtype, op - ): - raise NotImplementedError( - f"Operation {op.name} not supported " - f"for types {left.dtype.id().name} and {right.dtype.id().name} " - f"with output type {self.dtype.id().name}" - ) - - _BOOL_KLEENE_MAPPING: ClassVar[ - dict[plc.binaryop.BinaryOperator, plc.binaryop.BinaryOperator] - ] = { - plc.binaryop.BinaryOperator.BITWISE_AND: plc.binaryop.BinaryOperator.NULL_LOGICAL_AND, - plc.binaryop.BinaryOperator.BITWISE_OR: plc.binaryop.BinaryOperator.NULL_LOGICAL_OR, - plc.binaryop.BinaryOperator.LOGICAL_AND: plc.binaryop.BinaryOperator.NULL_LOGICAL_AND, - plc.binaryop.BinaryOperator.LOGICAL_OR: plc.binaryop.BinaryOperator.NULL_LOGICAL_OR, - } - - _MAPPING: ClassVar[dict[pl_expr.Operator, plc.binaryop.BinaryOperator]] = { - pl_expr.Operator.Eq: plc.binaryop.BinaryOperator.EQUAL, - pl_expr.Operator.EqValidity: plc.binaryop.BinaryOperator.NULL_EQUALS, - pl_expr.Operator.NotEq: plc.binaryop.BinaryOperator.NOT_EQUAL, - pl_expr.Operator.NotEqValidity: plc.binaryop.BinaryOperator.NULL_NOT_EQUALS, - pl_expr.Operator.Lt: plc.binaryop.BinaryOperator.LESS, - pl_expr.Operator.LtEq: plc.binaryop.BinaryOperator.LESS_EQUAL, - pl_expr.Operator.Gt: plc.binaryop.BinaryOperator.GREATER, - pl_expr.Operator.GtEq: plc.binaryop.BinaryOperator.GREATER_EQUAL, - pl_expr.Operator.Plus: plc.binaryop.BinaryOperator.ADD, - pl_expr.Operator.Minus: plc.binaryop.BinaryOperator.SUB, - pl_expr.Operator.Multiply: plc.binaryop.BinaryOperator.MUL, - pl_expr.Operator.Divide: plc.binaryop.BinaryOperator.DIV, - pl_expr.Operator.TrueDivide: plc.binaryop.BinaryOperator.TRUE_DIV, - pl_expr.Operator.FloorDivide: plc.binaryop.BinaryOperator.FLOOR_DIV, - pl_expr.Operator.Modulus: plc.binaryop.BinaryOperator.PYMOD, - pl_expr.Operator.And: plc.binaryop.BinaryOperator.BITWISE_AND, - pl_expr.Operator.Or: plc.binaryop.BinaryOperator.BITWISE_OR, - pl_expr.Operator.Xor: plc.binaryop.BinaryOperator.BITWISE_XOR, - pl_expr.Operator.LogicalAnd: 
plc.binaryop.BinaryOperator.LOGICAL_AND, - pl_expr.Operator.LogicalOr: plc.binaryop.BinaryOperator.LOGICAL_OR, - } - - def do_evaluate( - self, - df: DataFrame, - *, - context: ExecutionContext = ExecutionContext.FRAME, - mapping: Mapping[Expr, Column] | None = None, - ) -> Column: - """Evaluate this expression given a dataframe for context.""" - left, right = ( - child.evaluate(df, context=context, mapping=mapping) - for child in self.children - ) - lop = left.obj - rop = right.obj - if left.obj.size() != right.obj.size(): - if left.is_scalar: - lop = left.obj_scalar - elif right.is_scalar: - rop = right.obj_scalar - return Column( - plc.binaryop.binary_operation(lop, rop, self.op, self.dtype), - ) - - def collect_agg(self, *, depth: int) -> AggInfo: - """Collect information about aggregations in groupbys.""" - if depth == 1: - # inside aggregation, need to pre-evaluate, - # groupby construction has checked that we don't have - # nested aggs, so stop the recursion and return ourselves - # for pre-eval - return AggInfo([(self, plc.aggregation.collect_list(), self)]) - else: - left_info, right_info = ( - child.collect_agg(depth=depth) for child in self.children - ) - requests = [*left_info.requests, *right_info.requests] - # TODO: Hack, if there were no reductions inside this - # binary expression then we want to pre-evaluate and - # collect ourselves. Otherwise we want to collect the - # aggregations inside and post-evaluate. This is a bad way - # of checking that we are in case 1. - if all( - agg.kind() == plc.aggregation.Kind.COLLECT_LIST - for _, agg, _ in requests - ): - return AggInfo([(self, plc.aggregation.collect_list(), self)]) - return AggInfo( - [*left_info.requests, *right_info.requests], - ) diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/__init__.py b/python/cudf_polars/cudf_polars/dsl/expressions/__init__.py new file mode 100644 index 00000000000..acbea129088 --- /dev/null +++ b/python/cudf_polars/cudf_polars/dsl/expressions/__init__.py @@ -0,0 +1,8 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 + +"""Implementations of various expressions.""" + +from __future__ import annotations + +__all__: list[str] = [] diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/aggregation.py b/python/cudf_polars/cudf_polars/dsl/expressions/aggregation.py new file mode 100644 index 00000000000..2af9fdaacc5 --- /dev/null +++ b/python/cudf_polars/cudf_polars/dsl/expressions/aggregation.py @@ -0,0 +1,229 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0 +# TODO: remove need for this +# ruff: noqa: D101 +"""DSL nodes for aggregations.""" + +from __future__ import annotations + +from functools import partial +from typing import TYPE_CHECKING, Any, ClassVar + +import pyarrow as pa + +import pylibcudf as plc + +from cudf_polars.containers import Column +from cudf_polars.dsl.expressions.base import ( + AggInfo, + ExecutionContext, + Expr, +) +from cudf_polars.dsl.expressions.literal import Literal +from cudf_polars.dsl.expressions.unary import UnaryFunction + +if TYPE_CHECKING: + from collections.abc import Mapping + + from cudf_polars.containers import DataFrame + +__all__ = ["Agg"] + + +class Agg(Expr): + __slots__ = ("name", "options", "op", "request") + _non_child = ("dtype", "name", "options") + + def __init__( + self, dtype: plc.DataType, name: str, options: Any, *children: Expr + ) -> None: + self.dtype = dtype + self.name = name + self.options = options + self.children = children + if name not in Agg._SUPPORTED: + raise NotImplementedError( + f"Unsupported aggregation {name=}" + ) # pragma: no cover; all valid aggs are supported + # TODO: nan handling in groupby case + if name == "min": + req = plc.aggregation.min() + elif name == "max": + req = plc.aggregation.max() + elif name == "median": + req = plc.aggregation.median() + elif name == "n_unique": + # TODO: datatype of result + req = plc.aggregation.nunique(null_handling=plc.types.NullPolicy.INCLUDE) + elif name == "first" or name == "last": + req = None + elif name == "mean": + req = plc.aggregation.mean() + elif name == "sum": + req = plc.aggregation.sum() + elif name == "std": + # TODO: handle nans + req = plc.aggregation.std(ddof=options) + elif name == "var": + # TODO: handle nans + req = plc.aggregation.variance(ddof=options) + elif name == "count": + req = plc.aggregation.count(null_handling=plc.types.NullPolicy.EXCLUDE) + elif name == "quantile": + _, quantile = self.children + if not isinstance(quantile, Literal): + raise NotImplementedError("Only support literal quantile values") + req = plc.aggregation.quantile( + quantiles=[quantile.value.as_py()], interp=Agg.interp_mapping[options] + ) + else: + raise NotImplementedError( + f"Unreachable, {name=} is incorrectly listed in _SUPPORTED" + ) # pragma: no cover + self.request = req + op = getattr(self, f"_{name}", None) + if op is None: + op = partial(self._reduce, request=req) + elif name in {"min", "max"}: + op = partial(op, propagate_nans=options) + elif name in {"count", "first", "last"}: + pass + else: + raise NotImplementedError( + f"Unreachable, supported agg {name=} has no implementation" + ) # pragma: no cover + self.op = op + + _SUPPORTED: ClassVar[frozenset[str]] = frozenset( + [ + "min", + "max", + "median", + "n_unique", + "first", + "last", + "mean", + "sum", + "count", + "std", + "var", + "quantile", + ] + ) + + interp_mapping: ClassVar[dict[str, plc.types.Interpolation]] = { + "nearest": plc.types.Interpolation.NEAREST, + "higher": plc.types.Interpolation.HIGHER, + "lower": plc.types.Interpolation.LOWER, + "midpoint": plc.types.Interpolation.MIDPOINT, + "linear": plc.types.Interpolation.LINEAR, + } + + def collect_agg(self, *, depth: int) -> AggInfo: + """Collect information about aggregations in groupbys.""" + if depth >= 1: + raise NotImplementedError( + "Nested aggregations in groupby" + ) # pragma: no cover; check_agg trips first + if (isminmax := self.name in {"min", "max"}) and self.options: + raise NotImplementedError("Nan propagation in groupby for min/max") + 
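+ # Editorial sketch (illustrative, not part of this change): for + # ``pl.col("v").max()`` inside a groupby, the child request gathered + # below is ``(Col(v), collect_list, Col(v))``, and this method returns + # ``AggInfo([(Col(v), plc.aggregation.max(), self)])``.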
(child,) = self.children + ((expr, _, _),) = child.collect_agg(depth=depth + 1).requests + request = self.request + # These are handled specially here because we don't set up the + # request for the whole-frame agg because we can avoid a + # reduce for these. + if self.name == "first": + request = plc.aggregation.nth_element( + 0, null_handling=plc.types.NullPolicy.INCLUDE + ) + elif self.name == "last": + request = plc.aggregation.nth_element( + -1, null_handling=plc.types.NullPolicy.INCLUDE + ) + if request is None: + raise NotImplementedError( + f"Aggregation {self.name} in groupby" + ) # pragma: no cover; __init__ trips first + if isminmax and plc.traits.is_floating_point(self.dtype): + assert expr is not None + # Ignore nans in these groupby aggs, do this by masking + # nans in the input + expr = UnaryFunction(self.dtype, "mask_nans", (), expr) + return AggInfo([(expr, request, self)]) + + def _reduce( + self, column: Column, *, request: plc.aggregation.Aggregation + ) -> Column: + return Column( + plc.Column.from_scalar( + plc.reduce.reduce(column.obj, request, self.dtype), + 1, + ) + ) + + def _count(self, column: Column) -> Column: + return Column( + plc.Column.from_scalar( + plc.interop.from_arrow( + pa.scalar( + column.obj.size() - column.obj.null_count(), + type=plc.interop.to_arrow(self.dtype), + ), + ), + 1, + ) + ) + + def _min(self, column: Column, *, propagate_nans: bool) -> Column: + if propagate_nans and column.nan_count > 0: + return Column( + plc.Column.from_scalar( + plc.interop.from_arrow( + pa.scalar(float("nan"), type=plc.interop.to_arrow(self.dtype)) + ), + 1, + ) + ) + if column.nan_count > 0: + column = column.mask_nans() + return self._reduce(column, request=plc.aggregation.min()) + + def _max(self, column: Column, *, propagate_nans: bool) -> Column: + if propagate_nans and column.nan_count > 0: + return Column( + plc.Column.from_scalar( + plc.interop.from_arrow( + pa.scalar(float("nan"), type=plc.interop.to_arrow(self.dtype)) + ), + 1, + ) + ) + if column.nan_count > 0: + column = column.mask_nans() + return self._reduce(column, request=plc.aggregation.max()) + + def _first(self, column: Column) -> Column: + return Column(plc.copying.slice(column.obj, [0, 1])[0]) + + def _last(self, column: Column) -> Column: + n = column.obj.size() + return Column(plc.copying.slice(column.obj, [n - 1, n])[0]) + + def do_evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: Mapping[Expr, Column] | None = None, + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + if context is not ExecutionContext.FRAME: + raise NotImplementedError( + f"Agg in context {context}" + ) # pragma: no cover; unreachable + + # Aggregations like quantiles may have additional children that were + # preprocessed into pylibcudf requests. + child = self.children[0] + return self.op(child.evaluate(df, context=context, mapping=mapping)) diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/base.py b/python/cudf_polars/cudf_polars/dsl/expressions/base.py new file mode 100644 index 00000000000..23851f91938 --- /dev/null +++ b/python/cudf_polars/cudf_polars/dsl/expressions/base.py @@ -0,0 +1,295 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0 +# TODO: remove need for this +# ruff: noqa: D101 +"""Base and common classes for expression DSL nodes.""" + +from __future__ import annotations + +import enum +from enum import IntEnum +from typing import TYPE_CHECKING, Any, ClassVar, NamedTuple + +import pylibcudf as plc + +from cudf_polars.containers import Column +from cudf_polars.dsl.nodebase import Node + +if TYPE_CHECKING: + from collections.abc import Mapping + + from cudf_polars.containers import Column, DataFrame + +__all__ = ["Expr", "NamedExpr", "Col", "AggInfo", "ExecutionContext", "ColRef"] + + +class AggInfo(NamedTuple): + requests: list[tuple[Expr | None, plc.aggregation.Aggregation, Expr]] + + +class ExecutionContext(IntEnum): + FRAME = enum.auto() + GROUPBY = enum.auto() + ROLLING = enum.auto() + + +class Expr(Node["Expr"]): + """An abstract expression object.""" + + __slots__ = ("dtype",) + dtype: plc.DataType + """Data type of the expression.""" + # This annotation is needed because of https://github.com/python/mypy/issues/17981 + _non_child: ClassVar[tuple[str, ...]] = ("dtype",) + """Names of non-child data (not Exprs) for reconstruction.""" + + def do_evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: Mapping[Expr, Column] | None = None, + ) -> Column: + """ + Evaluate this expression given a dataframe for context. + + Parameters + ---------- + df + DataFrame that will provide columns. + context + What context are we performing this evaluation in? + mapping + Substitution mapping from expressions to Columns, used to + override the evaluation of a given expression if we're + performing a simple rewritten evaluation. + + Notes + ----- + Do not call this function directly, but rather + :meth:`evaluate` which handles the mapping lookups. + + Returns + ------- + Column representing the evaluation of the expression. + + Raises + ------ + NotImplementedError + If we couldn't evaluate the expression. Ideally all these + are returned during translation to the IR, but for now we + are not perfect. + """ + raise NotImplementedError( + f"Evaluation of expression {type(self).__name__}" + ) # pragma: no cover; translation of unimplemented nodes trips first + + def evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: Mapping[Expr, Column] | None = None, + ) -> Column: + """ + Evaluate this expression given a dataframe for context. + + Parameters + ---------- + df + DataFrame that will provide columns. + context + What context are we performing this evaluation in? + mapping + Substitution mapping from expressions to Columns, used to + override the evaluation of a given expression if we're + performing a simple rewritten evaluation. + + Notes + ----- + Individual subclasses should implement :meth:`do_evaluate`, + this method provides logic to handle lookups in the + substitution mapping. + + Returns + ------- + Column representing the evaluation of the expression. + + Raises + ------ + NotImplementedError + If we couldn't evaluate the expression. Ideally all these + are returned during translation to the IR, but for now we + are not perfect. + """ + if mapping is None: + return self.do_evaluate(df, context=context, mapping=mapping) + try: + return mapping[self] + except KeyError: + return self.do_evaluate(df, context=context, mapping=mapping) + + def collect_agg(self, *, depth: int) -> AggInfo: + """ + Collect information about aggregations in groupbys. 
+ + Parameters + ---------- + depth + The depth of aggregating (reduction or sampling) + expressions we are currently at. + + Returns + ------- + Aggregation info describing the expression to aggregate in the + groupby. + + Raises + ------ + NotImplementedError + If we can't currently perform the aggregation request, for + example nested aggregations like ``a.max().min()``. + """ + raise NotImplementedError( + f"Collecting aggregation info for {type(self).__name__}" + ) # pragma: no cover; check_agg trips first + + +class ErrorExpr(Expr): + __slots__ = ("error",) + _non_child = ("dtype", "error") + error: str + + def __init__(self, dtype: plc.DataType, error: str) -> None: + self.dtype = dtype + self.error = error + self.children = () + + +class NamedExpr: + # NamedExpr does not inherit from Expr since it does not appear + # when evaluating expressions themselves, only when constructing + # named return values in dataframe (IR) nodes. + __slots__ = ("name", "value") + value: Expr + name: str + + def __init__(self, name: str, value: Expr) -> None: + self.name = name + self.value = value + + def __hash__(self) -> int: + """Hash of the expression.""" + return hash((type(self), self.name, self.value)) + + def __repr__(self) -> str: + """Repr of the expression.""" + return f"NamedExpr({self.name}, {self.value})" + + def __eq__(self, other: Any) -> bool: + """Equality of two expressions.""" + return ( + type(self) is type(other) + and self.name == other.name + and self.value == other.value + ) + + def __ne__(self, other: Any) -> bool: + """Inequality of expressions.""" + return not self.__eq__(other) + + def evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: Mapping[Expr, Column] | None = None, + ) -> Column: + """ + Evaluate this expression given a dataframe for context. + + Parameters + ---------- + df + DataFrame providing context + context + Execution context + mapping + Substitution mapping + + Returns + ------- + Evaluated Column with name attached. + + See Also + -------- + :meth:`Expr.evaluate` for details, this function just adds the + name to a column produced from an expression. + """ + return self.value.evaluate(df, context=context, mapping=mapping).rename( + self.name + ) + + def collect_agg(self, *, depth: int) -> AggInfo: + """Collect information about aggregations in groupbys.""" + return self.value.collect_agg(depth=depth) + + +class Col(Expr): + __slots__ = ("name",) + _non_child = ("dtype", "name") + name: str + + def __init__(self, dtype: plc.DataType, name: str) -> None: + self.dtype = dtype + self.name = name + self.children = () + + def do_evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: Mapping[Expr, Column] | None = None, + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + # Deliberately remove the name here so that we guarantee + # evaluation of the IR produces names. 
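+ # (Illustrative: evaluating ``Col(dtype, "a")`` returns df's "a" column + # with the name stripped; ``NamedExpr("b", col_a).evaluate(df)`` then + # re-attaches the output name via ``.rename("b")``.)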
+ return df.column_map[self.name].rename(None) + + def collect_agg(self, *, depth: int) -> AggInfo: + """Collect information about aggregations in groupbys.""" + return AggInfo([(self, plc.aggregation.collect_list(), self)]) + + +class ColRef(Expr): + __slots__ = ("index", "table_ref") + _non_child = ("dtype", "index", "table_ref") + index: int + table_ref: plc.expressions.TableReference + + def __init__( + self, + dtype: plc.DataType, + index: int, + table_ref: plc.expressions.TableReference, + column: Expr, + ) -> None: + if not isinstance(column, Col): + raise TypeError("Column reference should only apply to columns") + self.dtype = dtype + self.index = index + self.table_ref = table_ref + self.children = (column,) + + def do_evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: Mapping[Expr, Column] | None = None, + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + raise NotImplementedError( + "Only expect this node as part of an expression translated to libcudf AST." + ) diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/binaryop.py b/python/cudf_polars/cudf_polars/dsl/expressions/binaryop.py new file mode 100644 index 00000000000..245bdbefe88 --- /dev/null +++ b/python/cudf_polars/cudf_polars/dsl/expressions/binaryop.py @@ -0,0 +1,134 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 +# TODO: remove need for this +# ruff: noqa: D101 +"""BinaryOp DSL nodes.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, ClassVar + +from polars.polars import _expr_nodes as pl_expr + +import pylibcudf as plc + +from cudf_polars.containers import Column +from cudf_polars.dsl.expressions.base import AggInfo, ExecutionContext, Expr + +if TYPE_CHECKING: + from collections.abc import Mapping + + from cudf_polars.containers import DataFrame + +__all__ = ["BinOp"] + + +class BinOp(Expr): + __slots__ = ("op",) + _non_child = ("dtype", "op") + + def __init__( + self, + dtype: plc.DataType, + op: plc.binaryop.BinaryOperator, + left: Expr, + right: Expr, + ) -> None: + self.dtype = dtype + if plc.traits.is_boolean(self.dtype): + # For boolean output types, bitand and bitor implement + # boolean logic, so translate. bitxor also does, but the + # default behaviour is correct. 
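+ # (Illustrative: Kleene logic gives ``True | null == True`` and + # ``False & null == False``; NULL_LOGICAL_AND/NULL_LOGICAL_OR implement + # these semantics, whereas the plain logical operators would simply + # propagate the null.)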
+ op = BinOp._BOOL_KLEENE_MAPPING.get(op, op) + self.op = op + self.children = (left, right) + if not plc.binaryop.is_supported_operation( + self.dtype, left.dtype, right.dtype, op + ): + raise NotImplementedError( + f"Operation {op.name} not supported " + f"for types {left.dtype.id().name} and {right.dtype.id().name} " + f"with output type {self.dtype.id().name}" + ) + + _BOOL_KLEENE_MAPPING: ClassVar[ + dict[plc.binaryop.BinaryOperator, plc.binaryop.BinaryOperator] + ] = { + plc.binaryop.BinaryOperator.BITWISE_AND: plc.binaryop.BinaryOperator.NULL_LOGICAL_AND, + plc.binaryop.BinaryOperator.BITWISE_OR: plc.binaryop.BinaryOperator.NULL_LOGICAL_OR, + plc.binaryop.BinaryOperator.LOGICAL_AND: plc.binaryop.BinaryOperator.NULL_LOGICAL_AND, + plc.binaryop.BinaryOperator.LOGICAL_OR: plc.binaryop.BinaryOperator.NULL_LOGICAL_OR, + } + + _MAPPING: ClassVar[dict[pl_expr.Operator, plc.binaryop.BinaryOperator]] = { + pl_expr.Operator.Eq: plc.binaryop.BinaryOperator.EQUAL, + pl_expr.Operator.EqValidity: plc.binaryop.BinaryOperator.NULL_EQUALS, + pl_expr.Operator.NotEq: plc.binaryop.BinaryOperator.NOT_EQUAL, + pl_expr.Operator.NotEqValidity: plc.binaryop.BinaryOperator.NULL_NOT_EQUALS, + pl_expr.Operator.Lt: plc.binaryop.BinaryOperator.LESS, + pl_expr.Operator.LtEq: plc.binaryop.BinaryOperator.LESS_EQUAL, + pl_expr.Operator.Gt: plc.binaryop.BinaryOperator.GREATER, + pl_expr.Operator.GtEq: plc.binaryop.BinaryOperator.GREATER_EQUAL, + pl_expr.Operator.Plus: plc.binaryop.BinaryOperator.ADD, + pl_expr.Operator.Minus: plc.binaryop.BinaryOperator.SUB, + pl_expr.Operator.Multiply: plc.binaryop.BinaryOperator.MUL, + pl_expr.Operator.Divide: plc.binaryop.BinaryOperator.DIV, + pl_expr.Operator.TrueDivide: plc.binaryop.BinaryOperator.TRUE_DIV, + pl_expr.Operator.FloorDivide: plc.binaryop.BinaryOperator.FLOOR_DIV, + pl_expr.Operator.Modulus: plc.binaryop.BinaryOperator.PYMOD, + pl_expr.Operator.And: plc.binaryop.BinaryOperator.BITWISE_AND, + pl_expr.Operator.Or: plc.binaryop.BinaryOperator.BITWISE_OR, + pl_expr.Operator.Xor: plc.binaryop.BinaryOperator.BITWISE_XOR, + pl_expr.Operator.LogicalAnd: plc.binaryop.BinaryOperator.LOGICAL_AND, + pl_expr.Operator.LogicalOr: plc.binaryop.BinaryOperator.LOGICAL_OR, + } + + def do_evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: Mapping[Expr, Column] | None = None, + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + left, right = ( + child.evaluate(df, context=context, mapping=mapping) + for child in self.children + ) + lop = left.obj + rop = right.obj + if left.obj.size() != right.obj.size(): + if left.is_scalar: + lop = left.obj_scalar + elif right.is_scalar: + rop = right.obj_scalar + return Column( + plc.binaryop.binary_operation(lop, rop, self.op, self.dtype), + ) + + def collect_agg(self, *, depth: int) -> AggInfo: + """Collect information about aggregations in groupbys.""" + if depth == 1: + # inside aggregation, need to pre-evaluate, + # groupby construction has checked that we don't have + # nested aggs, so stop the recursion and return ourselves + # for pre-eval + return AggInfo([(self, plc.aggregation.collect_list(), self)]) + else: + left_info, right_info = ( + child.collect_agg(depth=depth) for child in self.children + ) + requests = [*left_info.requests, *right_info.requests] + # TODO: Hack, if there were no reductions inside this + # binary expression then we want to pre-evaluate and + # collect ourselves. 
Otherwise we want to collect the + # aggregations inside and post-evaluate. This is a bad way + # of checking that we are in case 1. + if all( + agg.kind() == plc.aggregation.Kind.COLLECT_LIST + for _, agg, _ in requests + ): + return AggInfo([(self, plc.aggregation.collect_list(), self)]) + return AggInfo( + [*left_info.requests, *right_info.requests], + ) diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/boolean.py b/python/cudf_polars/cudf_polars/dsl/expressions/boolean.py new file mode 100644 index 00000000000..8db8172ebd1 --- /dev/null +++ b/python/cudf_polars/cudf_polars/dsl/expressions/boolean.py @@ -0,0 +1,269 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 +# TODO: remove need for this +# ruff: noqa: D101 +"""Boolean DSL nodes.""" + +from __future__ import annotations + +from functools import partial, reduce +from typing import TYPE_CHECKING, Any, ClassVar + +import pyarrow as pa + +from polars.polars import _expr_nodes as pl_expr + +import pylibcudf as plc + +from cudf_polars.containers import Column +from cudf_polars.dsl.expressions.base import ( + ExecutionContext, + Expr, +) + +if TYPE_CHECKING: + from collections.abc import Mapping + + import polars.type_aliases as pl_types + + from cudf_polars.containers import DataFrame + +__all__ = ["BooleanFunction"] + + +class BooleanFunction(Expr): + __slots__ = ("name", "options") + _non_child = ("dtype", "name", "options") + + def __init__( + self, + dtype: plc.DataType, + name: pl_expr.BooleanFunction, + options: tuple[Any, ...], + *children: Expr, + ) -> None: + self.dtype = dtype + self.options = options + self.name = name + self.children = children + if self.name == pl_expr.BooleanFunction.IsIn and not all( + c.dtype == self.children[0].dtype for c in self.children + ): + # TODO: If polars IR doesn't put the casts in, we need to + # mimic the supertype promotion rules. + raise NotImplementedError("IsIn doesn't support supertype casting") + + @staticmethod + def _distinct( + column: Column, + *, + keep: plc.stream_compaction.DuplicateKeepOption, + source_value: plc.Scalar, + target_value: plc.Scalar, + ) -> Column: + table = plc.Table([column.obj]) + indices = plc.stream_compaction.distinct_indices( + table, + keep, + # TODO: polars doesn't expose options for these + plc.types.NullEquality.EQUAL, + plc.types.NanEquality.ALL_EQUAL, + ) + return Column( + plc.copying.scatter( + [source_value], + indices, + plc.Table([plc.Column.from_scalar(target_value, table.num_rows())]), + ).columns()[0] + ) + + _BETWEEN_OPS: ClassVar[ + dict[ + pl_types.ClosedInterval, + tuple[plc.binaryop.BinaryOperator, plc.binaryop.BinaryOperator], + ] + ] = { + "none": ( + plc.binaryop.BinaryOperator.GREATER, + plc.binaryop.BinaryOperator.LESS, + ), + "left": ( + plc.binaryop.BinaryOperator.GREATER_EQUAL, + plc.binaryop.BinaryOperator.LESS, + ), + "right": ( + plc.binaryop.BinaryOperator.GREATER, + plc.binaryop.BinaryOperator.LESS_EQUAL, + ), + "both": ( + plc.binaryop.BinaryOperator.GREATER_EQUAL, + plc.binaryop.BinaryOperator.LESS_EQUAL, + ), + } + + def do_evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: Mapping[Expr, Column] | None = None, + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + if self.name in ( + pl_expr.BooleanFunction.IsFinite, + pl_expr.BooleanFunction.IsInfinite, + ): + # Avoid evaluating the child if the dtype tells us it's unnecessary. 
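+ # (Illustrative: ``is_finite`` on an integer column is vacuously + # all-True and ``is_infinite`` all-False, so we broadcast a single + # scalar rather than scanning the data.)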
+ (child,) = self.children + is_finite = self.name == pl_expr.BooleanFunction.IsFinite + if child.dtype.id() not in (plc.TypeId.FLOAT32, plc.TypeId.FLOAT64): + value = plc.interop.from_arrow( + pa.scalar(value=is_finite, type=plc.interop.to_arrow(self.dtype)) + ) + return Column(plc.Column.from_scalar(value, df.num_rows)) + needles = child.evaluate(df, context=context, mapping=mapping) + to_search = [-float("inf"), float("inf")] + if is_finite: + # NaN is neither finite nor infinite + to_search.append(float("nan")) + haystack = plc.interop.from_arrow( + pa.array( + to_search, + type=plc.interop.to_arrow(needles.obj.type()), + ) + ) + result = plc.search.contains(haystack, needles.obj) + if is_finite: + result = plc.unary.unary_operation(result, plc.unary.UnaryOperator.NOT) + return Column(result) + columns = [ + child.evaluate(df, context=context, mapping=mapping) + for child in self.children + ] + # Kleene logic for Any (OR) and All (AND) if ignore_nulls is + # False + if self.name in (pl_expr.BooleanFunction.Any, pl_expr.BooleanFunction.All): + (ignore_nulls,) = self.options + (column,) = columns + is_any = self.name == pl_expr.BooleanFunction.Any + agg = plc.aggregation.any() if is_any else plc.aggregation.all() + result = plc.reduce.reduce(column.obj, agg, self.dtype) + if not ignore_nulls and column.obj.null_count() > 0: + # Truth tables + # Any All + # | F U T | F U T + # --+------ --+------ + # F | F U T F | F F F + # U | U U T U | F U U + # T | T T T T | F U T + # + # If the input null count was non-zero, we must + # post-process the result to insert the correct value. + h_result = plc.interop.to_arrow(result).as_py() + if (is_any and not h_result) or (not is_any and h_result): + # Any All + # False || Null => Null True && Null => Null + return Column(plc.Column.all_null_like(column.obj, 1)) + return Column(plc.Column.from_scalar(result, 1)) + if self.name == pl_expr.BooleanFunction.IsNull: + (column,) = columns + return Column(plc.unary.is_null(column.obj)) + elif self.name == pl_expr.BooleanFunction.IsNotNull: + (column,) = columns + return Column(plc.unary.is_valid(column.obj)) + elif self.name == pl_expr.BooleanFunction.IsNan: + (column,) = columns + return Column( + plc.unary.is_nan(column.obj).with_mask( + column.obj.null_mask(), column.obj.null_count() + ) + ) + elif self.name == pl_expr.BooleanFunction.IsNotNan: + (column,) = columns + return Column( + plc.unary.is_not_nan(column.obj).with_mask( + column.obj.null_mask(), column.obj.null_count() + ) + ) + elif self.name == pl_expr.BooleanFunction.IsFirstDistinct: + (column,) = columns + return self._distinct( + column, + keep=plc.stream_compaction.DuplicateKeepOption.KEEP_FIRST, + source_value=plc.interop.from_arrow( + pa.scalar(value=True, type=plc.interop.to_arrow(self.dtype)) + ), + target_value=plc.interop.from_arrow( + pa.scalar(value=False, type=plc.interop.to_arrow(self.dtype)) + ), + ) + elif self.name == pl_expr.BooleanFunction.IsLastDistinct: + (column,) = columns + return self._distinct( + column, + keep=plc.stream_compaction.DuplicateKeepOption.KEEP_LAST, + source_value=plc.interop.from_arrow( + pa.scalar(value=True, type=plc.interop.to_arrow(self.dtype)) + ), + target_value=plc.interop.from_arrow( + pa.scalar(value=False, type=plc.interop.to_arrow(self.dtype)) + ), + ) + elif self.name == pl_expr.BooleanFunction.IsUnique: + (column,) = columns + return self._distinct( + column, + keep=plc.stream_compaction.DuplicateKeepOption.KEEP_NONE, + source_value=plc.interop.from_arrow( + pa.scalar(value=True,
type=plc.interop.to_arrow(self.dtype)) + ), + target_value=plc.interop.from_arrow( + pa.scalar(value=False, type=plc.interop.to_arrow(self.dtype)) + ), + ) + elif self.name == pl_expr.BooleanFunction.IsDuplicated: + (column,) = columns + return self._distinct( + column, + keep=plc.stream_compaction.DuplicateKeepOption.KEEP_NONE, + source_value=plc.interop.from_arrow( + pa.scalar(value=False, type=plc.interop.to_arrow(self.dtype)) + ), + target_value=plc.interop.from_arrow( + pa.scalar(value=True, type=plc.interop.to_arrow(self.dtype)) + ), + ) + elif self.name == pl_expr.BooleanFunction.AllHorizontal: + return Column( + reduce( + partial( + plc.binaryop.binary_operation, + op=plc.binaryop.BinaryOperator.NULL_LOGICAL_AND, + output_type=self.dtype, + ), + (c.obj for c in columns), + ) + ) + elif self.name == pl_expr.BooleanFunction.AnyHorizontal: + return Column( + reduce( + partial( + plc.binaryop.binary_operation, + op=plc.binaryop.BinaryOperator.NULL_LOGICAL_OR, + output_type=self.dtype, + ), + (c.obj for c in columns), + ) + ) + elif self.name == pl_expr.BooleanFunction.IsIn: + needles, haystack = columns + return Column(plc.search.contains(haystack.obj, needles.obj)) + elif self.name == pl_expr.BooleanFunction.Not: + (column,) = columns + return Column( + plc.unary.unary_operation(column.obj, plc.unary.UnaryOperator.NOT) + ) + else: + raise NotImplementedError( + f"BooleanFunction {self.name}" + ) # pragma: no cover; handled by init raising diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/datetime.py b/python/cudf_polars/cudf_polars/dsl/expressions/datetime.py new file mode 100644 index 00000000000..cd8e5c6a4eb --- /dev/null +++ b/python/cudf_polars/cudf_polars/dsl/expressions/datetime.py @@ -0,0 +1,134 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0 +# TODO: remove need for this +# ruff: noqa: D101 +"""DSL nodes for datetime operations.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, Any, ClassVar + +import pyarrow as pa + +from polars.polars import _expr_nodes as pl_expr + +import pylibcudf as plc + +from cudf_polars.containers import Column +from cudf_polars.dsl.expressions.base import ExecutionContext, Expr + +if TYPE_CHECKING: + from collections.abc import Mapping + + from cudf_polars.containers import DataFrame + +__all__ = ["TemporalFunction"] + + +class TemporalFunction(Expr): + __slots__ = ("name", "options") + _COMPONENT_MAP: ClassVar[ + dict[pl_expr.TemporalFunction, plc.datetime.DatetimeComponent] + ] = { + pl_expr.TemporalFunction.Year: plc.datetime.DatetimeComponent.YEAR, + pl_expr.TemporalFunction.Month: plc.datetime.DatetimeComponent.MONTH, + pl_expr.TemporalFunction.Day: plc.datetime.DatetimeComponent.DAY, + pl_expr.TemporalFunction.WeekDay: plc.datetime.DatetimeComponent.WEEKDAY, + pl_expr.TemporalFunction.Hour: plc.datetime.DatetimeComponent.HOUR, + pl_expr.TemporalFunction.Minute: plc.datetime.DatetimeComponent.MINUTE, + pl_expr.TemporalFunction.Second: plc.datetime.DatetimeComponent.SECOND, + pl_expr.TemporalFunction.Millisecond: plc.datetime.DatetimeComponent.MILLISECOND, + pl_expr.TemporalFunction.Microsecond: plc.datetime.DatetimeComponent.MICROSECOND, + pl_expr.TemporalFunction.Nanosecond: plc.datetime.DatetimeComponent.NANOSECOND, + } + _non_child = ("dtype", "name", "options") + + def __init__( + self, + dtype: plc.DataType, + name: pl_expr.TemporalFunction, + options: tuple[Any, ...], + *children: Expr, + ) -> None: + self.dtype = dtype + self.options = options + self.name = name + self.children = children + if self.name not in self._COMPONENT_MAP: + raise NotImplementedError(f"Temporal function {self.name}") + + def do_evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: Mapping[Expr, Column] | None = None, + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + columns = [ + child.evaluate(df, context=context, mapping=mapping) + for child in self.children + ] + (column,) = columns + if self.name == pl_expr.TemporalFunction.Microsecond: + millis = plc.datetime.extract_datetime_component( + column.obj, plc.datetime.DatetimeComponent.MILLISECOND + ) + micros = plc.datetime.extract_datetime_component( + column.obj, plc.datetime.DatetimeComponent.MICROSECOND + ) + millis_as_micros = plc.binaryop.binary_operation( + millis, + plc.interop.from_arrow(pa.scalar(1_000, type=pa.int32())), + plc.binaryop.BinaryOperator.MUL, + plc.DataType(plc.TypeId.INT32), + ) + total_micros = plc.binaryop.binary_operation( + micros, + millis_as_micros, + plc.binaryop.BinaryOperator.ADD, + plc.types.DataType(plc.types.TypeId.INT32), + ) + return Column(total_micros) + elif self.name == pl_expr.TemporalFunction.Nanosecond: + millis = plc.datetime.extract_datetime_component( + column.obj, plc.datetime.DatetimeComponent.MILLISECOND + ) + micros = plc.datetime.extract_datetime_component( + column.obj, plc.datetime.DatetimeComponent.MICROSECOND + ) + nanos = plc.datetime.extract_datetime_component( + column.obj, plc.datetime.DatetimeComponent.NANOSECOND + ) + millis_as_nanos = plc.binaryop.binary_operation( + millis, + plc.interop.from_arrow(pa.scalar(1_000_000, type=pa.int32())), + plc.binaryop.BinaryOperator.MUL, + plc.types.DataType(plc.types.TypeId.INT32), + ) + micros_as_nanos = 
plc.binaryop.binary_operation( + micros, + plc.interop.from_arrow(pa.scalar(1_000, type=pa.int32())), + plc.binaryop.BinaryOperator.MUL, + plc.types.DataType(plc.types.TypeId.INT32), + ) + total_nanos = plc.binaryop.binary_operation( + nanos, + millis_as_nanos, + plc.binaryop.BinaryOperator.ADD, + plc.types.DataType(plc.types.TypeId.INT32), + ) + total_nanos = plc.binaryop.binary_operation( + total_nanos, + micros_as_nanos, + plc.binaryop.BinaryOperator.ADD, + plc.types.DataType(plc.types.TypeId.INT32), + ) + return Column(total_nanos) + + return Column( + plc.datetime.extract_datetime_component( + column.obj, + self._COMPONENT_MAP[self.name], + ) + ) diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/literal.py b/python/cudf_polars/cudf_polars/dsl/expressions/literal.py new file mode 100644 index 00000000000..7eba0c110ab --- /dev/null +++ b/python/cudf_polars/cudf_polars/dsl/expressions/literal.py @@ -0,0 +1,89 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 +# TODO: remove need for this +# ruff: noqa: D101 +"""Literal DSL nodes.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, Any + +import pyarrow as pa + +import pylibcudf as plc + +from cudf_polars.containers import Column +from cudf_polars.dsl.expressions.base import AggInfo, ExecutionContext, Expr +from cudf_polars.utils import dtypes + +if TYPE_CHECKING: + from collections.abc import Hashable, Mapping + + import pyarrow as pa + + import polars as pl + + from cudf_polars.containers import DataFrame + +__all__ = ["Literal", "LiteralColumn"] + + +class Literal(Expr): + __slots__ = ("value",) + _non_child = ("dtype", "value") + value: pa.Scalar[Any] + + def __init__(self, dtype: plc.DataType, value: pa.Scalar[Any]) -> None: + self.dtype = dtype + assert value.type == plc.interop.to_arrow(dtype) + self.value = value + self.children = () + + def do_evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: Mapping[Expr, Column] | None = None, + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + # datatype of pyarrow scalar is correct by construction. + return Column(plc.Column.from_scalar(plc.interop.from_arrow(self.value), 1)) + + def collect_agg(self, *, depth: int) -> AggInfo: + """Collect information about aggregations in groupbys.""" + return AggInfo([]) + + +class LiteralColumn(Expr): + __slots__ = ("value",) + _non_child = ("dtype", "value") + value: pa.Array[Any] + + def __init__(self, dtype: plc.DataType, value: pl.Series) -> None: + self.dtype = dtype + data = value.to_arrow() + self.value = data.cast(dtypes.downcast_arrow_lists(data.type)) + self.children = () + + def get_hashable(self) -> Hashable: + """Compute a hash of the column.""" + # This is stricter than necessary, but we only need this hash + # for identity in groupby replacements so it's OK. And this + # way we avoid doing potentially expensive compute. + return (type(self), self.dtype, id(self.value)) + + def do_evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: Mapping[Expr, Column] | None = None, + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + # datatype of pyarrow array is correct by construction. 
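+ # (Illustrative note: ``__init__`` already cast the arrow data with + # ``dtypes.downcast_arrow_lists`` (e.g. ``large_list<T>`` -> ``list<T>``), + # so ``from_arrow`` produces a column of the expected type.)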
+ return Column(plc.interop.from_arrow(self.value)) + + def collect_agg(self, *, depth: int) -> AggInfo: + """Collect information about aggregations in groupbys.""" + return AggInfo([]) diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/rolling.py b/python/cudf_polars/cudf_polars/dsl/expressions/rolling.py new file mode 100644 index 00000000000..fa68bcb9426 --- /dev/null +++ b/python/cudf_polars/cudf_polars/dsl/expressions/rolling.py @@ -0,0 +1,38 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 +# TODO: remove need for this +# ruff: noqa: D101 +"""Rolling DSL nodes.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, Any + +from cudf_polars.dsl.expressions.base import Expr + +if TYPE_CHECKING: + import pylibcudf as plc + +__all__ = ["RollingWindow", "GroupedRollingWindow"] + + +class RollingWindow(Expr): + __slots__ = ("options",) + _non_child = ("dtype", "options") + + def __init__(self, dtype: plc.DataType, options: Any, agg: Expr) -> None: + self.dtype = dtype + self.options = options + self.children = (agg,) + raise NotImplementedError("Rolling window not implemented") + + +class GroupedRollingWindow(Expr): + __slots__ = ("options",) + _non_child = ("dtype", "options") + + def __init__(self, dtype: plc.DataType, options: Any, agg: Expr, *by: Expr) -> None: + self.dtype = dtype + self.options = options + self.children = (agg, *by) + raise NotImplementedError("Grouped rolling window not implemented") diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/selection.py b/python/cudf_polars/cudf_polars/dsl/expressions/selection.py new file mode 100644 index 00000000000..77d7d4c0d22 --- /dev/null +++ b/python/cudf_polars/cudf_polars/dsl/expressions/selection.py @@ -0,0 +1,90 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0 +# TODO: remove need for this +# ruff: noqa: D101 +"""DSL nodes for selection operations.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +import pyarrow as pa + +import pylibcudf as plc + +from cudf_polars.containers import Column +from cudf_polars.dsl.expressions.base import ExecutionContext, Expr + +if TYPE_CHECKING: + from collections.abc import Mapping + + from cudf_polars.containers import DataFrame + +__all__ = ["Gather", "Filter"] + + +class Gather(Expr): + __slots__ = () + _non_child = ("dtype",) + + def __init__(self, dtype: plc.DataType, values: Expr, indices: Expr) -> None: + self.dtype = dtype + self.children = (values, indices) + + def do_evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: Mapping[Expr, Column] | None = None, + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + values, indices = ( + child.evaluate(df, context=context, mapping=mapping) + for child in self.children + ) + lo, hi = plc.reduce.minmax(indices.obj) + lo = plc.interop.to_arrow(lo).as_py() + hi = plc.interop.to_arrow(hi).as_py() + n = df.num_rows + if hi >= n or lo < -n: + raise ValueError("gather indices are out of bounds") + if indices.obj.null_count(): + bounds_policy = plc.copying.OutOfBoundsPolicy.NULLIFY + obj = plc.replace.replace_nulls( + indices.obj, + plc.interop.from_arrow( + pa.scalar(n, type=plc.interop.to_arrow(indices.obj.type())) + ), + ) + else: + bounds_policy = plc.copying.OutOfBoundsPolicy.DONT_CHECK + obj = indices.obj + table = plc.copying.gather(plc.Table([values.obj]), obj, bounds_policy) + return Column(table.columns()[0]) + + +class Filter(Expr): + __slots__ = () + _non_child = ("dtype",) + + def __init__(self, dtype: plc.DataType, values: Expr, indices: Expr): + self.dtype = dtype + self.children = (values, indices) + + def do_evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: Mapping[Expr, Column] | None = None, + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + values, mask = ( + child.evaluate(df, context=context, mapping=mapping) + for child in self.children + ) + table = plc.stream_compaction.apply_boolean_mask( + plc.Table([values.obj]), mask.obj + ) + return Column(table.columns()[0]).sorted_like(values) diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/sorting.py b/python/cudf_polars/cudf_polars/dsl/expressions/sorting.py new file mode 100644 index 00000000000..99512e2ef52 --- /dev/null +++ b/python/cudf_polars/cudf_polars/dsl/expressions/sorting.py @@ -0,0 +1,95 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0 +# TODO: remove need for this +# ruff: noqa: D101 +"""Sorting DSL nodes.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +import pylibcudf as plc + +from cudf_polars.containers import Column +from cudf_polars.dsl.expressions.base import ExecutionContext, Expr +from cudf_polars.utils import sorting + +if TYPE_CHECKING: + from collections.abc import Mapping + + from cudf_polars.containers import DataFrame + +__all__ = ["Sort", "SortBy"] + + +class Sort(Expr): + __slots__ = ("options",) + _non_child = ("dtype", "options") + + def __init__( + self, dtype: plc.DataType, options: tuple[bool, bool, bool], column: Expr + ) -> None: + self.dtype = dtype + self.options = options + self.children = (column,) + + def do_evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: Mapping[Expr, Column] | None = None, + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + (child,) = self.children + column = child.evaluate(df, context=context, mapping=mapping) + (stable, nulls_last, descending) = self.options + order, null_order = sorting.sort_order( + [descending], nulls_last=[nulls_last], num_keys=1 + ) + do_sort = plc.sorting.stable_sort if stable else plc.sorting.sort + table = do_sort(plc.Table([column.obj]), order, null_order) + return Column( + table.columns()[0], + is_sorted=plc.types.Sorted.YES, + order=order[0], + null_order=null_order[0], + ) + + +class SortBy(Expr): + __slots__ = ("options",) + _non_child = ("dtype", "options") + + def __init__( + self, + dtype: plc.DataType, + options: tuple[bool, tuple[bool], tuple[bool]], + column: Expr, + *by: Expr, + ) -> None: + self.dtype = dtype + self.options = options + self.children = (column, *by) + + def do_evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: Mapping[Expr, Column] | None = None, + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + column, *by = ( + child.evaluate(df, context=context, mapping=mapping) + for child in self.children + ) + (stable, nulls_last, descending) = self.options + order, null_order = sorting.sort_order( + descending, nulls_last=nulls_last, num_keys=len(by) + ) + do_sort = plc.sorting.stable_sort_by_key if stable else plc.sorting.sort_by_key + table = do_sort( + plc.Table([column.obj]), plc.Table([c.obj for c in by]), order, null_order + ) + return Column(table.columns()[0]) diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/string.py b/python/cudf_polars/cudf_polars/dsl/expressions/string.py new file mode 100644 index 00000000000..8b66c9d4676 --- /dev/null +++ b/python/cudf_polars/cudf_polars/dsl/expressions/string.py @@ -0,0 +1,283 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0 +# TODO: remove need for this +# ruff: noqa: D101 +"""DSL nodes for string operations.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, Any + +import pyarrow as pa +import pyarrow.compute as pc + +from polars.exceptions import InvalidOperationError +from polars.polars import _expr_nodes as pl_expr + +import pylibcudf as plc + +from cudf_polars.containers import Column +from cudf_polars.dsl.expressions.base import ExecutionContext, Expr +from cudf_polars.dsl.expressions.literal import Literal, LiteralColumn + +if TYPE_CHECKING: + from collections.abc import Mapping + + from cudf_polars.containers import DataFrame + +__all__ = ["StringFunction"] + + +class StringFunction(Expr): + __slots__ = ("name", "options", "_regex_program") + _non_child = ("dtype", "name", "options") + + def __init__( + self, + dtype: plc.DataType, + name: pl_expr.StringFunction, + options: tuple[Any, ...], + *children: Expr, + ) -> None: + self.dtype = dtype + self.options = options + self.name = name + self.children = children + self._validate_input() + + def _validate_input(self): + if self.name not in ( + pl_expr.StringFunction.Contains, + pl_expr.StringFunction.EndsWith, + pl_expr.StringFunction.Lowercase, + pl_expr.StringFunction.Replace, + pl_expr.StringFunction.ReplaceMany, + pl_expr.StringFunction.Slice, + pl_expr.StringFunction.Strptime, + pl_expr.StringFunction.StartsWith, + pl_expr.StringFunction.StripChars, + pl_expr.StringFunction.StripCharsStart, + pl_expr.StringFunction.StripCharsEnd, + pl_expr.StringFunction.Uppercase, + ): + raise NotImplementedError(f"String function {self.name}") + if self.name == pl_expr.StringFunction.Contains: + literal, strict = self.options + if not literal: + if not strict: + raise NotImplementedError( + f"{strict=} is not supported for regex contains" + ) + if not isinstance(self.children[1], Literal): + raise NotImplementedError( + "Regex contains only supports a scalar pattern" + ) + pattern = self.children[1].value.as_py() + try: + self._regex_program = plc.strings.regex_program.RegexProgram.create( + pattern, + flags=plc.strings.regex_flags.RegexFlags.DEFAULT, + ) + except RuntimeError as e: + raise NotImplementedError( + f"Unsupported regex {pattern} for GPU engine."
+ ) from e + elif self.name == pl_expr.StringFunction.Replace: + _, literal = self.options + if not literal: + raise NotImplementedError("literal=False is not supported for replace") + if not all(isinstance(expr, Literal) for expr in self.children[1:]): + raise NotImplementedError("replace only supports scalar target") + target = self.children[1] + if target.value == pa.scalar("", type=pa.string()): + raise NotImplementedError( + "libcudf replace does not support empty strings" + ) + elif self.name == pl_expr.StringFunction.ReplaceMany: + (ascii_case_insensitive,) = self.options + if ascii_case_insensitive: + raise NotImplementedError( + "ascii_case_insensitive not implemented for replace_many" + ) + if not all( + isinstance(expr, (LiteralColumn, Literal)) for expr in self.children[1:] + ): + raise NotImplementedError("replace_many only supports literal inputs") + target = self.children[1] + if pc.any(pc.equal(target.value, "")).as_py(): + raise NotImplementedError( + "libcudf replace_many is implemented differently from polars " + "for empty strings" + ) + elif self.name == pl_expr.StringFunction.Slice: + if not all(isinstance(child, Literal) for child in self.children[1:]): + raise NotImplementedError( + "Slice only supports literal start and stop values" + ) + elif self.name == pl_expr.StringFunction.Strptime: + format, _, exact, cache = self.options + if cache: + raise NotImplementedError("Strptime cache is a CPU feature") + if format is None: + raise NotImplementedError("Strptime format is required") + if not exact: + raise NotImplementedError("Strptime does not support exact=False") + elif self.name in { + pl_expr.StringFunction.StripChars, + pl_expr.StringFunction.StripCharsStart, + pl_expr.StringFunction.StripCharsEnd, + }: + if not isinstance(self.children[1], Literal): + raise NotImplementedError( + "strip operations only support scalar patterns" + ) + + def do_evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: Mapping[Expr, Column] | None = None, + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + if self.name == pl_expr.StringFunction.Contains: + child, arg = self.children + column = child.evaluate(df, context=context, mapping=mapping) + + literal, _ = self.options + if literal: + pat = arg.evaluate(df, context=context, mapping=mapping) + pattern = ( + pat.obj_scalar + if pat.is_scalar and pat.obj.size() != column.obj.size() + else pat.obj + ) + return Column(plc.strings.find.contains(column.obj, pattern)) + else: + return Column( + plc.strings.contains.contains_re(column.obj, self._regex_program) + ) + elif self.name == pl_expr.StringFunction.Slice: + child, expr_offset, expr_length = self.children + assert isinstance(expr_offset, Literal) + assert isinstance(expr_length, Literal) + + column = child.evaluate(df, context=context, mapping=mapping) + # libcudf slices via [start,stop). + # polars slices with offset + length where start == offset + # stop = start + length. Negative values for start look backward + # from the last element of the string. If the end index would be + # below zero, an empty string is returned. 
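+            # Worked example: slicing "abcdef" with offset=-3, length=2
+            # gives "de" in polars; start=-3, stop=-1 reproduces that with
+            # libcudf's [start, stop). With offset=-2, length=3 the slice
+            # would run past the end, so we pass a null stop (scan to the
+            # end) rather than the incorrect stop = start + length = 1.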
+ # Do this maths on the host + start = expr_offset.value.as_py() + length = expr_length.value.as_py() + + if length == 0: + stop = start + else: + # No length indicates a scan to the end + # The libcudf equivalent is a null stop + stop = start + length if length else None + if length and start < 0 and length >= -start: + stop = None + return Column( + plc.strings.slice.slice_strings( + column.obj, + plc.interop.from_arrow(pa.scalar(start, type=pa.int32())), + plc.interop.from_arrow(pa.scalar(stop, type=pa.int32())), + ) + ) + elif self.name in { + pl_expr.StringFunction.StripChars, + pl_expr.StringFunction.StripCharsStart, + pl_expr.StringFunction.StripCharsEnd, + }: + column, chars = ( + c.evaluate(df, context=context, mapping=mapping) for c in self.children + ) + if self.name == pl_expr.StringFunction.StripCharsStart: + side = plc.strings.SideType.LEFT + elif self.name == pl_expr.StringFunction.StripCharsEnd: + side = plc.strings.SideType.RIGHT + else: + side = plc.strings.SideType.BOTH + return Column(plc.strings.strip.strip(column.obj, side, chars.obj_scalar)) + + columns = [ + child.evaluate(df, context=context, mapping=mapping) + for child in self.children + ] + if self.name == pl_expr.StringFunction.Lowercase: + (column,) = columns + return Column(plc.strings.case.to_lower(column.obj)) + elif self.name == pl_expr.StringFunction.Uppercase: + (column,) = columns + return Column(plc.strings.case.to_upper(column.obj)) + elif self.name == pl_expr.StringFunction.EndsWith: + column, suffix = columns + return Column( + plc.strings.find.ends_with( + column.obj, + suffix.obj_scalar + if column.obj.size() != suffix.obj.size() and suffix.is_scalar + else suffix.obj, + ) + ) + elif self.name == pl_expr.StringFunction.StartsWith: + column, prefix = columns + return Column( + plc.strings.find.starts_with( + column.obj, + prefix.obj_scalar + if column.obj.size() != prefix.obj.size() and prefix.is_scalar + else prefix.obj, + ) + ) + elif self.name == pl_expr.StringFunction.Strptime: + # TODO: ignores ambiguous + format, strict, exact, cache = self.options + col = self.children[0].evaluate(df, context=context, mapping=mapping) + + is_timestamps = plc.strings.convert.convert_datetime.is_timestamp( + col.obj, format + ) + + if strict: + if not plc.interop.to_arrow( + plc.reduce.reduce( + is_timestamps, + plc.aggregation.all(), + plc.DataType(plc.TypeId.BOOL8), + ) + ).as_py(): + raise InvalidOperationError("conversion from `str` failed.") + else: + not_timestamps = plc.unary.unary_operation( + is_timestamps, plc.unary.UnaryOperator.NOT + ) + + null = plc.interop.from_arrow(pa.scalar(None, type=pa.string())) + res = plc.copying.boolean_mask_scatter( + [null], plc.Table([col.obj]), not_timestamps + ) + return Column( + plc.strings.convert.convert_datetime.to_timestamps( + res.columns()[0], self.dtype, format + ) + ) + elif self.name == pl_expr.StringFunction.Replace: + column, target, repl = columns + n, _ = self.options + return Column( + plc.strings.replace.replace( + column.obj, target.obj_scalar, repl.obj_scalar, maxrepl=n + ) + ) + elif self.name == pl_expr.StringFunction.ReplaceMany: + column, target, repl = columns + return Column( + plc.strings.replace.replace_multiple(column.obj, target.obj, repl.obj) + ) + raise NotImplementedError( + f"StringFunction {self.name}" + ) # pragma: no cover; handled by init raising diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/ternary.py b/python/cudf_polars/cudf_polars/dsl/expressions/ternary.py new file mode 100644 index 00000000000..d2b5d6bae29 
--- /dev/null +++ b/python/cudf_polars/cudf_polars/dsl/expressions/ternary.py @@ -0,0 +1,52 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 +# TODO: remove need for this +# ruff: noqa: D101 +"""DSL nodes for ternary operations.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +import pylibcudf as plc + +from cudf_polars.containers import Column +from cudf_polars.dsl.expressions.base import ( + ExecutionContext, + Expr, +) + +if TYPE_CHECKING: + from collections.abc import Mapping + + from cudf_polars.containers import DataFrame + + +__all__ = ["Ternary"] + + +class Ternary(Expr): + __slots__ = () + _non_child = ("dtype",) + + def __init__( + self, dtype: plc.DataType, when: Expr, then: Expr, otherwise: Expr + ) -> None: + self.dtype = dtype + self.children = (when, then, otherwise) + + def do_evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: Mapping[Expr, Column] | None = None, + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + when, then, otherwise = ( + child.evaluate(df, context=context, mapping=mapping) + for child in self.children + ) + then_obj = then.obj_scalar if then.is_scalar else then.obj + otherwise_obj = otherwise.obj_scalar if otherwise.is_scalar else otherwise.obj + return Column(plc.copying.copy_if_else(then_obj, otherwise_obj, when.obj)) diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/unary.py b/python/cudf_polars/cudf_polars/dsl/expressions/unary.py new file mode 100644 index 00000000000..7999ec86068 --- /dev/null +++ b/python/cudf_polars/cudf_polars/dsl/expressions/unary.py @@ -0,0 +1,329 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0 +# TODO: remove need for this +"""DSL nodes for unary operations.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, Any, ClassVar + +import pyarrow as pa + +import pylibcudf as plc + +from cudf_polars.containers import Column +from cudf_polars.dsl.expressions.base import AggInfo, ExecutionContext, Expr +from cudf_polars.dsl.expressions.literal import Literal +from cudf_polars.utils import dtypes + +if TYPE_CHECKING: + from collections.abc import Mapping + + from cudf_polars.containers import DataFrame + +__all__ = ["Cast", "UnaryFunction", "Len"] + + +class Cast(Expr): + """Class representing a cast of an expression.""" + + __slots__ = () + _non_child = ("dtype",) + + def __init__(self, dtype: plc.DataType, value: Expr) -> None: + self.dtype = dtype + self.children = (value,) + if not dtypes.can_cast(value.dtype, self.dtype): + raise NotImplementedError( + f"Can't cast {value.dtype.id().name} to {self.dtype.id().name}" + ) + + def do_evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: Mapping[Expr, Column] | None = None, + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + (child,) = self.children + column = child.evaluate(df, context=context, mapping=mapping) + return column.astype(self.dtype) + + def collect_agg(self, *, depth: int) -> AggInfo: + """Collect information about aggregations in groupbys.""" + # TODO: Could do with sort-based groupby and segmented filter + (child,) = self.children + return child.collect_agg(depth=depth) + + +class Len(Expr): + """Class representing the length of an expression.""" + + def __init__(self, dtype: plc.DataType) -> None: + self.dtype = dtype + self.children = () + + def do_evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: Mapping[Expr, Column] | None = None, + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + return Column( + plc.Column.from_scalar( + plc.interop.from_arrow( + pa.scalar(df.num_rows, type=plc.interop.to_arrow(self.dtype)) + ), + 1, + ) + ) + + def collect_agg(self, *, depth: int) -> AggInfo: + """Collect information about aggregations in groupbys.""" + # TODO: polars returns a uint, not an int for count + return AggInfo( + [(None, plc.aggregation.count(plc.types.NullPolicy.INCLUDE), self)] + ) + + +class UnaryFunction(Expr): + """Class representing unary functions of an expression.""" + + __slots__ = ("name", "options") + _non_child = ("dtype", "name", "options") + + # Note: log, and pow are handled via translation to binops + _OP_MAPPING: ClassVar[dict[str, plc.unary.UnaryOperator]] = { + "sin": plc.unary.UnaryOperator.SIN, + "cos": plc.unary.UnaryOperator.COS, + "tan": plc.unary.UnaryOperator.TAN, + "arcsin": plc.unary.UnaryOperator.ARCSIN, + "arccos": plc.unary.UnaryOperator.ARCCOS, + "arctan": plc.unary.UnaryOperator.ARCTAN, + "sinh": plc.unary.UnaryOperator.SINH, + "cosh": plc.unary.UnaryOperator.COSH, + "tanh": plc.unary.UnaryOperator.TANH, + "arcsinh": plc.unary.UnaryOperator.ARCSINH, + "arccosh": plc.unary.UnaryOperator.ARCCOSH, + "arctanh": plc.unary.UnaryOperator.ARCTANH, + "exp": plc.unary.UnaryOperator.EXP, + "sqrt": plc.unary.UnaryOperator.SQRT, + "cbrt": plc.unary.UnaryOperator.CBRT, + "ceil": plc.unary.UnaryOperator.CEIL, + "floor": plc.unary.UnaryOperator.FLOOR, + "abs": plc.unary.UnaryOperator.ABS, + "bit_invert": plc.unary.UnaryOperator.BIT_INVERT, + "not": plc.unary.UnaryOperator.NOT, + 
} + _supported_misc_fns = frozenset( + { + "drop_nulls", + "fill_null", + "mask_nans", + "round", + "set_sorted", + "unique", + } + ) + _supported_cum_aggs = frozenset( + { + "cum_min", + "cum_max", + "cum_prod", + "cum_sum", + } + ) + _supported_fns = frozenset().union( + _supported_misc_fns, _supported_cum_aggs, _OP_MAPPING.keys() + ) + + def __init__( + self, dtype: plc.DataType, name: str, options: tuple[Any, ...], *children: Expr + ) -> None: + self.dtype = dtype + self.name = name + self.options = options + self.children = children + + if self.name not in UnaryFunction._supported_fns: + raise NotImplementedError(f"Unary function {name=}") + if self.name in UnaryFunction._supported_cum_aggs: + (reverse,) = self.options + if reverse: + raise NotImplementedError( + "reverse=True is not supported for cumulative aggregations" + ) + + def do_evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: Mapping[Expr, Column] | None = None, + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + if self.name == "mask_nans": + (child,) = self.children + return child.evaluate(df, context=context, mapping=mapping).mask_nans() + if self.name == "round": + (decimal_places,) = self.options + (values,) = ( + child.evaluate(df, context=context, mapping=mapping) + for child in self.children + ) + return Column( + plc.round.round( + values.obj, decimal_places, plc.round.RoundingMethod.HALF_UP + ) + ).sorted_like(values) + elif self.name == "unique": + (maintain_order,) = self.options + (values,) = ( + child.evaluate(df, context=context, mapping=mapping) + for child in self.children + ) + # Only one column, so keep_any is the same as keep_first + # for stable distinct + keep = plc.stream_compaction.DuplicateKeepOption.KEEP_ANY + if values.is_sorted: + maintain_order = True + result = plc.stream_compaction.unique( + plc.Table([values.obj]), + [0], + keep, + plc.types.NullEquality.EQUAL, + ) + else: + distinct = ( + plc.stream_compaction.stable_distinct + if maintain_order + else plc.stream_compaction.distinct + ) + result = distinct( + plc.Table([values.obj]), + [0], + keep, + plc.types.NullEquality.EQUAL, + plc.types.NanEquality.ALL_EQUAL, + ) + (column,) = result.columns() + if maintain_order: + return Column(column).sorted_like(values) + return Column(column) + elif self.name == "set_sorted": + (column,) = ( + child.evaluate(df, context=context, mapping=mapping) + for child in self.children + ) + (asc,) = self.options + order = ( + plc.types.Order.ASCENDING + if asc == "ascending" + else plc.types.Order.DESCENDING + ) + null_order = plc.types.NullOrder.BEFORE + if column.obj.null_count() > 0 and (n := column.obj.size()) > 1: + # PERF: This invokes four stream synchronisations! 
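+                # set_sorted only asserts sortedness, so we inspect the
+                # validity of the first and last elements of the already
+                # sorted column to deduce where its nulls ended up.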
+ has_nulls_first = not plc.copying.get_element(column.obj, 0).is_valid() + has_nulls_last = not plc.copying.get_element( + column.obj, n - 1 + ).is_valid() + if (order == plc.types.Order.DESCENDING and has_nulls_first) or ( + order == plc.types.Order.ASCENDING and has_nulls_last + ): + null_order = plc.types.NullOrder.AFTER + return column.set_sorted( + is_sorted=plc.types.Sorted.YES, + order=order, + null_order=null_order, + ) + elif self.name == "drop_nulls": + (column,) = ( + child.evaluate(df, context=context, mapping=mapping) + for child in self.children + ) + return Column( + plc.stream_compaction.drop_nulls( + plc.Table([column.obj]), [0], 1 + ).columns()[0] + ) + elif self.name == "fill_null": + column = self.children[0].evaluate(df, context=context, mapping=mapping) + if isinstance(self.children[1], Literal): + arg = plc.interop.from_arrow(self.children[1].value) + else: + evaluated = self.children[1].evaluate( + df, context=context, mapping=mapping + ) + arg = evaluated.obj_scalar if evaluated.is_scalar else evaluated.obj + return Column(plc.replace.replace_nulls(column.obj, arg)) + elif self.name in self._OP_MAPPING: + column = self.children[0].evaluate(df, context=context, mapping=mapping) + if column.obj.type().id() != self.dtype.id(): + arg = plc.unary.cast(column.obj, self.dtype) + else: + arg = column.obj + return Column(plc.unary.unary_operation(arg, self._OP_MAPPING[self.name])) + elif self.name in UnaryFunction._supported_cum_aggs: + column = self.children[0].evaluate(df, context=context, mapping=mapping) + plc_col = column.obj + col_type = column.obj.type() + # cum_sum casts + # Int8, UInt8, Int16, UInt16 -> Int64 for overflow prevention + # Bool -> UInt32 + # cum_prod casts integer dtypes < int64 and bool to int64 + # See: + # https://github.com/pola-rs/polars/blob/main/crates/polars-ops/src/series/ops/cum_agg.rs + if ( + self.name == "cum_sum" + and col_type.id() + in { + plc.types.TypeId.INT8, + plc.types.TypeId.UINT8, + plc.types.TypeId.INT16, + plc.types.TypeId.UINT16, + } + ) or ( + self.name == "cum_prod" + and plc.traits.is_integral(col_type) + and plc.types.size_of(col_type) <= 4 + ): + plc_col = plc.unary.cast( + plc_col, plc.types.DataType(plc.types.TypeId.INT64) + ) + elif ( + self.name == "cum_sum" + and column.obj.type().id() == plc.types.TypeId.BOOL8 + ): + plc_col = plc.unary.cast( + plc_col, plc.types.DataType(plc.types.TypeId.UINT32) + ) + if self.name == "cum_sum": + agg = plc.aggregation.sum() + elif self.name == "cum_prod": + agg = plc.aggregation.product() + elif self.name == "cum_min": + agg = plc.aggregation.min() + elif self.name == "cum_max": + agg = plc.aggregation.max() + + return Column(plc.reduce.scan(plc_col, agg, plc.reduce.ScanType.INCLUSIVE)) + raise NotImplementedError( + f"Unimplemented unary function {self.name=}" + ) # pragma: no cover; init trips first + + def collect_agg(self, *, depth: int) -> AggInfo: + """Collect information about aggregations in groupbys.""" + if self.name in {"unique", "drop_nulls"} | self._supported_cum_aggs: + raise NotImplementedError(f"{self.name} in groupby") + if depth == 1: + # inside aggregation, need to pre-evaluate, groupby + # construction has checked that we don't have nested aggs, + # so stop the recursion and return ourselves for pre-eval + return AggInfo([(self, plc.aggregation.collect_list(), self)]) + else: + (child,) = self.children + return child.collect_agg(depth=depth) diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index 
8cd56c8ee3a..62a2da9dcea 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -13,24 +13,28 @@ from __future__ import annotations -import dataclasses import itertools +import json from functools import cache from pathlib import Path from typing import TYPE_CHECKING, Any, ClassVar import pyarrow as pa -import pylibcudf as plc from typing_extensions import assert_never import polars as pl +import pylibcudf as plc + import cudf_polars.dsl.expr as expr -from cudf_polars.containers import DataFrame, NamedColumn -from cudf_polars.utils import dtypes, sorting +from cudf_polars.containers import Column, DataFrame +from cudf_polars.dsl.nodebase import Node +from cudf_polars.dsl.to_ast import to_ast, to_parquet_filter +from cudf_polars.utils import dtypes +from cudf_polars.utils.versions import POLARS_VERSION_GT_112 if TYPE_CHECKING: - from collections.abc import Callable, MutableMapping + from collections.abc import Callable, Hashable, MutableMapping, Sequence from typing import Literal from cudf_polars.typing import Schema @@ -38,6 +42,7 @@ __all__ = [ "IR", + "ErrorNode", "PythonScan", "Scan", "Cache", @@ -45,6 +50,7 @@ "Select", "GroupBy", "Join", + "ConditionalJoin", "HStack", "Distinct", "Sort", @@ -57,9 +63,7 @@ ] -def broadcast( - *columns: NamedColumn, target_length: int | None = None -) -> list[NamedColumn]: +def broadcast(*columns: Column, target_length: int | None = None) -> list[Column]: """ Broadcast a sequence of columns to a common length. @@ -112,31 +116,73 @@ def broadcast( return [ column if column.obj.size() != 1 - else NamedColumn( + else Column( plc.Column.from_scalar(column.obj_scalar, nrows), - column.name, is_sorted=plc.types.Sorted.YES, order=plc.types.Order.ASCENDING, null_order=plc.types.NullOrder.BEFORE, + name=column.name, ) for column in columns ] -@dataclasses.dataclass -class IR: +class IR(Node["IR"]): """Abstract plan node, representing an unevaluated dataframe.""" + __slots__ = ("schema", "_non_child_args") + # This annotation is needed because of https://github.com/python/mypy/issues/17981 + _non_child: ClassVar[tuple[str, ...]] = ("schema",) + # Concrete classes should set this up with the arguments that will + # be passed to do_evaluate. + _non_child_args: tuple[Any, ...] schema: Schema """Mapping from column names to their data types.""" - def __post_init__(self): - """Validate preconditions.""" - pass # noqa: PIE790 + def get_hashable(self) -> Hashable: + """ + Hashable representation of node, treating schema dictionary. + + Since the schema is a dictionary, even though it is morally + immutable, it is not hashable. We therefore convert it to + tuples for hashing purposes. + """ + # Schema is the first constructor argument + args = self._ctor_arguments(self.children)[1:] + schema_hash = tuple(self.schema.items()) + return (type(self), schema_hash, args) + + # Hacky to avoid type-checking issues, just advertise the + # signature. Both mypy and pyright complain if we have an abstract + # method that takes arbitrary *args, but the subclasses have + # tighter signatures. This complaint is correct because the + # subclass is not Liskov-substitutable for the superclass. + # However, we know do_evaluate will only be called with the + # correct arguments by "construction". + do_evaluate: Callable[..., DataFrame] + """ + Evaluate the node (given its evaluated children), and return a dataframe. + + Parameters + ---------- + args + Non child arguments followed by any evaluated dataframe inputs. 
+ + Returns + ------- + DataFrame (on device) representing the evaluation of this plan + node. + + Raises + ------ + NotImplementedError + If evaluation fails. Ideally this should not occur, since the + translation phase should fail earlier. + """ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """ - Evaluate the node and return a dataframe. + Evaluate the node (recursively) and return a dataframe. Parameters ---------- @@ -144,50 +190,104 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: Mapping from cached node ids to constructed DataFrames. Used to implement evaluation of the `Cache` node. + Notes + ----- + Prefer not to override this method. Instead implement + :meth:`do_evaluate` which doesn't encode a recursion scheme + and just assumes already evaluated inputs. + Returns ------- DataFrame (on device) representing the evaluation of this plan - node. + node (and its children). Raises ------ NotImplementedError - If we couldn't evaluate things. Ideally this should not occur, - since the translation phase should pick up things that we - cannot handle. + If evaluation fails. Ideally this should not occur, since the + translation phase should fail earlier. """ - raise NotImplementedError( - f"Evaluation of plan {type(self).__name__}" - ) # pragma: no cover + return self.do_evaluate( + *self._non_child_args, + *(child.evaluate(cache=cache) for child in self.children), + ) + + +class ErrorNode(IR): + """Represents an error translating the IR.""" + + __slots__ = ("error",) + _non_child = ( + "schema", + "error", + ) + error: str + """The error.""" + + def __init__(self, schema: Schema, error: str): + self.schema = schema + self.error = error + self.children = () -@dataclasses.dataclass class PythonScan(IR): """Representation of input from a python function.""" + __slots__ = ("options", "predicate") + _non_child = ("schema", "options", "predicate") options: Any """Arbitrary options.""" predicate: expr.NamedExpr | None """Filter to apply to the constructed dataframe before returning it.""" - def __post_init__(self): - """Validate preconditions.""" + def __init__(self, schema: Schema, options: Any, predicate: expr.NamedExpr | None): + self.schema = schema + self.options = options + self.predicate = predicate + self._non_child_args = (schema, options, predicate) + self.children = () raise NotImplementedError("PythonScan not implemented") -@dataclasses.dataclass class Scan(IR): """Input from files.""" + __slots__ = ( + "typ", + "reader_options", + "cloud_options", + "config_options", + "paths", + "with_columns", + "skip_rows", + "n_rows", + "row_index", + "predicate", + ) + _non_child = ( + "schema", + "typ", + "reader_options", + "cloud_options", + "config_options", + "paths", + "with_columns", + "skip_rows", + "n_rows", + "row_index", + "predicate", + ) typ: str """What type of file are we reading? 
Parquet, CSV, etc...""" reader_options: dict[str, Any] """Reader-specific options, as dictionary.""" cloud_options: dict[str, Any] | None """Cloud-related authentication options, currently ignored.""" + config_options: dict[str, Any] + """GPU-specific configuration options""" paths: list[str] """List of paths to read from.""" - with_columns: list[str] + with_columns: list[str] | None """Projected columns to return.""" skip_rows: int """Rows to skip at the start when reading.""" @@ -198,9 +298,47 @@ class Scan(IR): predicate: expr.NamedExpr | None """Mask to apply to the read dataframe.""" - def __post_init__(self) -> None: - """Validate preconditions.""" - super().__post_init__() + PARQUET_DEFAULT_CHUNK_SIZE: int = 0 # unlimited + PARQUET_DEFAULT_PASS_LIMIT: int = 16 * 1024**3 # 16GiB + + def __init__( + self, + schema: Schema, + typ: str, + reader_options: dict[str, Any], + cloud_options: dict[str, Any] | None, + config_options: dict[str, Any], + paths: list[str], + with_columns: list[str] | None, + skip_rows: int, + n_rows: int, + row_index: tuple[str, int] | None, + predicate: expr.NamedExpr | None, + ): + self.schema = schema + self.typ = typ + self.reader_options = reader_options + self.cloud_options = cloud_options + self.config_options = config_options + self.paths = paths + self.with_columns = with_columns + self.skip_rows = skip_rows + self.n_rows = n_rows + self.row_index = row_index + self.predicate = predicate + self._non_child_args = ( + schema, + typ, + reader_options, + config_options, + paths, + with_columns, + skip_rows, + n_rows, + row_index, + predicate, + ) + self.children = () if self.typ not in ("csv", "parquet", "ndjson"): # pragma: no cover # This line is unhittable ATM since IPC/Anonymous scan raise # on the polars side @@ -211,7 +349,7 @@ def __post_init__(self) -> None: # TODO: polars has this implemented for parquet, # maybe we can do this too? raise NotImplementedError("slice pushdown for negative slices") - if self.typ == "csv" and self.skip_rows != 0: # pragma: no cover + if self.typ in {"csv"} and self.skip_rows != 0: # pragma: no cover # This comes from slice pushdown, but that # optimization doesn't happen right now raise NotImplementedError("skipping rows in CSV reader") @@ -260,19 +398,52 @@ def __post_init__(self) -> None: "Reading only parquet metadata to produce row index." ) - def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: + def get_hashable(self) -> Hashable: + """ + Hashable representation of the node. + + The options dictionaries are serialised for hashing purposes + as json strings. 
+ """ + schema_hash = tuple(self.schema.items()) + return ( + type(self), + schema_hash, + self.typ, + json.dumps(self.reader_options), + json.dumps(self.cloud_options), + json.dumps(self.config_options), + tuple(self.paths), + tuple(self.with_columns) if self.with_columns is not None else None, + self.skip_rows, + self.n_rows, + self.row_index, + self.predicate, + ) + + @classmethod + def do_evaluate( + cls, + schema: Schema, + typ: str, + reader_options: dict[str, Any], + config_options: dict[str, Any], + paths: list[str], + with_columns: list[str] | None, + skip_rows: int, + n_rows: int, + row_index: tuple[str, int] | None, + predicate: expr.NamedExpr | None, + ): """Evaluate and return a dataframe.""" - with_columns = self.with_columns - row_index = self.row_index - n_rows = self.n_rows - if self.typ == "csv": - parse_options = self.reader_options["parse_options"] + if typ == "csv": + parse_options = reader_options["parse_options"] sep = chr(parse_options["separator"]) quote = chr(parse_options["quote_char"]) eol = chr(parse_options["eol_char"]) - if self.reader_options["schema"] is not None: + if reader_options["schema"] is not None: # Reader schema provides names - column_names = list(self.reader_options["schema"]["fields"].keys()) + column_names = list(reader_options["schema"]["fields"].keys()) else: # file provides column names column_names = None @@ -299,8 +470,8 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: # polars skips blank lines at the beginning of the file pieces = [] read_partial = n_rows != -1 - for p in self.paths: - skiprows = self.reader_options["skip_rows"] + for p in paths: + skiprows = reader_options["skip_rows"] path = Path(p) with path.open() as f: while f.readline() == "\n": @@ -319,7 +490,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: skiprows=skiprows, comment=comment, decimal=decimal, - dtypes=self.schema, + dtypes=schema, nrows=n_rows, ) pieces.append(tbl_w_meta) @@ -338,24 +509,87 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: plc.concatenate.concatenate(list(tables)), colnames[0], ) - elif self.typ == "parquet": - tbl_w_meta = plc.io.parquet.read_parquet( - plc.io.SourceInfo(self.paths), - columns=with_columns, - nrows=n_rows, - skip_rows=self.skip_rows, - ) - df = DataFrame.from_table( - tbl_w_meta.tbl, - # TODO: consider nested column names? - tbl_w_meta.column_names(include_children=False), - ) - elif self.typ == "ndjson": - json_schema: list[tuple[str, str, list]] = [ - (name, typ, []) for name, typ in self.schema.items() + elif typ == "parquet": + parquet_options = config_options.get("parquet_options", {}) + if parquet_options.get("chunked", True): + reader = plc.io.parquet.ChunkedParquetReader( + plc.io.SourceInfo(paths), + columns=with_columns, + # We handle skip_rows != 0 by reading from the + # up to n_rows + skip_rows and slicing off the + # first skip_rows entries. 
+ # TODO: Remove this workaround once + # https://github.com/rapidsai/cudf/issues/16186 + # is fixed + nrows=n_rows + skip_rows, + skip_rows=0, + chunk_read_limit=parquet_options.get( + "chunk_read_limit", cls.PARQUET_DEFAULT_CHUNK_SIZE + ), + pass_read_limit=parquet_options.get( + "pass_read_limit", cls.PARQUET_DEFAULT_PASS_LIMIT + ), + ) + chk = reader.read_chunk() + rows_left_to_skip = skip_rows + + def slice_skip(tbl: plc.Table): + nonlocal rows_left_to_skip + if rows_left_to_skip > 0: + table_rows = tbl.num_rows() + chunk_skip = min(rows_left_to_skip, table_rows) + # TODO: Check performance impact of skipping this + # call and creating an empty table manually when the + # slice would be empty (chunk_skip == table_rows). + (tbl,) = plc.copying.slice(tbl, [chunk_skip, table_rows]) + rows_left_to_skip -= chunk_skip + return tbl + + tbl = slice_skip(chk.tbl) + # TODO: Nested column names + names = chk.column_names(include_children=False) + concatenated_columns = tbl.columns() + while reader.has_next(): + tbl = slice_skip(reader.read_chunk().tbl) + + for i in range(tbl.num_columns()): + concatenated_columns[i] = plc.concatenate.concatenate( + [concatenated_columns[i], tbl._columns[i]] + ) + # Drop residual columns to save memory + tbl._columns[i] = None + + df = DataFrame.from_table( + plc.Table(concatenated_columns), + names=names, + ) + else: + filters = None + if predicate is not None and row_index is None: + # Can't apply filters during read if we have a row index. + filters = to_parquet_filter(predicate.value) + tbl_w_meta = plc.io.parquet.read_parquet( + plc.io.SourceInfo(paths), + columns=with_columns, + filters=filters, + nrows=n_rows, + skip_rows=skip_rows, + ) + df = DataFrame.from_table( + tbl_w_meta.tbl, + # TODO: consider nested column names? + tbl_w_meta.column_names(include_children=False), + ) + if filters is not None: + # Mask must have been applied. + return df + + elif typ == "ndjson": + json_schema: list[plc.io.json.NameAndType] = [ + (name, typ, []) for name, typ in schema.items() ] plc_tbl_w_meta = plc.io.json.read_json( - plc.io.SourceInfo(self.paths), + plc.io.SourceInfo(paths), lines=True, dtypes=json_schema, prune_columns=True, @@ -365,43 +599,45 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: df = DataFrame.from_table( plc_tbl_w_meta.tbl, plc_tbl_w_meta.column_names(include_children=False) ) - col_order = list(self.schema.keys()) - # TODO: remove condition when dropping support for polars 1.0 - # https://github.com/pola-rs/polars/pull/17363 - if row_index is not None and row_index[0] in self.schema: + col_order = list(schema.keys()) + if row_index is not None: col_order.remove(row_index[0]) - if col_order is not None: - df = df.select(col_order) + df = df.select(col_order) else: raise NotImplementedError( - f"Unhandled scan type: {self.typ}" + f"Unhandled scan type: {typ}" ) # pragma: no cover; post init trips first if row_index is not None: name, offset = row_index - dtype = self.schema[name] + if POLARS_VERSION_GT_112: + # If we sliced away some data from the start, that + # shifts the row index. 
+ # But prior to 1.13, polars had this wrong, so we match behaviour + # https://github.com/pola-rs/polars/issues/19607 + offset += skip_rows + dtype = schema[name] step = plc.interop.from_arrow( pa.scalar(1, type=plc.interop.to_arrow(dtype)) ) init = plc.interop.from_arrow( pa.scalar(offset, type=plc.interop.to_arrow(dtype)) ) - index = NamedColumn( + index = Column( plc.filling.sequence(df.num_rows, init, step), - name, is_sorted=plc.types.Sorted.YES, order=plc.types.Order.ASCENDING, null_order=plc.types.NullOrder.AFTER, + name=name, ) df = DataFrame([index, *df.columns]) - assert all(c.obj.type() == self.schema[c.name] for c in df.columns) - if self.predicate is None: + assert all(c.obj.type() == schema[name] for name, c in df.column_map.items()) + if predicate is None: return df else: - (mask,) = broadcast(self.predicate.evaluate(df), target_length=df.num_rows) + (mask,) = broadcast(predicate.evaluate(df), target_length=df.num_rows) return df.filter(mask) -@dataclasses.dataclass class Cache(IR): """ Return a cached plan node. @@ -409,20 +645,37 @@ class Cache(IR): Used for CSE at the plan level. """ + __slots__ = ("key",) + _non_child = ("schema", "key") key: int """The cache key.""" - value: IR - """The unevaluated node to cache.""" + + def __init__(self, schema: Schema, key: int, value: IR): + self.schema = schema + self.key = key + self.children = (value,) + self._non_child_args = (key,) + + @classmethod + def do_evaluate( + cls, key: int, df: DataFrame + ) -> DataFrame: # pragma: no cover; basic evaluation never calls this + """Evaluate and return a dataframe.""" + # Our value has already been computed for us, so let's just + # return it. + return df def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" + # We must override the recursion scheme because we don't want + # to recurse if we're in the cache. try: return cache[self.key] except KeyError: - return cache.setdefault(self.key, self.value.evaluate(cache=cache)) + (value,) = self.children + return cache.setdefault(self.key, value.evaluate(cache=cache)) -@dataclasses.dataclass class DataFrameScan(IR): """ Input from an existing polars DataFrame. @@ -430,52 +683,101 @@ class DataFrameScan(IR): This typically arises from ``q.collect().lazy()`` """ + __slots__ = ("df", "projection", "predicate") + _non_child = ("schema", "df", "projection", "predicate") df: Any """Polars LazyFrame object.""" - projection: list[str] + projection: tuple[str, ...] | None """List of columns to project out.""" predicate: expr.NamedExpr | None """Mask to apply.""" - def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: + def __init__( + self, + schema: Schema, + df: Any, + projection: Sequence[str] | None, + predicate: expr.NamedExpr | None, + ): + self.schema = schema + self.df = df + self.projection = tuple(projection) if projection is not None else None + self.predicate = predicate + self._non_child_args = (schema, df, self.projection, predicate) + self.children = () + + def get_hashable(self) -> Hashable: + """ + Hashable representation of the node. + + The (heavy) dataframe object is hashed as its id, so this is + not stable across runs, or repeat instances of the same equal dataframes. + """ + schema_hash = tuple(self.schema.items()) + return (type(self), schema_hash, id(self.df), self.projection, self.predicate) + + @classmethod + def do_evaluate( + cls, + schema: Schema, + df: Any, + projection: tuple[str, ...] 
| None, + predicate: expr.NamedExpr | None, + ) -> DataFrame: """Evaluate and return a dataframe.""" - pdf = pl.DataFrame._from_pydf(self.df) - if self.projection is not None: - pdf = pdf.select(self.projection) + pdf = pl.DataFrame._from_pydf(df) + if projection is not None: + pdf = pdf.select(projection) df = DataFrame.from_polars(pdf) assert all( c.obj.type() == dtype - for c, dtype in zip(df.columns, self.schema.values(), strict=True) + for c, dtype in zip(df.columns, schema.values(), strict=True) ) - if self.predicate is not None: - (mask,) = broadcast(self.predicate.evaluate(df), target_length=df.num_rows) + if predicate is not None: + (mask,) = broadcast(predicate.evaluate(df), target_length=df.num_rows) return df.filter(mask) else: return df -@dataclasses.dataclass class Select(IR): """Produce a new dataframe selecting given expressions from an input.""" - df: IR - """Input dataframe.""" - expr: list[expr.NamedExpr] + __slots__ = ("exprs", "should_broadcast") + _non_child = ("schema", "exprs", "should_broadcast") + exprs: tuple[expr.NamedExpr, ...] """List of expressions to evaluate to form the new dataframe.""" should_broadcast: bool """Should columns be broadcast?""" - def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: + def __init__( + self, + schema: Schema, + exprs: Sequence[expr.NamedExpr], + should_broadcast: bool, # noqa: FBT001 + df: IR, + ): + self.schema = schema + self.exprs = tuple(exprs) + self.should_broadcast = should_broadcast + self.children = (df,) + self._non_child_args = (self.exprs, should_broadcast) + + @classmethod + def do_evaluate( + cls, + exprs: tuple[expr.NamedExpr, ...], + should_broadcast: bool, # noqa: FBT001 + df: DataFrame, + ) -> DataFrame: """Evaluate and return a dataframe.""" - df = self.df.evaluate(cache=cache) # Handle any broadcasting - columns = [e.evaluate(df) for e in self.expr] - if self.should_broadcast: + columns = [e.evaluate(df) for e in exprs] + if should_broadcast: columns = broadcast(*columns) return DataFrame(columns) -@dataclasses.dataclass class Reduce(IR): """ Produce a new dataframe selecting given expressions from an input. @@ -483,36 +785,82 @@ class Reduce(IR): This is a special case of :class:`Select` where all outputs are a single row. """ - df: IR - """Input dataframe.""" - expr: list[expr.NamedExpr] + __slots__ = ("exprs",) + _non_child = ("schema", "exprs") + exprs: tuple[expr.NamedExpr, ...] 
"""List of expressions to evaluate to form the new dataframe.""" - def evaluate( - self, *, cache: MutableMapping[int, DataFrame] - ) -> DataFrame: # pragma: no cover; polars doesn't emit this node yet + def __init__( + self, schema: Schema, exprs: Sequence[expr.NamedExpr], df: IR + ): # pragma: no cover; polars doesn't emit this node yet + self.schema = schema + self.exprs = tuple(exprs) + self.children = (df,) + self._non_child_args = (self.exprs,) + + @classmethod + def do_evaluate( + cls, + exprs: tuple[expr.NamedExpr, ...], + df: DataFrame, + ) -> DataFrame: # pragma: no cover; not exposed by polars yet """Evaluate and return a dataframe.""" - df = self.df.evaluate(cache=cache) - columns = broadcast(*(e.evaluate(df) for e in self.expr)) + columns = broadcast(*(e.evaluate(df) for e in exprs)) assert all(column.obj.size() == 1 for column in columns) return DataFrame(columns) -@dataclasses.dataclass class GroupBy(IR): """Perform a groupby.""" - df: IR - """Input dataframe.""" - agg_requests: list[expr.NamedExpr] - """List of expressions to evaluate groupwise.""" - keys: list[expr.NamedExpr] - """List of expressions forming the keys.""" + __slots__ = ( + "agg_requests", + "keys", + "maintain_order", + "options", + "agg_infos", + ) + _non_child = ("schema", "keys", "agg_requests", "maintain_order", "options") + keys: tuple[expr.NamedExpr, ...] + """Grouping keys.""" + agg_requests: tuple[expr.NamedExpr, ...] + """Aggregation expressions.""" maintain_order: bool - """Should the order of the input dataframe be maintained?""" + """Preserve order in groupby.""" options: Any - """Options controlling style of groupby.""" - agg_infos: list[expr.AggInfo] = dataclasses.field(init=False) + """Arbitrary options.""" + + def __init__( + self, + schema: Schema, + keys: Sequence[expr.NamedExpr], + agg_requests: Sequence[expr.NamedExpr], + maintain_order: bool, # noqa: FBT001 + options: Any, + df: IR, + ): + self.schema = schema + self.keys = tuple(keys) + self.agg_requests = tuple(agg_requests) + self.maintain_order = maintain_order + self.options = options + self.children = (df,) + if self.options.rolling: + raise NotImplementedError( + "rolling window/groupby" + ) # pragma: no cover; rollingwindow constructor has already raised + if self.options.dynamic: + raise NotImplementedError("dynamic group by") + if any(GroupBy.check_agg(a.value) > 1 for a in self.agg_requests): + raise NotImplementedError("Nested aggregations in groupby") + self.agg_infos = [req.collect_agg(depth=0) for req in self.agg_requests] + self._non_child_args = ( + self.keys, + self.agg_requests, + maintain_order, + options, + self.agg_infos, + ) @staticmethod def check_agg(agg: expr.Expr) -> int: @@ -542,25 +890,18 @@ def check_agg(agg: expr.Expr) -> int: else: raise NotImplementedError(f"No handler for {agg=}") - def __post_init__(self) -> None: - """Check whether all the aggregations are implemented.""" - super().__post_init__() - if self.options.rolling: - raise NotImplementedError( - "rolling window/groupby" - ) # pragma: no cover; rollingwindow constructor has already raised - if any(GroupBy.check_agg(a.value) > 1 for a in self.agg_requests): - raise NotImplementedError("Nested aggregations in groupby") - self.agg_infos = [req.collect_agg(depth=0) for req in self.agg_requests] - if len(self.keys) == 0: - raise NotImplementedError("dynamic groupby") - - def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: + @classmethod + def do_evaluate( + cls, + keys_in: Sequence[expr.NamedExpr], + agg_requests: 
Sequence[expr.NamedExpr], + maintain_order: bool, # noqa: FBT001 + options: Any, + agg_infos: Sequence[expr.AggInfo], + df: DataFrame, + ): """Evaluate and return a dataframe.""" - df = self.df.evaluate(cache=cache) - keys = broadcast( - *(k.evaluate(df) for k in self.keys), target_length=df.num_rows - ) + keys = broadcast(*(k.evaluate(df) for k in keys_in), target_length=df.num_rows) sorted = ( plc.types.Sorted.YES if all(k.is_sorted for k in keys) @@ -576,7 +917,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: # TODO: uniquify requests = [] replacements: list[expr.Expr] = [] - for info in self.agg_infos: + for info in agg_infos: for pre_eval, req, rep in info.requests: if pre_eval is None: # A count aggregation, doesn't touch the column, @@ -588,67 +929,136 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: requests.append(plc.groupby.GroupByRequest(col, [req])) replacements.append(rep) group_keys, raw_tables = grouper.aggregate(requests) - # TODO: names - raw_columns: list[NamedColumn] = [] + raw_columns: list[Column] = [] for i, table in enumerate(raw_tables): (column,) = table.columns() - raw_columns.append(NamedColumn(column, f"tmp{i}")) + raw_columns.append(Column(column, name=f"tmp{i}")) mapping = dict(zip(replacements, raw_columns, strict=True)) result_keys = [ - NamedColumn(gk, k.name) - for gk, k in zip(group_keys.columns(), keys, strict=True) + Column(grouped_key, name=key.name) + for key, grouped_key in zip(keys, group_keys.columns(), strict=True) ] result_subs = DataFrame(raw_columns) - results = [ - req.evaluate(result_subs, mapping=mapping) for req in self.agg_requests - ] + results = [req.evaluate(result_subs, mapping=mapping) for req in agg_requests] broadcasted = broadcast(*result_keys, *results) - result_keys = broadcasted[: len(result_keys)] - results = broadcasted[len(result_keys) :] # Handle order preservation of groups - # like cudf classic does - # https://github.com/rapidsai/cudf/blob/5780c4d8fb5afac2e04988a2ff5531f94c22d3a3/python/cudf/cudf/core/groupby/groupby.py#L723-L743 - if self.maintain_order and not sorted: - left = plc.stream_compaction.stable_distinct( + if maintain_order and not sorted: + # The order we want + want = plc.stream_compaction.stable_distinct( plc.Table([k.obj for k in keys]), list(range(group_keys.num_columns())), plc.stream_compaction.DuplicateKeepOption.KEEP_FIRST, plc.types.NullEquality.EQUAL, plc.types.NanEquality.ALL_EQUAL, ) - right = plc.Table([key.obj for key in result_keys]) - _, indices = plc.join.left_join(left, right, plc.types.NullEquality.EQUAL) + # The order we have + have = plc.Table([key.obj for key in broadcasted[: len(keys)]]) + + # We know an inner join is OK because by construction + # want and have are permutations of each other. + left_order, right_order = plc.join.inner_join( + want, have, plc.types.NullEquality.EQUAL + ) + # Now left_order is an arbitrary permutation of the ordering we + # want, and right_order is a matching permutation of the ordering + # we have. To get to the original ordering, we need + # left_order == iota(nrows), with right_order permuted + # appropriately. This can be obtained by sorting + # right_order by left_order. 
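+            # Tiny illustration: if want is [b, a] and have is [a, b], the
+            # join may return left_order=[1, 0], right_order=[0, 1]; sorting
+            # right_order by left_order gives [1, 0], and gathering with
+            # that map restores the desired [b, a] ordering.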
+ (right_order,) = plc.sorting.sort_by_key( + plc.Table([right_order]), + plc.Table([left_order]), + [plc.types.Order.ASCENDING], + [plc.types.NullOrder.AFTER], + ).columns() ordered_table = plc.copying.gather( plc.Table([col.obj for col in broadcasted]), - indices, + right_order, plc.copying.OutOfBoundsPolicy.DONT_CHECK, ) broadcasted = [ - NamedColumn(reordered, b.name) - for reordered, b in zip( + Column(reordered, name=old.name) + for reordered, old in zip( ordered_table.columns(), broadcasted, strict=True ) ] - return DataFrame(broadcasted).slice(self.options.slice) + return DataFrame(broadcasted).slice(options.slice) + + +class ConditionalJoin(IR): + """A conditional inner join of two dataframes on a predicate.""" + + __slots__ = ("predicate", "options", "ast_predicate") + _non_child = ("schema", "predicate", "options") + predicate: expr.Expr + options: tuple + + def __init__( + self, schema: Schema, predicate: expr.Expr, options: tuple, left: IR, right: IR + ) -> None: + self.schema = schema + self.predicate = predicate + self.options = options + self.children = (left, right) + self.ast_predicate = to_ast(predicate) + _, join_nulls, zlice, suffix, coalesce = self.options + # Preconditions from polars + assert not join_nulls + assert not coalesce + if self.ast_predicate is None: + raise NotImplementedError( + f"Conditional join with predicate {predicate}" + ) # pragma: no cover; polars never delivers expressions we can't handle + self._non_child_args = (self.ast_predicate, zlice, suffix) + + @classmethod + def do_evaluate( + cls, + predicate: plc.expressions.Expression, + zlice: tuple[int, int] | None, + suffix: str, + left: DataFrame, + right: DataFrame, + ) -> DataFrame: + """Evaluate and return a dataframe.""" + lg, rg = plc.join.conditional_inner_join(left.table, right.table, predicate) + left = DataFrame.from_table( + plc.copying.gather( + left.table, lg, plc.copying.OutOfBoundsPolicy.DONT_CHECK + ), + left.column_names, + ) + right = DataFrame.from_table( + plc.copying.gather( + right.table, rg, plc.copying.OutOfBoundsPolicy.DONT_CHECK + ), + right.column_names, + ) + right = right.rename_columns( + { + name: f"{name}{suffix}" + for name in right.column_names + if name in left.column_names_set + } + ) + result = left.with_columns(right.columns) + return result.slice(zlice) -@dataclasses.dataclass class Join(IR): """A join of two dataframes.""" - left: IR - """Left frame.""" - right: IR - """Right frame.""" - left_on: list[expr.NamedExpr] + __slots__ = ("left_on", "right_on", "options") + _non_child = ("schema", "left_on", "right_on", "options") + left_on: tuple[expr.NamedExpr, ...] """List of expressions used as keys in the left frame.""" - right_on: list[expr.NamedExpr] + right_on: tuple[expr.NamedExpr, ...] 
"""List of expressions used as keys in the right frame.""" options: tuple[ - Literal["inner", "left", "right", "full", "leftsemi", "leftanti", "cross"], + Literal["inner", "left", "right", "full", "semi", "anti", "cross"], bool, tuple[int, int] | None, - str | None, + str, bool, ] """ @@ -660,9 +1070,21 @@ class Join(IR): - coalesce: should key columns be coalesced (only makes sense for outer joins) """ - def __post_init__(self) -> None: - """Validate preconditions.""" - super().__post_init__() + def __init__( + self, + schema: Schema, + left_on: Sequence[expr.NamedExpr], + right_on: Sequence[expr.NamedExpr], + options: Any, + left: IR, + right: IR, + ): + self.schema = schema + self.left_on = tuple(left_on) + self.right_on = tuple(right_on) + self.options = options + self.children = (left, right) + self._non_child_args = (self.left_on, self.right_on, self.options) if any( isinstance(e.value, expr.Literal) for e in itertools.chain(self.left_on, self.right_on) @@ -672,7 +1094,7 @@ def __post_init__(self) -> None: @staticmethod @cache def _joiners( - how: Literal["inner", "left", "right", "full", "leftsemi", "leftanti"], + how: Literal["inner", "left", "right", "full", "semi", "anti"], ) -> tuple[ Callable, plc.copying.OutOfBoundsPolicy, plc.copying.OutOfBoundsPolicy | None ]: @@ -694,13 +1116,13 @@ def _joiners( plc.copying.OutOfBoundsPolicy.NULLIFY, plc.copying.OutOfBoundsPolicy.NULLIFY, ) - elif how == "leftsemi": + elif how == "semi": return ( plc.join.left_semi_join, plc.copying.OutOfBoundsPolicy.DONT_CHECK, None, ) - elif how == "leftanti": + elif how == "anti": return ( plc.join.left_anti_join, plc.copying.OutOfBoundsPolicy.DONT_CHECK, @@ -708,8 +1130,8 @@ def _joiners( ) assert_never(how) + @staticmethod def _reorder_maps( - self, left_rows: int, lg: plc.Column, left_policy: plc.copying.OutOfBoundsPolicy, @@ -761,43 +1183,54 @@ def _reorder_maps( [plc.types.NullOrder.AFTER, plc.types.NullOrder.AFTER], ).columns() - def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: + @classmethod + def do_evaluate( + cls, + left_on_exprs: Sequence[expr.NamedExpr], + right_on_exprs: Sequence[expr.NamedExpr], + options: tuple[ + Literal["inner", "left", "right", "full", "semi", "anti", "cross"], + bool, + tuple[int, int] | None, + str, + bool, + ], + left: DataFrame, + right: DataFrame, + ) -> DataFrame: """Evaluate and return a dataframe.""" - left = self.left.evaluate(cache=cache) - right = self.right.evaluate(cache=cache) - how, join_nulls, zlice, suffix, coalesce = self.options - suffix = "_right" if suffix is None else suffix + how, join_nulls, zlice, suffix, coalesce = options if how == "cross": # Separate implementation, since cross_join returns the # result, not the gather maps columns = plc.join.cross_join(left.table, right.table).columns() left_cols = [ - NamedColumn(new, old.name).sorted_like(old) + Column(new, name=old.name).sorted_like(old) for new, old in zip( columns[: left.num_columns], left.columns, strict=True ) ] right_cols = [ - NamedColumn( + Column( new, - old.name - if old.name not in left.column_names_set - else f"{old.name}{suffix}", + name=name + if name not in left.column_names_set + else f"{name}{suffix}", ) - for new, old in zip( - columns[left.num_columns :], right.columns, strict=True + for new, name in zip( + columns[left.num_columns :], right.column_names, strict=True ) ] - return DataFrame([*left_cols, *right_cols]) + return DataFrame([*left_cols, *right_cols]).slice(zlice) # TODO: Waiting on clarity based on 
https://github.com/pola-rs/polars/issues/17184 - left_on = DataFrame(broadcast(*(e.evaluate(left) for e in self.left_on))) - right_on = DataFrame(broadcast(*(e.evaluate(right) for e in self.right_on))) + left_on = DataFrame(broadcast(*(e.evaluate(left) for e in left_on_exprs))) + right_on = DataFrame(broadcast(*(e.evaluate(right) for e in right_on_exprs))) null_equality = ( plc.types.NullEquality.EQUAL if join_nulls else plc.types.NullEquality.UNEQUAL ) - join_fn, left_policy, right_policy = Join._joiners(how) + join_fn, left_policy, right_policy = cls._joiners(how) if right_policy is None: # Semi join lg = join_fn(left_on.table, right_on.table, null_equality) @@ -811,7 +1244,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: lg, rg = join_fn(left_on.table, right_on.table, null_equality) if how == "left" or how == "right": # Order of left table is preserved - lg, rg = self._reorder_maps( + lg, rg = cls._reorder_maps( left.num_rows, lg, left_policy, right.num_rows, rg, right_policy ) if coalesce and how == "inner": @@ -823,18 +1256,19 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: plc.copying.gather(right.table, rg, right_policy), right.column_names ) if coalesce and how != "inner": - left = left.replace_columns( - *( - NamedColumn( + left = left.with_columns( + ( + Column( plc.replace.replace_nulls(left_col.obj, right_col.obj), - left_col.name, + name=left_col.name, ) for left_col, right_col in zip( left.select_columns(left_on.column_names_set), right.select_columns(right_on.column_names_set), strict=True, ) - ) + ), + replace_only=True, ) right = right.discard_columns(right_on.column_names_set) if how == "right": @@ -851,22 +1285,37 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: return result.slice(zlice) -@dataclasses.dataclass class HStack(IR): """Add new columns to a dataframe.""" - df: IR - """Input dataframe.""" - columns: list[expr.NamedExpr] - """List of expressions to produce new columns.""" + __slots__ = ("columns", "should_broadcast") + _non_child = ("schema", "columns", "should_broadcast") should_broadcast: bool - """Should columns be broadcast?""" + """Should the resulting evaluated columns be broadcast to the same length.""" - def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: + def __init__( + self, + schema: Schema, + columns: Sequence[expr.NamedExpr], + should_broadcast: bool, # noqa: FBT001 + df: IR, + ): + self.schema = schema + self.columns = tuple(columns) + self.should_broadcast = should_broadcast + self._non_child_args = (self.columns, self.should_broadcast) + self.children = (df,) + + @classmethod + def do_evaluate( + cls, + exprs: Sequence[expr.NamedExpr], + should_broadcast: bool, # noqa: FBT001 + df: DataFrame, + ) -> DataFrame: """Evaluate and return a dataframe.""" - df = self.df.evaluate(cache=cache) - columns = [c.evaluate(df) for c in self.columns] - if self.should_broadcast: + columns = [c.evaluate(df) for c in exprs] + if should_broadcast: columns = broadcast(*columns, target_length=df.num_rows) else: # Polars ensures this is true, but let's make sure nothing @@ -876,24 +1325,41 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: # table that might have mismatching column lengths will # never be turned into a pylibcudf Table with all columns # by the Select, which is why this is safe. 
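+            # (These names are the common-subexpression-elimination
+            # temporaries that polars injects, hence the reserved
+            # "__POLARS_CSER_0x" prefix checked below.)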
- assert all(e.name.startswith("__POLARS_CSER_0x") for e in self.columns) + assert all(e.name.startswith("__POLARS_CSER_0x") for e in exprs) return df.with_columns(columns) -@dataclasses.dataclass class Distinct(IR): """Produce a new dataframe with distinct rows.""" - df: IR - """Input dataframe.""" + __slots__ = ("keep", "subset", "zlice", "stable") + _non_child = ("schema", "keep", "subset", "zlice", "stable") keep: plc.stream_compaction.DuplicateKeepOption - """Which rows to keep.""" - subset: set[str] | None - """Which columns to inspect when computing distinct rows.""" + """Which distinct value to keep.""" + subset: frozenset[str] | None + """Which columns should be used to define distinctness. If None, + then all columns are used.""" zlice: tuple[int, int] | None - """Optional slice to perform after compaction.""" + """Optional slice to apply to the result.""" stable: bool - """Should order be preserved?""" + """Should the result maintain ordering.""" + + def __init__( + self, + schema: Schema, + keep: plc.stream_compaction.DuplicateKeepOption, + subset: frozenset[str] | None, + zlice: tuple[int, int] | None, + stable: bool, # noqa: FBT001 + df: IR, + ): + self.schema = schema + self.keep = keep + self.subset = subset + self.zlice = zlice + self.stable = stable + self._non_child_args = (keep, subset, zlice, stable) + self.children = (df,) _KEEP_MAP: ClassVar[dict[str, plc.stream_compaction.DuplicateKeepOption]] = { "first": plc.stream_compaction.DuplicateKeepOption.KEEP_FIRST, @@ -902,184 +1368,209 @@ class Distinct(IR): "any": plc.stream_compaction.DuplicateKeepOption.KEEP_ANY, } - def __init__(self, schema: Schema, df: IR, options: Any) -> None: - self.schema = schema - self.df = df - (keep, subset, maintain_order, zlice) = options - self.keep = Distinct._KEEP_MAP[keep] - self.subset = set(subset) if subset is not None else None - self.stable = maintain_order - self.zlice = zlice - - def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: + @classmethod + def do_evaluate( + cls, + keep: plc.stream_compaction.DuplicateKeepOption, + subset: frozenset[str] | None, + zlice: tuple[int, int] | None, + stable: bool, # noqa: FBT001 + df: DataFrame, + ): """Evaluate and return a dataframe.""" - df = self.df.evaluate(cache=cache) - if self.subset is None: + if subset is None: indices = list(range(df.num_columns)) + keys_sorted = all(c.is_sorted for c in df.column_map.values()) else: - indices = [i for i, k in enumerate(df.column_names) if k in self.subset] - keys_sorted = all(df.columns[i].is_sorted for i in indices) + indices = [i for i, k in enumerate(df.column_names) if k in subset] + keys_sorted = all(df.column_map[name].is_sorted for name in subset) if keys_sorted: table = plc.stream_compaction.unique( df.table, indices, - self.keep, + keep, plc.types.NullEquality.EQUAL, ) else: distinct = ( plc.stream_compaction.stable_distinct - if self.stable + if stable else plc.stream_compaction.distinct ) table = distinct( df.table, indices, - self.keep, + keep, plc.types.NullEquality.EQUAL, plc.types.NanEquality.ALL_EQUAL, ) + # TODO: Is this sortedness setting correct result = DataFrame( [ - NamedColumn(c, old.name).sorted_like(old) - for c, old in zip(table.columns(), df.columns, strict=True) + Column(new, name=old.name).sorted_like(old) + for new, old in zip(table.columns(), df.columns, strict=True) ] ) - if keys_sorted or self.stable: + if keys_sorted or stable: result = result.sorted_like(df) - return result.slice(self.zlice) + return result.slice(zlice) 
-@dataclasses.dataclass class Sort(IR): """Sort a dataframe.""" - df: IR - """Input.""" - by: list[expr.NamedExpr] - """List of expressions to produce sort keys.""" - do_sort: Callable[..., plc.Table] - """pylibcudf sorting function.""" + __slots__ = ("by", "order", "null_order", "stable", "zlice") + _non_child = ("schema", "by", "order", "null_order", "stable", "zlice") + by: tuple[expr.NamedExpr, ...] + """Sort keys.""" + order: tuple[plc.types.Order, ...] + """Sort order for each sort key.""" + null_order: tuple[plc.types.NullOrder, ...] + """Null sorting location for each sort key.""" + stable: bool + """Should the sort be stable?""" zlice: tuple[int, int] | None - """Optional slice to apply after sorting.""" - order: list[plc.types.Order] - """Order keys should be sorted in.""" - null_order: list[plc.types.NullOrder] - """Where nulls sort to.""" + """Optional slice to apply to the result.""" def __init__( self, schema: Schema, - df: IR, - by: list[expr.NamedExpr], - options: Any, + by: Sequence[expr.NamedExpr], + order: Sequence[plc.types.Order], + null_order: Sequence[plc.types.NullOrder], + stable: bool, # noqa: FBT001 zlice: tuple[int, int] | None, - ) -> None: + df: IR, + ): self.schema = schema - self.df = df - self.by = by + self.by = tuple(by) + self.order = tuple(order) + self.null_order = tuple(null_order) + self.stable = stable self.zlice = zlice - stable, nulls_last, descending = options - self.order, self.null_order = sorting.sort_order( - descending, nulls_last=nulls_last, num_keys=len(by) - ) - self.do_sort = ( - plc.sorting.stable_sort_by_key if stable else plc.sorting.sort_by_key + self._non_child_args = ( + self.by, + self.order, + self.null_order, + self.stable, + self.zlice, ) - - def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: + self.children = (df,) + + @classmethod + def do_evaluate( + cls, + by: Sequence[expr.NamedExpr], + order: Sequence[plc.types.Order], + null_order: Sequence[plc.types.NullOrder], + stable: bool, # noqa: FBT001 + zlice: tuple[int, int] | None, + df: DataFrame, + ) -> DataFrame: """Evaluate and return a dataframe.""" - df = self.df.evaluate(cache=cache) - sort_keys = broadcast( - *(k.evaluate(df) for k in self.by), target_length=df.num_rows - ) - names = {c.name: i for i, c in enumerate(df.columns)} + sort_keys = broadcast(*(k.evaluate(df) for k in by), target_length=df.num_rows) # TODO: More robust identification here. 
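+        # A key only counts as "in the result" if it is the *same*
+        # column object as the one in df (checked with `is` below);
+        # a name match alone is not enough, since an expression can
+        # produce a new column that reuses an existing name.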
- keys_in_result = [ - i - for k in sort_keys - if (i := names.get(k.name)) is not None and k.obj is df.columns[i].obj - ] - table = self.do_sort( + keys_in_result = { + k.name: i + for i, k in enumerate(sort_keys) + if k.name in df.column_map and k.obj is df.column_map[k.name].obj + } + do_sort = plc.sorting.stable_sort_by_key if stable else plc.sorting.sort_by_key + table = do_sort( df.table, plc.Table([k.obj for k in sort_keys]), - self.order, - self.null_order, + list(order), + list(null_order), ) - columns = [ - NamedColumn(c, old.name) - for c, old in zip(table.columns(), df.columns, strict=True) - ] - # If a sort key is in the result table, set the sortedness property - for k, i in enumerate(keys_in_result): - columns[i] = columns[i].set_sorted( - is_sorted=plc.types.Sorted.YES, - order=self.order[k], - null_order=self.null_order[k], - ) - return DataFrame(columns).slice(self.zlice) + columns: list[Column] = [] + for name, c in zip(df.column_map, table.columns(), strict=True): + column = Column(c, name=name) + # If a sort key is in the result table, set the sortedness property + if name in keys_in_result: + i = keys_in_result[name] + column = column.set_sorted( + is_sorted=plc.types.Sorted.YES, + order=order[i], + null_order=null_order[i], + ) + columns.append(column) + return DataFrame(columns).slice(zlice) -@dataclasses.dataclass class Slice(IR): """Slice a dataframe.""" - df: IR - """Input.""" + __slots__ = ("offset", "length") + _non_child = ("schema", "offset", "length") offset: int """Start of the slice.""" length: int """Length of the slice.""" - def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: + def __init__(self, schema: Schema, offset: int, length: int, df: IR): + self.schema = schema + self.offset = offset + self.length = length + self._non_child_args = (offset, length) + self.children = (df,) + + @classmethod + def do_evaluate(cls, offset: int, length: int, df: DataFrame) -> DataFrame: """Evaluate and return a dataframe.""" - df = self.df.evaluate(cache=cache) - return df.slice((self.offset, self.length)) + return df.slice((offset, length)) -@dataclasses.dataclass class Filter(IR): """Filter a dataframe with a boolean mask.""" - df: IR - """Input.""" + __slots__ = ("mask",) + _non_child = ("schema", "mask") mask: expr.NamedExpr - """Expression evaluating to a mask.""" + """Expression to produce the filter mask.""" - def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: + def __init__(self, schema: Schema, mask: expr.NamedExpr, df: IR): + self.schema = schema + self.mask = mask + self._non_child_args = (mask,) + self.children = (df,) + + @classmethod + def do_evaluate(cls, mask_expr: expr.NamedExpr, df: DataFrame) -> DataFrame: """Evaluate and return a dataframe.""" - df = self.df.evaluate(cache=cache) - (mask,) = broadcast(self.mask.evaluate(df), target_length=df.num_rows) + (mask,) = broadcast(mask_expr.evaluate(df), target_length=df.num_rows) return df.filter(mask) -@dataclasses.dataclass class Projection(IR): """Select a subset of columns from a dataframe.""" - df: IR - """Input.""" + __slots__ = () + _non_child = ("schema",) - def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: + def __init__(self, schema: Schema, df: IR): + self.schema = schema + self._non_child_args = (schema,) + self.children = (df,) + + @classmethod + def do_evaluate(cls, schema: Schema, df: DataFrame) -> DataFrame: """Evaluate and return a dataframe.""" - df = self.df.evaluate(cache=cache) # This can reorder things. 
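+        # Columns are gathered in schema order below, which need not
+        # match the child's column order.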
columns = broadcast( - *df.select(list(self.schema.keys())).columns, target_length=df.num_rows + *(df.column_map[name] for name in schema), target_length=df.num_rows ) return DataFrame(columns) -@dataclasses.dataclass class MapFunction(IR): """Apply some function to a dataframe.""" - df: IR - """Input.""" + __slots__ = ("name", "options") + _non_child = ("schema", "name", "options") name: str - """Function name.""" + """Name of the function to apply""" options: Any - """Arbitrary options, interpreted per function.""" + """Arbitrary name-specific options""" _NAMES: ClassVar[frozenset[str]] = frozenset( [ @@ -1094,9 +1585,11 @@ class MapFunction(IR): ] ) - def __post_init__(self) -> None: - """Validate preconditions.""" - super().__post_init__() + def __init__(self, schema: Schema, name: str, options: Any, df: IR): + self.schema = schema + self.name = name + self.options = options + self.children = (df,) if self.name not in MapFunction._NAMES: raise NotImplementedError(f"Unhandled map function {self.name}") if self.name == "explode": @@ -1110,7 +1603,7 @@ def __post_init__(self) -> None: old, new, _ = self.options # TODO: perhaps polars should validate renaming in the IR? if len(new) != len(set(new)) or ( - set(new) & (set(self.df.schema.keys() - set(old))) + set(new) & (set(df.schema.keys()) - set(old)) ): raise NotImplementedError("Duplicate new names in rename.") elif self.name == "unpivot": @@ -1119,43 +1612,51 @@ def __post_init__(self) -> None: variable_name = "variable" if variable_name is None else variable_name if len(pivotees) == 0: index = frozenset(indices) - pivotees = [name for name in self.df.schema if name not in index] + pivotees = [name for name in df.schema if name not in index] if not all( - dtypes.can_cast(self.df.schema[p], self.schema[value_name]) - for p in pivotees + dtypes.can_cast(df.schema[p], self.schema[value_name]) for p in pivotees ): raise NotImplementedError( "Unpivot cannot cast all input columns to " f"{self.schema[value_name].id()}" - ) - self.options = (indices, pivotees, variable_name, value_name) + ) # pragma: no cover + self.options = ( + tuple(indices), + tuple(pivotees), + (variable_name, schema[variable_name]), + (value_name, schema[value_name]), + ) + self._non_child_args = (name, self.options) - def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: + @classmethod + def do_evaluate(cls, name: str, options: Any, df: DataFrame) -> DataFrame: """Evaluate and return a dataframe.""" - if self.name == "rechunk": + if name == "rechunk": # No-op in our data model # Don't think this appears in a plan tree from python - return self.df.evaluate(cache=cache) # pragma: no cover - elif self.name == "rename": - df = self.df.evaluate(cache=cache) + return df # pragma: no cover + elif name == "rename": # final tag is "swapping" which is useful for the # optimiser (it blocks some pushdown operations) - old, new, _ = self.options + old, new, _ = options return df.rename_columns(dict(zip(old, new, strict=True))) - elif self.name == "explode": - df = self.df.evaluate(cache=cache) - ((to_explode,),) = self.options + elif name == "explode": + ((to_explode,),) = options index = df.column_names.index(to_explode) subset = df.column_names_set - {to_explode} return DataFrame.from_table( plc.lists.explode_outer(df.table, index), df.column_names ).sorted_like(df, subset=subset) - elif self.name == "unpivot": - indices, pivotees, variable_name, value_name = self.options + elif name == "unpivot": + ( + indices, + pivotees, + (variable_name, variable_dtype), 
+ (value_name, value_dtype), + ) = options npiv = len(pivotees) - df = self.df.evaluate(cache=cache) index_columns = [ - NamedColumn(col, name) + Column(col, name=name) for col, name in zip( plc.reshape.tile(df.select(indices).table, npiv).columns(), indices, @@ -1168,7 +1669,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: plc.interop.from_arrow( pa.array( pivotees, - type=plc.interop.to_arrow(self.schema[variable_name]), + type=plc.interop.to_arrow(variable_dtype), ), ) ] @@ -1176,50 +1677,56 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: df.num_rows, ).columns() value_column = plc.concatenate.concatenate( - [c.astype(self.schema[value_name]) for c in df.select(pivotees).columns] + [df.column_map[pivotee].astype(value_dtype).obj for pivotee in pivotees] ) return DataFrame( [ *index_columns, - NamedColumn(variable_column, variable_name), - NamedColumn(value_column, value_name), + Column(variable_column, name=variable_name), + Column(value_column, name=value_name), ] ) else: raise AssertionError("Should never be reached") # pragma: no cover -@dataclasses.dataclass class Union(IR): """Concatenate dataframes vertically.""" - dfs: list[IR] - """List of inputs.""" + __slots__ = ("zlice",) + _non_child = ("schema", "zlice") zlice: tuple[int, int] | None - """Optional slice to apply after concatenation.""" + """Optional slice to apply to the result.""" - def __post_init__(self) -> None: - """Validate preconditions.""" - super().__post_init__() - schema = self.dfs[0].schema - if not all(s.schema == schema for s in self.dfs[1:]): + def __init__(self, schema: Schema, zlice: tuple[int, int] | None, *children: IR): + self.schema = schema + self.zlice = zlice + self._non_child_args = (zlice,) + self.children = children + schema = self.children[0].schema + if not all(s.schema == schema for s in self.children[1:]): raise NotImplementedError("Schema mismatch") - def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: + @classmethod + def do_evaluate(cls, zlice: tuple[int, int] | None, *dfs: DataFrame) -> DataFrame: """Evaluate and return a dataframe.""" - # TODO: only evaluate what we need if we have a slice - dfs = [df.evaluate(cache=cache) for df in self.dfs] + # TODO: only evaluate what we need if we have a slice? 
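+        # For now, every child is evaluated in full and the optional
+        # slice is applied to the concatenated result.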
return DataFrame.from_table( - plc.concatenate.concatenate([df.table for df in dfs]), dfs[0].column_names - ).slice(self.zlice) + plc.concatenate.concatenate([df.table for df in dfs]), + dfs[0].column_names, + ).slice(zlice) -@dataclasses.dataclass class HConcat(IR): """Concatenate dataframes horizontally.""" - dfs: list[IR] - """List of inputs.""" + __slots__ = () + _non_child = ("schema",) + + def __init__(self, schema: Schema, *children: IR): + self.schema = schema + self._non_child_args = () + self.children = children @staticmethod def _extend_with_nulls(table: plc.Table, *, nrows: int) -> plc.Table: @@ -1249,20 +1756,22 @@ def _extend_with_nulls(table: plc.Table, *, nrows: int) -> plc.Table: ] ) - def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: + @classmethod + def do_evaluate(cls, *dfs: DataFrame) -> DataFrame: """Evaluate and return a dataframe.""" - dfs = [df.evaluate(cache=cache) for df in self.dfs] max_rows = max(df.num_rows for df in dfs) # Horizontal concatenation extends shorter tables with nulls - dfs = [ - df - if df.num_rows == max_rows - else DataFrame.from_table( - self._extend_with_nulls(df.table, nrows=max_rows - df.num_rows), - df.column_names, - ) - for df in dfs - ] return DataFrame( - list(itertools.chain.from_iterable(df.columns for df in dfs)), + itertools.chain.from_iterable( + df.columns + for df in ( + df + if df.num_rows == max_rows + else DataFrame.from_table( + cls._extend_with_nulls(df.table, nrows=max_rows - df.num_rows), + df.column_names, + ) + for df in dfs + ) + ) ) diff --git a/python/cudf_polars/cudf_polars/dsl/nodebase.py b/python/cudf_polars/cudf_polars/dsl/nodebase.py new file mode 100644 index 00000000000..dd5c40a00be --- /dev/null +++ b/python/cudf_polars/cudf_polars/dsl/nodebase.py @@ -0,0 +1,150 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 + +"""Base class for IR nodes, and utilities.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, Any, ClassVar, Generic, TypeVar + +if TYPE_CHECKING: + from collections.abc import Hashable, Sequence + + from typing_extensions import Self + + +__all__: list[str] = ["Node"] + +T = TypeVar("T", bound="Node[Any]") + + +class Node(Generic[T]): + """ + An abstract node type. + + Nodes are immutable! + + This contains a (potentially empty) tuple of child nodes, + along with non-child data. For uniform reconstruction and + implementation of hashing and equality schemes, child classes need + to provide a certain amount of metadata when they are defined. + Specifically, the ``_non_child`` attribute must list, in-order, + the names of the slots that are passed to the constructor. The + constructor must take arguments in the order ``(*_non_child, + *children).`` + """ + + __slots__ = ("_hash_value", "_repr_value", "children") + _hash_value: int + _repr_value: str + children: tuple[T, ...] + _non_child: ClassVar[tuple[str, ...]] = () + + def _ctor_arguments(self, children: Sequence[T]) -> Sequence[Any | T]: + return (*(getattr(self, attr) for attr in self._non_child), *children) + + def reconstruct(self, children: Sequence[T]) -> Self: + """ + Rebuild this node with new children. + + Parameters + ---------- + children + New children + + Returns + ------- + New node with new children. Non-child data is shared with the input. + """ + return type(self)(*self._ctor_arguments(children)) + + def get_hashable(self) -> Hashable: + """ + Return a hashable object for the node. 
+
+        Returns
+        -------
+        Hashable object.
+
+        Notes
+        -----
+        This method is used by the :meth:`__hash__` implementation
+        (which does caching). If your node type needs special-case
+        handling for some of its attributes, override this method, not
+        :meth:`__hash__`.
+        """
+        return (type(self), self._ctor_arguments(self.children))
+
+    def __hash__(self) -> int:
+        """
+        Hash of an expression with caching.
+
+        See Also
+        --------
+        get_hashable
+        """
+        try:
+            return self._hash_value
+        except AttributeError:
+            self._hash_value = hash(self.get_hashable())
+            return self._hash_value
+
+    def is_equal(self, other: Self) -> bool:
+        """
+        Equality of two nodes of equal type.
+
+        Override this in subclasses, rather than :meth:`__eq__`.
+
+        Parameters
+        ----------
+        other
+            object of same type to compare to.
+
+        Notes
+        -----
+        Since nodes are immutable, this does common subexpression
+        elimination when two nodes are determined to be equal.
+
+        :meth:`__eq__` handles the case where the objects being
+        compared are not of the same type, so in this method, we only
+        need to implement equality of equal types.
+
+        Returns
+        -------
+        True if the two nodes are equal, False otherwise.
+        """
+        if self is other:
+            return True
+        result = self._ctor_arguments(self.children) == other._ctor_arguments(
+            other.children
+        )
+        # Eager CSE for nodes that match.
+        if result:
+            self.children = other.children
+        return result
+
+    def __eq__(self, other: Any) -> bool:
+        """
+        Equality of expressions.
+
+        See Also
+        --------
+        is_equal
+        """
+        if type(self) is not type(other) or hash(self) != hash(other):
+            return False
+        else:
+            return self.is_equal(other)
+
+    def __ne__(self, other: Any) -> bool:
+        """Inequality of expressions."""
+        return not self.__eq__(other)
+
+    def __repr__(self) -> str:
+        """String representation of an expression with caching."""
+        try:
+            return self._repr_value
+        except AttributeError:
+            args = ", ".join(f"{arg!r}" for arg in self._ctor_arguments(self.children))
+            self._repr_value = f"{type(self).__name__}({args})"
+            return self._repr_value
diff --git a/python/cudf_polars/cudf_polars/dsl/to_ast.py b/python/cudf_polars/cudf_polars/dsl/to_ast.py
new file mode 100644
index 00000000000..acc4b3669af
--- /dev/null
+++ b/python/cudf_polars/cudf_polars/dsl/to_ast.py
@@ -0,0 +1,320 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Conversion of expression nodes to libcudf AST nodes."""
+
+from __future__ import annotations
+
+from functools import partial, reduce, singledispatch
+from typing import TYPE_CHECKING, TypeAlias
+
+from polars.polars import _expr_nodes as pl_expr
+
+import pylibcudf as plc
+from pylibcudf import expressions as plc_expr
+
+from cudf_polars.dsl import expr
+from cudf_polars.dsl.traversal import CachingVisitor, reuse_if_unchanged
+from cudf_polars.typing import GenericTransformer
+
+if TYPE_CHECKING:
+    from collections.abc import Mapping
+
+    from cudf_polars.typing import ExprTransformer
+
+# Can't merge these op-mapping dictionaries because scoped enum values
+# are exposed by cython with equality/hash based on their underlying
+# representation type. So in a dict they are just treated as integers.
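+# For example, members of *different* enums that happen to share an
+# underlying value would compare equal as dict keys, so a merged
+# mapping could silently drop entries.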
+BINOP_TO_ASTOP = { + plc.binaryop.BinaryOperator.EQUAL: plc_expr.ASTOperator.EQUAL, + plc.binaryop.BinaryOperator.NULL_EQUALS: plc_expr.ASTOperator.NULL_EQUAL, + plc.binaryop.BinaryOperator.NOT_EQUAL: plc_expr.ASTOperator.NOT_EQUAL, + plc.binaryop.BinaryOperator.LESS: plc_expr.ASTOperator.LESS, + plc.binaryop.BinaryOperator.LESS_EQUAL: plc_expr.ASTOperator.LESS_EQUAL, + plc.binaryop.BinaryOperator.GREATER: plc_expr.ASTOperator.GREATER, + plc.binaryop.BinaryOperator.GREATER_EQUAL: plc_expr.ASTOperator.GREATER_EQUAL, + plc.binaryop.BinaryOperator.ADD: plc_expr.ASTOperator.ADD, + plc.binaryop.BinaryOperator.SUB: plc_expr.ASTOperator.SUB, + plc.binaryop.BinaryOperator.MUL: plc_expr.ASTOperator.MUL, + plc.binaryop.BinaryOperator.DIV: plc_expr.ASTOperator.DIV, + plc.binaryop.BinaryOperator.TRUE_DIV: plc_expr.ASTOperator.TRUE_DIV, + plc.binaryop.BinaryOperator.FLOOR_DIV: plc_expr.ASTOperator.FLOOR_DIV, + plc.binaryop.BinaryOperator.PYMOD: plc_expr.ASTOperator.PYMOD, + plc.binaryop.BinaryOperator.BITWISE_AND: plc_expr.ASTOperator.BITWISE_AND, + plc.binaryop.BinaryOperator.BITWISE_OR: plc_expr.ASTOperator.BITWISE_OR, + plc.binaryop.BinaryOperator.BITWISE_XOR: plc_expr.ASTOperator.BITWISE_XOR, + plc.binaryop.BinaryOperator.LOGICAL_AND: plc_expr.ASTOperator.LOGICAL_AND, + plc.binaryop.BinaryOperator.LOGICAL_OR: plc_expr.ASTOperator.LOGICAL_OR, + plc.binaryop.BinaryOperator.NULL_LOGICAL_AND: plc_expr.ASTOperator.NULL_LOGICAL_AND, + plc.binaryop.BinaryOperator.NULL_LOGICAL_OR: plc_expr.ASTOperator.NULL_LOGICAL_OR, +} + +UOP_TO_ASTOP = { + plc.unary.UnaryOperator.SIN: plc_expr.ASTOperator.SIN, + plc.unary.UnaryOperator.COS: plc_expr.ASTOperator.COS, + plc.unary.UnaryOperator.TAN: plc_expr.ASTOperator.TAN, + plc.unary.UnaryOperator.ARCSIN: plc_expr.ASTOperator.ARCSIN, + plc.unary.UnaryOperator.ARCCOS: plc_expr.ASTOperator.ARCCOS, + plc.unary.UnaryOperator.ARCTAN: plc_expr.ASTOperator.ARCTAN, + plc.unary.UnaryOperator.SINH: plc_expr.ASTOperator.SINH, + plc.unary.UnaryOperator.COSH: plc_expr.ASTOperator.COSH, + plc.unary.UnaryOperator.TANH: plc_expr.ASTOperator.TANH, + plc.unary.UnaryOperator.ARCSINH: plc_expr.ASTOperator.ARCSINH, + plc.unary.UnaryOperator.ARCCOSH: plc_expr.ASTOperator.ARCCOSH, + plc.unary.UnaryOperator.ARCTANH: plc_expr.ASTOperator.ARCTANH, + plc.unary.UnaryOperator.EXP: plc_expr.ASTOperator.EXP, + plc.unary.UnaryOperator.LOG: plc_expr.ASTOperator.LOG, + plc.unary.UnaryOperator.SQRT: plc_expr.ASTOperator.SQRT, + plc.unary.UnaryOperator.CBRT: plc_expr.ASTOperator.CBRT, + plc.unary.UnaryOperator.CEIL: plc_expr.ASTOperator.CEIL, + plc.unary.UnaryOperator.FLOOR: plc_expr.ASTOperator.FLOOR, + plc.unary.UnaryOperator.ABS: plc_expr.ASTOperator.ABS, + plc.unary.UnaryOperator.RINT: plc_expr.ASTOperator.RINT, + plc.unary.UnaryOperator.BIT_INVERT: plc_expr.ASTOperator.BIT_INVERT, + plc.unary.UnaryOperator.NOT: plc_expr.ASTOperator.NOT, +} + +SUPPORTED_STATISTICS_BINOPS = { + plc.binaryop.BinaryOperator.EQUAL, + plc.binaryop.BinaryOperator.NOT_EQUAL, + plc.binaryop.BinaryOperator.LESS, + plc.binaryop.BinaryOperator.LESS_EQUAL, + plc.binaryop.BinaryOperator.GREATER, + plc.binaryop.BinaryOperator.GREATER_EQUAL, +} + +REVERSED_COMPARISON = { + plc.binaryop.BinaryOperator.EQUAL: plc.binaryop.BinaryOperator.EQUAL, + plc.binaryop.BinaryOperator.NOT_EQUAL: plc.binaryop.BinaryOperator.NOT_EQUAL, + plc.binaryop.BinaryOperator.LESS: plc.binaryop.BinaryOperator.GREATER, + plc.binaryop.BinaryOperator.LESS_EQUAL: plc.binaryop.BinaryOperator.GREATER_EQUAL, + plc.binaryop.BinaryOperator.GREATER: 
plc.binaryop.BinaryOperator.LESS, + plc.binaryop.BinaryOperator.GREATER_EQUAL: plc.binaryop.BinaryOperator.LESS_EQUAL, +} + + +Transformer: TypeAlias = GenericTransformer[expr.Expr, plc_expr.Expression] + + +@singledispatch +def _to_ast(node: expr.Expr, self: Transformer) -> plc_expr.Expression: + """ + Translate an expression to a pylibcudf Expression. + + Parameters + ---------- + node + Expression to translate. + self + Recursive transformer. The state dictionary should contain a + `for_parquet` key indicating if this transformation should + provide an expression suitable for use in parquet filters. + + If `for_parquet` is `False`, the dictionary should contain a + `name_to_index` mapping that maps column names to their + integer index in the table that will be used for evaluation of + the expression. + + Returns + ------- + pylibcudf Expression. + + Raises + ------ + NotImplementedError or KeyError if the expression cannot be translated. + """ + raise NotImplementedError(f"Unhandled expression type {type(node)}") + + +@_to_ast.register +def _(node: expr.Col, self: Transformer) -> plc_expr.Expression: + if self.state["for_parquet"]: + return plc_expr.ColumnNameReference(node.name) + raise TypeError("Should always be wrapped in a ColRef node before translation") + + +@_to_ast.register +def _(node: expr.ColRef, self: Transformer) -> plc_expr.Expression: + if self.state["for_parquet"]: + raise TypeError("Not expecting ColRef node in parquet filter") + return plc_expr.ColumnReference(node.index, node.table_ref) + + +@_to_ast.register +def _(node: expr.Literal, self: Transformer) -> plc_expr.Expression: + return plc_expr.Literal(plc.interop.from_arrow(node.value)) + + +@_to_ast.register +def _(node: expr.BinOp, self: Transformer) -> plc_expr.Expression: + if node.op == plc.binaryop.BinaryOperator.NULL_NOT_EQUALS: + return plc_expr.Operation( + plc_expr.ASTOperator.NOT, + self( + # Reconstruct and apply, rather than directly + # constructing the right expression so we get the + # handling of parquet special cases for free. 
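+                # i.e. NULL_NOT_EQUALS(a, b) becomes NOT(NULL_EQUALS(a, b));
+                # BINOP_TO_ASTOP has no null-aware not-equal to map to.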
+                expr.BinOp(
+                    node.dtype, plc.binaryop.BinaryOperator.NULL_EQUALS, *node.children
+                )
+            ),
+        )
+    if self.state["for_parquet"]:
+        op1_col, op2_col = (isinstance(op, expr.Col) for op in node.children)
+        if op1_col ^ op2_col:
+            op = node.op
+            if op not in SUPPORTED_STATISTICS_BINOPS:
+                raise NotImplementedError(
+                    f"Parquet filter binop with column doesn't support {node.op!r}"
+                )
+            op1, op2 = node.children
+            if op2_col:
+                (op1, op2) = (op2, op1)
+                op = REVERSED_COMPARISON[op]
+            if not isinstance(op2, expr.Literal):
+                raise NotImplementedError(
+                    "Parquet filter binops must have form 'col binop literal'"
+                )
+            return plc_expr.Operation(BINOP_TO_ASTOP[op], self(op1), self(op2))
+        elif op1_col and op2_col:
+            raise NotImplementedError(
+                "Parquet filter binops must have one column reference not two"
+            )
+    return plc_expr.Operation(BINOP_TO_ASTOP[node.op], *map(self, node.children))
+
+
+@_to_ast.register
+def _(node: expr.BooleanFunction, self: Transformer) -> plc_expr.Expression:
+    if node.name == pl_expr.BooleanFunction.IsIn:
+        needles, haystack = node.children
+        if isinstance(haystack, expr.LiteralColumn) and len(haystack.value) < 16:
+            # 16 is an arbitrary limit
+            needle_ref = self(needles)
+            values = [
+                plc_expr.Literal(plc.interop.from_arrow(v)) for v in haystack.value
+            ]
+            return reduce(
+                partial(plc_expr.Operation, plc_expr.ASTOperator.LOGICAL_OR),
+                (
+                    plc_expr.Operation(plc_expr.ASTOperator.EQUAL, needle_ref, value)
+                    for value in values
+                ),
+            )
+    if self.state["for_parquet"] and isinstance(node.children[0], expr.Col):
+        raise NotImplementedError(
+            f"Parquet filters don't support {node.name} on columns"
+        )
+    if node.name == pl_expr.BooleanFunction.IsNull:
+        return plc_expr.Operation(plc_expr.ASTOperator.IS_NULL, self(node.children[0]))
+    elif node.name == pl_expr.BooleanFunction.IsNotNull:
+        return plc_expr.Operation(
+            plc_expr.ASTOperator.NOT,
+            plc_expr.Operation(plc_expr.ASTOperator.IS_NULL, self(node.children[0])),
+        )
+    elif node.name == pl_expr.BooleanFunction.Not:
+        return plc_expr.Operation(plc_expr.ASTOperator.NOT, self(node.children[0]))
+    raise NotImplementedError(f"AST conversion does not support {node.name}")
+
+
+@_to_ast.register
+def _(node: expr.UnaryFunction, self: Transformer) -> plc_expr.Expression:
+    if isinstance(node.children[0], expr.Col) and self.state["for_parquet"]:
+        raise NotImplementedError(
+            f"Parquet filters don't support {node.name} on columns"
+        )
+    return plc_expr.Operation(
+        UOP_TO_ASTOP[node._OP_MAPPING[node.name]], self(node.children[0])
+    )
+
+
+def to_parquet_filter(node: expr.Expr) -> plc_expr.Expression | None:
+    """
+    Convert an expression to libcudf AST nodes suitable for parquet filtering.
+
+    Parameters
+    ----------
+    node
+        Expression to convert.
+
+    Returns
+    -------
+    pylibcudf Expression if conversion is possible, otherwise None.
+    """
+    mapper = CachingVisitor(_to_ast, state={"for_parquet": True})
+    try:
+        return mapper(node)
+    except (KeyError, NotImplementedError):
+        return None
+
+
+def to_ast(node: expr.Expr) -> plc_expr.Expression | None:
+    """
+    Convert an expression to libcudf AST nodes suitable for compute_column.
+
+    Parameters
+    ----------
+    node
+        Expression to convert.
+
+    Notes
+    -----
+    `Col` nodes must always be wrapped in `TableRef` nodes when
+    converting to an ast expression so that their table reference and
+    index are provided.
+
+    Returns
+    -------
+    pylibcudf Expression if conversion is possible, otherwise None.
+ """ + mapper = CachingVisitor(_to_ast, state={"for_parquet": False}) + try: + return mapper(node) + except (KeyError, NotImplementedError): + return None + + +def _insert_colrefs(node: expr.Expr, rec: ExprTransformer) -> expr.Expr: + if isinstance(node, expr.Col): + return expr.ColRef( + node.dtype, + rec.state["name_to_index"][node.name], + rec.state["table_ref"], + node, + ) + return reuse_if_unchanged(node, rec) + + +def insert_colrefs( + node: expr.Expr, + *, + table_ref: plc.expressions.TableReference, + name_to_index: Mapping[str, int], +) -> expr.Expr: + """ + Insert column references into an expression before conversion to libcudf AST. + + Parameters + ---------- + node + Expression to insert references into. + table_ref + pylibcudf `TableReference` indicating whether column + references are coming from the left or right table. + name_to_index: + Mapping from column names to column indices in the table + eventually used for evaluation. + + Notes + ----- + All column references are wrapped in the same, singular, table + reference, so this function relies on the expression only + containing column references from a single table. + + Returns + ------- + New expression with column references inserted. + """ + mapper = CachingVisitor( + _insert_colrefs, state={"table_ref": table_ref, "name_to_index": name_to_index} + ) + return mapper(node) diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py index a0291037f01..12fc2a196cd 100644 --- a/python/cudf_polars/cudf_polars/dsl/translate.py +++ b/python/cudf_polars/cudf_polars/dsl/translate.py @@ -5,24 +5,148 @@ from __future__ import annotations +import functools import json from contextlib import AbstractContextManager, nullcontext from functools import singledispatch -from typing import Any +from typing import TYPE_CHECKING, Any import pyarrow as pa -import pylibcudf as plc from typing_extensions import assert_never import polars as pl import polars.polars as plrs from polars.polars import _expr_nodes as pl_expr, _ir_nodes as pl_ir +import pylibcudf as plc + from cudf_polars.dsl import expr, ir +from cudf_polars.dsl.to_ast import insert_colrefs from cudf_polars.typing import NodeTraverser -from cudf_polars.utils import dtypes +from cudf_polars.utils import dtypes, sorting + +if TYPE_CHECKING: + from polars import GPUEngine + + from cudf_polars.typing import NodeTraverser + +__all__ = ["Translator", "translate_named_expr"] + + +class Translator: + """ + Translates polars-internal IR nodes and expressions to our representation. -__all__ = ["translate_ir", "translate_named_expr"] + Parameters + ---------- + visitor + Polars NodeTraverser object + config + GPU engine configuration. + """ + + def __init__(self, visitor: NodeTraverser, config: GPUEngine): + self.visitor = visitor + self.config = config + self.errors: list[Exception] = [] + + def translate_ir(self, *, n: int | None = None) -> ir.IR: + """ + Translate a polars-internal IR node to our representation. + + Parameters + ---------- + visitor + Polars NodeTraverser object + n + Optional node to start traversing from, if not provided uses + current polars-internal node. + + Returns + ------- + Translated IR object + + Raises + ------ + NotImplementedError + If the version of Polars IR is unsupported. + + Notes + ----- + Any expression nodes that cannot be translated are replaced by + :class:`expr.ErrorNode` nodes and collected in the the `errors` attribute. 
+        After translation is complete, this list of errors should be inspected
+        to determine if the query is supported.
+        """
+        ctx: AbstractContextManager[None] = (
+            set_node(self.visitor, n) if n is not None else noop_context
+        )
+        # IR is versioned with major.minor, minor is bumped for backwards
+        # compatible changes (e.g. adding new nodes), major is bumped for
+        # incompatible changes (e.g. renaming nodes).
+        if (version := self.visitor.version()) >= (4, 0):
+            e = NotImplementedError(
+                f"No support for polars IR {version=}"
+            )  # pragma: no cover; no such version for now.
+            self.errors.append(e)  # pragma: no cover
+            raise e  # pragma: no cover
+
+        with ctx:
+            polars_schema = self.visitor.get_schema()
+            try:
+                schema = {k: dtypes.from_polars(v) for k, v in polars_schema.items()}
+            except Exception as e:
+                self.errors.append(NotImplementedError(str(e)))
+                return ir.ErrorNode({}, str(e))
+            try:
+                node = self.visitor.view_current_node()
+            except Exception as e:
+                self.errors.append(e)
+                return ir.ErrorNode(schema, str(e))
+            try:
+                result = _translate_ir(node, self, schema)
+            except Exception as e:
+                self.errors.append(e)
+                return ir.ErrorNode(schema, str(e))
+            if any(
+                isinstance(dtype, pl.Null)
+                for dtype in pl.datatypes.unpack_dtypes(*polars_schema.values())
+            ):
+                error = NotImplementedError(
+                    f"No GPU support for {result} with Null column dtype."
+                )
+                self.errors.append(error)
+                return ir.ErrorNode(schema, str(error))
+
+            return result
+
+    def translate_expr(self, *, n: int) -> expr.Expr:
+        """
+        Translate a polars-internal expression IR into our representation.
+
+        Parameters
+        ----------
+        n
+            Node to translate, an integer referencing a polars internal node.
+
+        Returns
+        -------
+        Translated IR object.
+
+        Notes
+        -----
+        Any expression nodes that cannot be translated are replaced by
+        :class:`expr.ErrorExpr` nodes and collected in the `errors` attribute.
+        After translation is complete, this list of errors should be inspected
+        to determine if the query is supported.
+ """ + node = self.visitor.view_expression(n) + dtype = dtypes.from_polars(self.visitor.get_dtype(n)) + try: + return _translate_expr(node, self, dtype) + except Exception as e: + self.errors.append(e) + return expr.ErrorExpr(dtype, str(e)) class set_node(AbstractContextManager[None]): @@ -64,7 +188,7 @@ def __exit__(self, *args: Any) -> None: @singledispatch def _translate_ir( - node: Any, visitor: NodeTraverser, schema: dict[str, plc.DataType] + node: Any, translator: Translator, schema: dict[str, plc.DataType] ) -> ir.IR: raise NotImplementedError( f"Translation for {type(node).__name__}" @@ -73,19 +197,19 @@ def _translate_ir( @_translate_ir.register def _( - node: pl_ir.PythonScan, visitor: NodeTraverser, schema: dict[str, plc.DataType] + node: pl_ir.PythonScan, translator: Translator, schema: dict[str, plc.DataType] ) -> ir.IR: scan_fn, with_columns, source_type, predicate, nrows = node.options options = (scan_fn, with_columns, source_type, nrows) predicate = ( - translate_named_expr(visitor, n=predicate) if predicate is not None else None + translate_named_expr(translator, n=predicate) if predicate is not None else None ) return ir.PythonScan(schema, options, predicate) @_translate_ir.register def _( - node: pl_ir.Scan, visitor: NodeTraverser, schema: dict[str, plc.DataType] + node: pl_ir.Scan, translator: Translator, schema: dict[str, plc.DataType] ) -> ir.IR: typ, *options = node.scan_type if typ == "ndjson": @@ -109,12 +233,13 @@ def _( typ, reader_options, cloud_options, + translator.config.config.copy(), node.paths, with_columns, skip_rows, n_rows, row_index, - translate_named_expr(visitor, n=node.predicate) + translate_named_expr(translator, n=node.predicate) if node.predicate is not None else None, ) @@ -122,20 +247,20 @@ def _( @_translate_ir.register def _( - node: pl_ir.Cache, visitor: NodeTraverser, schema: dict[str, plc.DataType] + node: pl_ir.Cache, translator: Translator, schema: dict[str, plc.DataType] ) -> ir.IR: - return ir.Cache(schema, node.id_, translate_ir(visitor, n=node.input)) + return ir.Cache(schema, node.id_, translator.translate_ir(n=node.input)) @_translate_ir.register def _( - node: pl_ir.DataFrameScan, visitor: NodeTraverser, schema: dict[str, plc.DataType] + node: pl_ir.DataFrameScan, translator: Translator, schema: dict[str, plc.DataType] ) -> ir.IR: return ir.DataFrameScan( schema, node.df, node.projection, - translate_named_expr(visitor, n=node.selection) + translate_named_expr(translator, n=node.selection) if node.selection is not None else None, ) @@ -143,203 +268,216 @@ def _( @_translate_ir.register def _( - node: pl_ir.Select, visitor: NodeTraverser, schema: dict[str, plc.DataType] + node: pl_ir.Select, translator: Translator, schema: dict[str, plc.DataType] ) -> ir.IR: - with set_node(visitor, node.input): - inp = translate_ir(visitor, n=None) - exprs = [translate_named_expr(visitor, n=e) for e in node.expr] - return ir.Select(schema, inp, exprs, node.should_broadcast) + with set_node(translator.visitor, node.input): + inp = translator.translate_ir(n=None) + exprs = [translate_named_expr(translator, n=e) for e in node.expr] + return ir.Select(schema, exprs, node.should_broadcast, inp) @_translate_ir.register def _( - node: pl_ir.GroupBy, visitor: NodeTraverser, schema: dict[str, plc.DataType] + node: pl_ir.GroupBy, translator: Translator, schema: dict[str, plc.DataType] ) -> ir.IR: - with set_node(visitor, node.input): - inp = translate_ir(visitor, n=None) - aggs = [translate_named_expr(visitor, n=e) for e in node.aggs] - keys = 
[translate_named_expr(visitor, n=e) for e in node.keys] + with set_node(translator.visitor, node.input): + inp = translator.translate_ir(n=None) + aggs = [translate_named_expr(translator, n=e) for e in node.aggs] + keys = [translate_named_expr(translator, n=e) for e in node.keys] return ir.GroupBy( schema, - inp, - aggs, keys, + aggs, node.maintain_order, node.options, + inp, ) @_translate_ir.register def _( - node: pl_ir.Join, visitor: NodeTraverser, schema: dict[str, plc.DataType] + node: pl_ir.Join, translator: Translator, schema: dict[str, plc.DataType] ) -> ir.IR: # Join key dtypes are dependent on the schema of the left and # right inputs, so these must be translated with the relevant # input active. - with set_node(visitor, node.input_left): - inp_left = translate_ir(visitor, n=None) - left_on = [translate_named_expr(visitor, n=e) for e in node.left_on] - with set_node(visitor, node.input_right): - inp_right = translate_ir(visitor, n=None) - right_on = [translate_named_expr(visitor, n=e) for e in node.right_on] - return ir.Join(schema, inp_left, inp_right, left_on, right_on, node.options) + with set_node(translator.visitor, node.input_left): + inp_left = translator.translate_ir(n=None) + left_on = [translate_named_expr(translator, n=e) for e in node.left_on] + with set_node(translator.visitor, node.input_right): + inp_right = translator.translate_ir(n=None) + right_on = [translate_named_expr(translator, n=e) for e in node.right_on] + if (how := node.options[0]) in { + "inner", + "left", + "right", + "full", + "cross", + "semi", + "anti", + }: + return ir.Join(schema, left_on, right_on, node.options, inp_left, inp_right) + else: + how, op1, op2 = how + if how != "ie_join": + raise NotImplementedError( + f"Unsupported join type {how}" + ) # pragma: no cover; asof joins not yet exposed + if op2 is None: + ops = [op1] + else: + ops = [op1, op2] + + dtype = plc.DataType(plc.TypeId.BOOL8) + predicate = functools.reduce( + functools.partial( + expr.BinOp, dtype, plc.binaryop.BinaryOperator.LOGICAL_AND + ), + ( + expr.BinOp( + dtype, + expr.BinOp._MAPPING[op], + insert_colrefs( + left.value, + table_ref=plc.expressions.TableReference.LEFT, + name_to_index={ + name: i for i, name in enumerate(inp_left.schema) + }, + ), + insert_colrefs( + right.value, + table_ref=plc.expressions.TableReference.RIGHT, + name_to_index={ + name: i for i, name in enumerate(inp_right.schema) + }, + ), + ) + for op, left, right in zip(ops, left_on, right_on, strict=True) + ), + ) + + return ir.ConditionalJoin(schema, predicate, node.options, inp_left, inp_right) @_translate_ir.register def _( - node: pl_ir.HStack, visitor: NodeTraverser, schema: dict[str, plc.DataType] + node: pl_ir.HStack, translator: Translator, schema: dict[str, plc.DataType] ) -> ir.IR: - with set_node(visitor, node.input): - inp = translate_ir(visitor, n=None) - exprs = [translate_named_expr(visitor, n=e) for e in node.exprs] - return ir.HStack(schema, inp, exprs, node.should_broadcast) + with set_node(translator.visitor, node.input): + inp = translator.translate_ir(n=None) + exprs = [translate_named_expr(translator, n=e) for e in node.exprs] + return ir.HStack(schema, exprs, node.should_broadcast, inp) @_translate_ir.register def _( - node: pl_ir.Reduce, visitor: NodeTraverser, schema: dict[str, plc.DataType] + node: pl_ir.Reduce, translator: Translator, schema: dict[str, plc.DataType] ) -> ir.IR: # pragma: no cover; polars doesn't emit this node yet - with set_node(visitor, node.input): - inp = translate_ir(visitor, n=None) - exprs = 
[translate_named_expr(visitor, n=e) for e in node.expr] - return ir.Reduce(schema, inp, exprs) + with set_node(translator.visitor, node.input): + inp = translator.translate_ir(n=None) + exprs = [translate_named_expr(translator, n=e) for e in node.expr] + return ir.Reduce(schema, exprs, inp) @_translate_ir.register def _( - node: pl_ir.Distinct, visitor: NodeTraverser, schema: dict[str, plc.DataType] + node: pl_ir.Distinct, translator: Translator, schema: dict[str, plc.DataType] ) -> ir.IR: + (keep, subset, maintain_order, zlice) = node.options + keep = ir.Distinct._KEEP_MAP[keep] + subset = frozenset(subset) if subset is not None else None return ir.Distinct( schema, - translate_ir(visitor, n=node.input), - node.options, + keep, + subset, + zlice, + maintain_order, + translator.translate_ir(n=node.input), ) @_translate_ir.register def _( - node: pl_ir.Sort, visitor: NodeTraverser, schema: dict[str, plc.DataType] + node: pl_ir.Sort, translator: Translator, schema: dict[str, plc.DataType] ) -> ir.IR: - with set_node(visitor, node.input): - inp = translate_ir(visitor, n=None) - by = [translate_named_expr(visitor, n=e) for e in node.by_column] - return ir.Sort(schema, inp, by, node.sort_options, node.slice) + with set_node(translator.visitor, node.input): + inp = translator.translate_ir(n=None) + by = [translate_named_expr(translator, n=e) for e in node.by_column] + stable, nulls_last, descending = node.sort_options + order, null_order = sorting.sort_order( + descending, nulls_last=nulls_last, num_keys=len(by) + ) + return ir.Sort(schema, by, order, null_order, stable, node.slice, inp) @_translate_ir.register def _( - node: pl_ir.Slice, visitor: NodeTraverser, schema: dict[str, plc.DataType] + node: pl_ir.Slice, translator: Translator, schema: dict[str, plc.DataType] ) -> ir.IR: - return ir.Slice(schema, translate_ir(visitor, n=node.input), node.offset, node.len) + return ir.Slice( + schema, node.offset, node.len, translator.translate_ir(n=node.input) + ) @_translate_ir.register def _( - node: pl_ir.Filter, visitor: NodeTraverser, schema: dict[str, plc.DataType] + node: pl_ir.Filter, translator: Translator, schema: dict[str, plc.DataType] ) -> ir.IR: - with set_node(visitor, node.input): - inp = translate_ir(visitor, n=None) - mask = translate_named_expr(visitor, n=node.predicate) - return ir.Filter(schema, inp, mask) + with set_node(translator.visitor, node.input): + inp = translator.translate_ir(n=None) + mask = translate_named_expr(translator, n=node.predicate) + return ir.Filter(schema, mask, inp) @_translate_ir.register def _( node: pl_ir.SimpleProjection, - visitor: NodeTraverser, + translator: Translator, schema: dict[str, plc.DataType], ) -> ir.IR: - return ir.Projection(schema, translate_ir(visitor, n=node.input)) + return ir.Projection(schema, translator.translate_ir(n=node.input)) @_translate_ir.register def _( - node: pl_ir.MapFunction, visitor: NodeTraverser, schema: dict[str, plc.DataType] + node: pl_ir.MapFunction, translator: Translator, schema: dict[str, plc.DataType] ) -> ir.IR: name, *options = node.function return ir.MapFunction( schema, - # TODO: merge_sorted breaks this pattern - translate_ir(visitor, n=node.input), name, options, + # TODO: merge_sorted breaks this pattern + translator.translate_ir(n=node.input), ) @_translate_ir.register def _( - node: pl_ir.Union, visitor: NodeTraverser, schema: dict[str, plc.DataType] + node: pl_ir.Union, translator: Translator, schema: dict[str, plc.DataType] ) -> ir.IR: return ir.Union( - schema, [translate_ir(visitor, n=n) for n in 
node.inputs], node.options + schema, node.options, *(translator.translate_ir(n=n) for n in node.inputs) ) @_translate_ir.register def _( - node: pl_ir.HConcat, visitor: NodeTraverser, schema: dict[str, plc.DataType] + node: pl_ir.HConcat, translator: Translator, schema: dict[str, plc.DataType] ) -> ir.IR: - return ir.HConcat(schema, [translate_ir(visitor, n=n) for n in node.inputs]) - - -def translate_ir(visitor: NodeTraverser, *, n: int | None = None) -> ir.IR: - """ - Translate a polars-internal IR node to our representation. - - Parameters - ---------- - visitor - Polars NodeTraverser object - n - Optional node to start traversing from, if not provided uses - current polars-internal node. - - Returns - ------- - Translated IR object - - Raises - ------ - NotImplementedError - If we can't translate the nodes due to unsupported functionality. - """ - ctx: AbstractContextManager[None] = ( - set_node(visitor, n) if n is not None else noop_context - ) - # IR is versioned with major.minor, minor is bumped for backwards - # compatible changes (e.g. adding new nodes), major is bumped for - # incompatible changes (e.g. renaming nodes). - # Polars 1.7 changes definition of the CSV reader options schema name. - if (version := visitor.version()) >= (3, 0): - raise NotImplementedError( - f"No support for polars IR {version=}" - ) # pragma: no cover; no such version for now. - - with ctx: - polars_schema = visitor.get_schema() - node = visitor.view_current_node() - schema = {k: dtypes.from_polars(v) for k, v in polars_schema.items()} - result = _translate_ir(node, visitor, schema) - if any( - isinstance(dtype, pl.Null) - for dtype in pl.datatypes.unpack_dtypes(*polars_schema.values()) - ): - raise NotImplementedError( - f"No GPU support for {result} with Null column dtype." - ) - return result + return ir.HConcat(schema, *(translator.translate_ir(n=n) for n in node.inputs)) def translate_named_expr( - visitor: NodeTraverser, *, n: pl_expr.PyExprIR + translator: Translator, *, n: pl_expr.PyExprIR ) -> expr.NamedExpr: """ Translate a polars-internal named expression IR object into our representation. Parameters ---------- - visitor - Polars NodeTraverser object + translator + Translator object n Node to translate, a named expression node. @@ -359,12 +497,12 @@ def translate_named_expr( NotImplementedError If any translation fails due to unsupported functionality. 
""" - return expr.NamedExpr(n.output_name, translate_expr(visitor, n=n.node)) + return expr.NamedExpr(n.output_name, translator.translate_expr(n=n.node)) @singledispatch def _translate_expr( - node: Any, visitor: NodeTraverser, dtype: plc.DataType + node: Any, translator: Translator, dtype: plc.DataType ) -> expr.Expr: raise NotImplementedError( f"Translation for {type(node).__name__}" @@ -372,7 +510,7 @@ def _translate_expr( @_translate_expr.register -def _(node: pl_expr.Function, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr: +def _(node: pl_expr.Function, translator: Translator, dtype: plc.DataType) -> expr.Expr: name, *options = node.function_data options = tuple(options) if isinstance(name, pl_expr.StringFunction): @@ -381,7 +519,7 @@ def _(node: pl_expr.Function, visitor: NodeTraverser, dtype: plc.DataType) -> ex pl_expr.StringFunction.StripCharsStart, pl_expr.StringFunction.StripCharsEnd, }: - column, chars = (translate_expr(visitor, n=n) for n in node.input) + column, chars = (translator.translate_expr(n=n) for n in node.input) if isinstance(chars, expr.Literal): if chars.value == pa.scalar(""): # No-op in polars, but libcudf uses empty string @@ -398,11 +536,11 @@ def _(node: pl_expr.Function, visitor: NodeTraverser, dtype: plc.DataType) -> ex dtype, name, options, - *(translate_expr(visitor, n=n) for n in node.input), + *(translator.translate_expr(n=n) for n in node.input), ) elif isinstance(name, pl_expr.BooleanFunction): if name == pl_expr.BooleanFunction.IsBetween: - column, lo, hi = (translate_expr(visitor, n=n) for n in node.input) + column, lo, hi = (translator.translate_expr(n=n) for n in node.input) (closed,) = options lop, rop = expr.BooleanFunction._BETWEEN_OPS[closed] return expr.BinOp( @@ -415,7 +553,7 @@ def _(node: pl_expr.Function, visitor: NodeTraverser, dtype: plc.DataType) -> ex dtype, name, options, - *(translate_expr(visitor, n=n) for n in node.input), + *(translator.translate_expr(n=n) for n in node.input), ) elif isinstance(name, pl_expr.TemporalFunction): # functions for which evaluation of the expression may not return @@ -435,14 +573,14 @@ def _(node: pl_expr.Function, visitor: NodeTraverser, dtype: plc.DataType) -> ex dtype, name, options, - *(translate_expr(visitor, n=n) for n in node.input), + *(translator.translate_expr(n=n) for n in node.input), ) if name in needs_cast: return expr.Cast(dtype, result_expr) return result_expr elif isinstance(name, str): - children = (translate_expr(visitor, n=n) for n in node.input) + children = (translator.translate_expr(n=n) for n in node.input) if name == "log": (base,) = options (child,) = children @@ -461,26 +599,26 @@ def _(node: pl_expr.Function, visitor: NodeTraverser, dtype: plc.DataType) -> ex @_translate_expr.register -def _(node: pl_expr.Window, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr: +def _(node: pl_expr.Window, translator: Translator, dtype: plc.DataType) -> expr.Expr: # TODO: raise in groupby? if isinstance(node.options, pl_expr.RollingGroupOptions): # pl.col("a").rolling(...) return expr.RollingWindow( - dtype, node.options, translate_expr(visitor, n=node.function) + dtype, node.options, translator.translate_expr(n=node.function) ) elif isinstance(node.options, pl_expr.WindowMapping): # pl.col("a").over(...) 
return expr.GroupedRollingWindow( dtype, node.options, - translate_expr(visitor, n=node.function), - *(translate_expr(visitor, n=n) for n in node.partition_by), + translator.translate_expr(n=node.function), + *(translator.translate_expr(n=n) for n in node.partition_by), ) assert_never(node.options) @_translate_expr.register -def _(node: pl_expr.Literal, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr: +def _(node: pl_expr.Literal, translator: Translator, dtype: plc.DataType) -> expr.Expr: if isinstance(node.value, plrs.PySeries): return expr.LiteralColumn(dtype, pl.Series._from_pyseries(node.value)) value = pa.scalar(node.value, type=plc.interop.to_arrow(dtype)) @@ -488,42 +626,42 @@ def _(node: pl_expr.Literal, visitor: NodeTraverser, dtype: plc.DataType) -> exp @_translate_expr.register -def _(node: pl_expr.Sort, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr: +def _(node: pl_expr.Sort, translator: Translator, dtype: plc.DataType) -> expr.Expr: # TODO: raise in groupby - return expr.Sort(dtype, node.options, translate_expr(visitor, n=node.expr)) + return expr.Sort(dtype, node.options, translator.translate_expr(n=node.expr)) @_translate_expr.register -def _(node: pl_expr.SortBy, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr: +def _(node: pl_expr.SortBy, translator: Translator, dtype: plc.DataType) -> expr.Expr: return expr.SortBy( dtype, node.sort_options, - translate_expr(visitor, n=node.expr), - *(translate_expr(visitor, n=n) for n in node.by), + translator.translate_expr(n=node.expr), + *(translator.translate_expr(n=n) for n in node.by), ) @_translate_expr.register -def _(node: pl_expr.Gather, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr: +def _(node: pl_expr.Gather, translator: Translator, dtype: plc.DataType) -> expr.Expr: return expr.Gather( dtype, - translate_expr(visitor, n=node.expr), - translate_expr(visitor, n=node.idx), + translator.translate_expr(n=node.expr), + translator.translate_expr(n=node.idx), ) @_translate_expr.register -def _(node: pl_expr.Filter, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr: +def _(node: pl_expr.Filter, translator: Translator, dtype: plc.DataType) -> expr.Expr: return expr.Filter( dtype, - translate_expr(visitor, n=node.input), - translate_expr(visitor, n=node.by), + translator.translate_expr(n=node.input), + translator.translate_expr(n=node.by), ) @_translate_expr.register -def _(node: pl_expr.Cast, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr: - inner = translate_expr(visitor, n=node.expr) +def _(node: pl_expr.Cast, translator: Translator, dtype: plc.DataType) -> expr.Expr: + inner = translator.translate_expr(n=node.expr) # Push casts into literals so we can handle Cast(Literal(Null)) if isinstance(inner, expr.Literal): return expr.Literal(dtype, inner.value.cast(plc.interop.to_arrow(dtype))) @@ -535,17 +673,17 @@ def _(node: pl_expr.Cast, visitor: NodeTraverser, dtype: plc.DataType) -> expr.E @_translate_expr.register -def _(node: pl_expr.Column, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr: +def _(node: pl_expr.Column, translator: Translator, dtype: plc.DataType) -> expr.Expr: return expr.Col(dtype, node.name) @_translate_expr.register -def _(node: pl_expr.Agg, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr: +def _(node: pl_expr.Agg, translator: Translator, dtype: plc.DataType) -> expr.Expr: value = expr.Agg( dtype, node.name, node.options, - *(translate_expr(visitor, n=n) for n in node.arguments), + *(translator.translate_expr(n=n) for n in 
node.arguments), ) if value.name == "count" and value.dtype.id() != plc.TypeId.INT32: return expr.Cast(value.dtype, value) @@ -553,55 +691,30 @@ def _(node: pl_expr.Agg, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Ex @_translate_expr.register -def _(node: pl_expr.Ternary, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr: +def _(node: pl_expr.Ternary, translator: Translator, dtype: plc.DataType) -> expr.Expr: return expr.Ternary( dtype, - translate_expr(visitor, n=node.predicate), - translate_expr(visitor, n=node.truthy), - translate_expr(visitor, n=node.falsy), + translator.translate_expr(n=node.predicate), + translator.translate_expr(n=node.truthy), + translator.translate_expr(n=node.falsy), ) @_translate_expr.register def _( - node: pl_expr.BinaryExpr, visitor: NodeTraverser, dtype: plc.DataType + node: pl_expr.BinaryExpr, translator: Translator, dtype: plc.DataType ) -> expr.Expr: return expr.BinOp( dtype, expr.BinOp._MAPPING[node.op], - translate_expr(visitor, n=node.left), - translate_expr(visitor, n=node.right), + translator.translate_expr(n=node.left), + translator.translate_expr(n=node.right), ) @_translate_expr.register -def _(node: pl_expr.Len, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr: +def _(node: pl_expr.Len, translator: Translator, dtype: plc.DataType) -> expr.Expr: value = expr.Len(dtype) if dtype.id() != plc.TypeId.INT32: return expr.Cast(dtype, value) return value # pragma: no cover; never reached since polars len has uint32 dtype - - -def translate_expr(visitor: NodeTraverser, *, n: int) -> expr.Expr: - """ - Translate a polars-internal expression IR into our representation. - - Parameters - ---------- - visitor - Polars NodeTraverser object - n - Node to translate, an integer referencing a polars internal node. - - Returns - ------- - Translated IR object. - - Raises - ------ - NotImplementedError - If any translation fails due to unsupported functionality. - """ - node = visitor.view_expression(n) - dtype = dtypes.from_polars(visitor.get_dtype(n)) - return _translate_expr(node, visitor, dtype) diff --git a/python/cudf_polars/cudf_polars/dsl/traversal.py b/python/cudf_polars/cudf_polars/dsl/traversal.py new file mode 100644 index 00000000000..be8338cb9a9 --- /dev/null +++ b/python/cudf_polars/cudf_polars/dsl/traversal.py @@ -0,0 +1,175 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 + +"""Traversal and visitor utilities for nodes.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, Any, Generic + +from cudf_polars.typing import U_contra, V_co + +if TYPE_CHECKING: + from collections.abc import Callable, Generator, Mapping, MutableMapping + + from cudf_polars.typing import GenericTransformer, NodeT + + +__all__: list[str] = [ + "traversal", + "reuse_if_unchanged", + "make_recursive", + "CachingVisitor", +] + + +def traversal(node: NodeT) -> Generator[NodeT, None, None]: + """ + Pre-order traversal of nodes in an expression. + + Parameters + ---------- + node + Root of expression to traverse. + + Yields + ------ + Unique nodes in the expression, parent before child, children + in-order from left to right. 
+ """ + seen = {node} + lifo = [node] + + while lifo: + node = lifo.pop() + yield node + for child in reversed(node.children): + if child not in seen: + seen.add(child) + lifo.append(child) + + +def reuse_if_unchanged(node: NodeT, fn: GenericTransformer[NodeT, NodeT]) -> NodeT: + """ + Recipe for transforming nodes that returns the old object if unchanged. + + Parameters + ---------- + node + Node to recurse on + fn + Function to transform children + + Notes + ----- + This can be used as a generic "base case" handler when + writing transforms that take nodes and produce new nodes. + + Returns + ------- + Existing node `e` if transformed children are unchanged, otherwise + reconstructed node with new children. + """ + new_children = [fn(c) for c in node.children] + if all(new == old for new, old in zip(new_children, node.children, strict=True)): + return node + return node.reconstruct(new_children) + + +def make_recursive( + fn: Callable[[U_contra, GenericTransformer[U_contra, V_co]], V_co], + *, + state: Mapping[str, Any] | None = None, +) -> GenericTransformer[U_contra, V_co]: + """ + No-op wrapper for recursive visitors. + + Facilitates using visitors that don't need caching but are written + in the same style. + + Parameters + ---------- + fn + Function to transform inputs to outputs. Should take as its + second argument a callable from input to output. + state + Arbitrary *immutable* state that should be accessible to the + visitor through the `state` property. + + Notes + ----- + All transformation functions *must* be free of side-effects. + + Usually, prefer a :class:`CachingVisitor`, but if we know that we + don't need caching in a transformation and then this no-op + approach is slightly cheaper. + + Returns + ------- + Recursive function without caching. + + See Also + -------- + CachingVisitor + """ + + def rec(node: U_contra) -> V_co: + return fn(node, rec) # type: ignore[arg-type] + + rec.state = state if state is not None else {} # type: ignore[attr-defined] + return rec # type: ignore[return-value] + + +class CachingVisitor(Generic[U_contra, V_co]): + """ + Caching wrapper for recursive visitors. + + Facilitates writing visitors where already computed results should + be cached and reused. The cache is managed automatically, and is + tied to the lifetime of the wrapper. + + Parameters + ---------- + fn + Function to transform inputs to outputs. Should take as its + second argument the recursive cache manager. + state + Arbitrary *immutable* state that should be accessible to the + visitor through the `state` property. + + Notes + ----- + All transformation functions *must* be free of side-effects. + + Returns + ------- + Recursive function with caching. + """ + + def __init__( + self, + fn: Callable[[U_contra, GenericTransformer[U_contra, V_co]], V_co], + *, + state: Mapping[str, Any] | None = None, + ) -> None: + self.fn = fn + self.cache: MutableMapping[U_contra, V_co] = {} + self.state = state if state is not None else {} + + def __call__(self, value: U_contra) -> V_co: + """ + Apply the function to a value. + + Parameters + ---------- + value + The value to transform. + + Returns + ------- + A transformed value. 
+ """ + try: + return self.cache[value] + except KeyError: + return self.cache.setdefault(value, self.fn(value, self)) diff --git a/python/cudf_polars/cudf_polars/experimental/__init__.py b/python/cudf_polars/cudf_polars/experimental/__init__.py new file mode 100644 index 00000000000..6fd93bf5157 --- /dev/null +++ b/python/cudf_polars/cudf_polars/experimental/__init__.py @@ -0,0 +1,8 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 + +"""Experimental features, which can change without any deprecation period.""" + +from __future__ import annotations + +__all__: list[str] = [] diff --git a/python/cudf_polars/cudf_polars/experimental/dask_serialize.py b/python/cudf_polars/cudf_polars/experimental/dask_serialize.py new file mode 100644 index 00000000000..aae78e07690 --- /dev/null +++ b/python/cudf_polars/cudf_polars/experimental/dask_serialize.py @@ -0,0 +1,59 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 + +"""Dask serialization.""" + +from __future__ import annotations + +from distributed.protocol import dask_deserialize, dask_serialize +from distributed.protocol.cuda import cuda_deserialize, cuda_serialize +from distributed.utils import log_errors + +import pylibcudf as plc +import rmm + +from cudf_polars.containers import DataFrame + +__all__ = ["register"] + + +def register() -> None: + """Register dask serialization routines for DataFrames.""" + + @cuda_serialize.register(DataFrame) + def _(x: DataFrame): + with log_errors(): + header, frames = x.serialize() + return header, list(frames) # Dask expect a list of frames + + @cuda_deserialize.register(DataFrame) + def _(header, frames): + with log_errors(): + assert len(frames) == 2 + return DataFrame.deserialize(header, tuple(frames)) + + @dask_serialize.register(DataFrame) + def _(x: DataFrame): + with log_errors(): + header, (metadata, gpudata) = x.serialize() + + # For robustness, we check that the gpu data is contiguous + cai = gpudata.__cuda_array_interface__ + assert len(cai["shape"]) == 1 + assert cai["strides"] is None or cai["strides"] == (1,) + assert cai["typestr"] == "|u1" + nbytes = cai["shape"][0] + + # Copy the gpudata to host memory + gpudata_on_host = memoryview( + rmm.DeviceBuffer(ptr=gpudata.ptr, size=nbytes).copy_to_host() + ) + return header, (metadata, gpudata_on_host) + + @dask_deserialize.register(DataFrame) + def _(header, frames) -> DataFrame: + with log_errors(): + assert len(frames) == 2 + # Copy the second frame (the gpudata in host memory) back to the gpu + frames = frames[0], plc.gpumemoryview(rmm.DeviceBuffer.to_device(frames[1])) + return DataFrame.deserialize(header, frames) diff --git a/python/cudf_polars/cudf_polars/testing/asserts.py b/python/cudf_polars/cudf_polars/testing/asserts.py index 7b6f3848fc4..ba0bb12a0fb 100644 --- a/python/cudf_polars/cudf_polars/testing/asserts.py +++ b/python/cudf_polars/cudf_polars/testing/asserts.py @@ -10,7 +10,7 @@ from polars import GPUEngine from polars.testing.asserts import assert_frame_equal -from cudf_polars.dsl.translate import translate_ir +from cudf_polars.dsl.translate import Translator if TYPE_CHECKING: import polars as pl @@ -23,6 +23,7 @@ def assert_gpu_result_equal( lazydf: pl.LazyFrame, *, + engine: GPUEngine | None = None, collect_kwargs: dict[OptimizationArgs, bool] | None = None, polars_collect_kwargs: dict[OptimizationArgs, bool] | None = None, cudf_collect_kwargs: dict[OptimizationArgs, bool] | None = 
None, @@ -41,6 +42,8 @@ def assert_gpu_result_equal( ---------- lazydf frame to collect. + engine + Custom GPU engine configuration. collect_kwargs Common keyword arguments to pass to collect for both polars CPU and cudf-polars. @@ -76,12 +79,14 @@ def assert_gpu_result_equal( NotImplementedError If GPU collection failed in some way. """ + if engine is None: + engine = GPUEngine(raise_on_fail=True) + final_polars_collect_kwargs, final_cudf_collect_kwargs = _process_kwargs( collect_kwargs, polars_collect_kwargs, cudf_collect_kwargs ) expect = lazydf.collect(**final_polars_collect_kwargs) - engine = GPUEngine(raise_on_fail=True) got = lazydf.collect(**final_cudf_collect_kwargs, engine=engine) assert_frame_equal( expect, @@ -117,12 +122,14 @@ def assert_ir_translation_raises(q: pl.LazyFrame, *exceptions: type[Exception]) AssertionError If the specified exceptions were not raised. """ - try: - _ = translate_ir(q._ldf.visit()) - except exceptions: + translator = Translator(q._ldf.visit(), GPUEngine()) + translator.translate_ir() + if errors := translator.errors: + for err in errors: + assert any( + isinstance(err, err_type) for err_type in exceptions + ), f"Translation DID NOT RAISE {exceptions}" return - except Exception as e: - raise AssertionError(f"Translation DID NOT RAISE {exceptions}") from e else: raise AssertionError(f"Translation DID NOT RAISE {exceptions}") @@ -151,7 +158,7 @@ def assert_collect_raises( collect_kwargs: dict[OptimizationArgs, bool] | None = None, polars_collect_kwargs: dict[OptimizationArgs, bool] | None = None, cudf_collect_kwargs: dict[OptimizationArgs, bool] | None = None, -): +) -> None: """ Assert that collecting the result of a query raises the expected exceptions. diff --git a/python/cudf_polars/cudf_polars/testing/plugin.py b/python/cudf_polars/cudf_polars/testing/plugin.py index 05b76d76808..7a759eea2e9 100644 --- a/python/cudf_polars/cudf_polars/testing/plugin.py +++ b/python/cudf_polars/cudf_polars/testing/plugin.py @@ -16,7 +16,7 @@ from collections.abc import Mapping -def pytest_addoption(parser: pytest.Parser): +def pytest_addoption(parser: pytest.Parser) -> None: """Add plugin-specific options.""" group = parser.getgroup( "cudf-polars", "Plugin to set GPU as default engine for polars tests" @@ -28,7 +28,7 @@ def pytest_addoption(parser: pytest.Parser): ) -def pytest_configure(config: pytest.Config): +def pytest_configure(config: pytest.Config) -> None: """Enable use of this module as a pytest plugin to enable GPU collection.""" no_fallback = config.getoption("--cudf-polars-no-fallback") collect = polars.LazyFrame.collect @@ -40,25 +40,50 @@ def pytest_configure(config: pytest.Config): ) config.addinivalue_line( "filterwarnings", - "ignore:.*Query execution with GPU not supported", + "ignore:.*Query execution with GPU not possible", ) EXPECTED_FAILURES: Mapping[str, str] = { "tests/unit/io/test_csv.py::test_compressed_csv": "Need to determine if file is compressed", "tests/unit/io/test_csv.py::test_read_csv_only_loads_selected_columns": "Memory usage won't be correct due to GPU", + "tests/unit/io/test_delta.py::test_scan_delta_version": "Need to expose hive partitioning", + "tests/unit/io/test_delta.py::test_scan_delta_relative": "Need to expose hive partitioning", + "tests/unit/io/test_delta.py::test_read_delta_version": "Need to expose hive partitioning", "tests/unit/io/test_lazy_count_star.py::test_count_compressed_csv_18057": "Need to determine if file is compressed", "tests/unit/io/test_lazy_csv.py::test_scan_csv_slice_offset_zero": "Integer 
overflow in sliced read", "tests/unit/io/test_lazy_parquet.py::test_dsl2ir_cached_metadata[False]": "cudf-polars doesn't use metadata read by rust preprocessing", "tests/unit/io/test_lazy_parquet.py::test_parquet_is_in_statistics": "Debug output on stderr doesn't match", "tests/unit/io/test_lazy_parquet.py::test_parquet_statistics": "Debug output on stderr doesn't match", "tests/unit/io/test_lazy_parquet.py::test_parquet_different_schema[False]": "Needs cudf#16394", + "tests/unit/io/test_lazy_parquet.py::test_parquet_schema_arg[False-columns]": "Correctly raises but different error", + "tests/unit/io/test_lazy_parquet.py::test_parquet_schema_arg[False-row_groups]": "Correctly raises but different error", + "tests/unit/io/test_lazy_parquet.py::test_parquet_schema_arg[False-prefiltered]": "Correctly raises but different error", + "tests/unit/io/test_lazy_parquet.py::test_parquet_schema_arg[False-none]": "Correctly raises but different error", "tests/unit/io/test_lazy_parquet.py::test_parquet_schema_mismatch_panic_17067[False]": "Needs cudf#16394", + "tests/unit/io/test_lazy_parquet.py::test_scan_parquet_ignores_dtype_mismatch_for_non_projected_columns_19249[False-False]": "Needs some variant of cudf#16394", + "tests/unit/io/test_lazy_parquet.py::test_scan_parquet_ignores_dtype_mismatch_for_non_projected_columns_19249[True-False]": "Needs some variant of cudf#16394", "tests/unit/io/test_lazy_parquet.py::test_parquet_slice_pushdown_non_zero_offset[False]": "Thrift data not handled correctly/slice pushdown wrong?", "tests/unit/io/test_lazy_parquet.py::test_parquet_unaligned_schema_read[False]": "Incomplete handling of projected reads with mismatching schemas, cudf#16394", "tests/unit/io/test_lazy_parquet.py::test_parquet_unaligned_schema_read_dtype_mismatch[False]": "Different exception raised, but correctly raises an exception", "tests/unit/io/test_lazy_parquet.py::test_parquet_unaligned_schema_read_missing_cols_from_first[False]": "Different exception raised, but correctly raises an exception", "tests/unit/io/test_parquet.py::test_read_parquet_only_loads_selected_columns_15098": "Memory usage won't be correct due to GPU", + "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection0-False-none]": "Mismatching column read cudf#16394", + "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection1-False-none]": "Mismatching column read cudf#16394", + "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection0-False-prefiltered]": "Mismatching column read cudf#16394", + "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection1-False-prefiltered]": "Mismatching column read cudf#16394", + "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection0-False-row_groups]": "Mismatching column read cudf#16394", + "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection1-False-row_groups]": "Mismatching column read cudf#16394", + "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection0-False-columns]": "Mismatching column read cudf#16394", + "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection1-False-columns]": "Mismatching column read cudf#16394", + "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection0-True-none]": "Mismatching column read cudf#16394", + "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection1-True-none]": "Mismatching column read cudf#16394", + "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection0-True-prefiltered]": "Mismatching column read 
cudf#16394", + "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection1-True-prefiltered]": "Mismatching column read cudf#16394", + "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection0-True-row_groups]": "Mismatching column read cudf#16394", + "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection1-True-row_groups]": "Mismatching column read cudf#16394", + "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection0-True-columns]": "Mismatching column read cudf#16394", + "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection1-True-columns]": "Mismatching column read cudf#16394", "tests/unit/io/test_scan.py::test_scan[single-csv-async]": "Debug output on stderr doesn't match", "tests/unit/io/test_scan.py::test_scan_with_limit[single-csv-async]": "Debug output on stderr doesn't match", "tests/unit/io/test_scan.py::test_scan_with_filter[single-csv-async]": "Debug output on stderr doesn't match", @@ -107,6 +132,14 @@ def pytest_configure(config: pytest.Config): "tests/unit/operations/aggregation/test_aggregations.py::test_sum_empty_and_null_set": "libcudf sums column of all nulls to null, not zero", "tests/unit/operations/aggregation/test_aggregations.py::test_binary_op_agg_context_no_simplify_expr_12423": "groupby-agg of just literals should not produce collect_list", "tests/unit/operations/aggregation/test_aggregations.py::test_nan_inf_aggregation": "treatment of nans and nulls together is different in libcudf and polars in groupby-agg context", + "tests/unit/operations/arithmetic/test_list_arithmetic.py::test_list_arithmetic_values[func0-func0-none]": "cudf-polars doesn't nullify division by zero", + "tests/unit/operations/arithmetic/test_list_arithmetic.py::test_list_arithmetic_values[func0-func1-none]": "cudf-polars doesn't nullify division by zero", + "tests/unit/operations/arithmetic/test_list_arithmetic.py::test_list_arithmetic_values[func0-func2-none]": "cudf-polars doesn't nullify division by zero", + "tests/unit/operations/arithmetic/test_list_arithmetic.py::test_list_arithmetic_values[func0-func3-none]": "cudf-polars doesn't nullify division by zero", + "tests/unit/operations/arithmetic/test_list_arithmetic.py::test_list_arithmetic_values[func1-func0-none]": "cudf-polars doesn't nullify division by zero", + "tests/unit/operations/arithmetic/test_list_arithmetic.py::test_list_arithmetic_values[func1-func1-none]": "cudf-polars doesn't nullify division by zero", + "tests/unit/operations/arithmetic/test_list_arithmetic.py::test_list_arithmetic_values[func1-func2-none]": "cudf-polars doesn't nullify division by zero", + "tests/unit/operations/arithmetic/test_list_arithmetic.py::test_list_arithmetic_values[func1-func3-none]": "cudf-polars doesn't nullify division by zero", "tests/unit/operations/test_abs.py::test_abs_duration": "Need to raise for unsupported uops on timelike values", "tests/unit/operations/test_group_by.py::test_group_by_mean_by_dtype[input7-expected7-Float32-Float32]": "Mismatching dtypes, needs cudf#15852", "tests/unit/operations/test_group_by.py::test_group_by_mean_by_dtype[input10-expected10-Date-output_dtype10]": "Unsupported groupby-agg for a particular dtype", @@ -124,22 +157,17 @@ def pytest_configure(config: pytest.Config): "tests/unit/operations/test_group_by.py::test_group_by_binary_agg_with_literal": "Incorrect broadcasting of literals in groupby-agg", "tests/unit/operations/test_group_by.py::test_aggregated_scalar_elementwise_15602": "Unsupported boolean function/dtype combination in 
groupby-agg", "tests/unit/operations/test_group_by.py::test_schemas[data1-expr1-expected_select1-expected_gb1]": "Mismatching dtypes, needs cudf#15852", - "tests/unit/operations/test_group_by_dynamic.py::test_group_by_dynamic_by_monday_and_offset_5444": "IR needs to expose groupby-dynamic information", - "tests/unit/operations/test_group_by_dynamic.py::test_group_by_dynamic_label[left-expected0]": "IR needs to expose groupby-dynamic information", - "tests/unit/operations/test_group_by_dynamic.py::test_group_by_dynamic_label[right-expected1]": "IR needs to expose groupby-dynamic information", - "tests/unit/operations/test_group_by_dynamic.py::test_group_by_dynamic_label[datapoint-expected2]": "IR needs to expose groupby-dynamic information", - "tests/unit/operations/test_group_by_dynamic.py::test_rolling_dynamic_sortedness_check": "IR needs to expose groupby-dynamic information", - "tests/unit/operations/test_group_by_dynamic.py::test_group_by_dynamic_validation": "IR needs to expose groupby-dynamic information", - "tests/unit/operations/test_group_by_dynamic.py::test_group_by_dynamic_15225": "IR needs to expose groupby-dynamic information", "tests/unit/operations/test_join.py::test_cross_join_slice_pushdown": "Need to implement slice pushdown for cross joins", "tests/unit/sql/test_cast.py::test_cast_errors[values0-values::uint8-conversion from `f64` to `u64` failed]": "Casting that raises not supported on GPU", "tests/unit/sql/test_cast.py::test_cast_errors[values1-values::uint4-conversion from `i64` to `u32` failed]": "Casting that raises not supported on GPU", "tests/unit/sql/test_cast.py::test_cast_errors[values2-values::int1-conversion from `i64` to `i8` failed]": "Casting that raises not supported on GPU", + "tests/unit/sql/test_cast.py::test_cast_errors[values5-values::int4-conversion from `str` to `i32` failed]": "Cast raises, but error user receives is wrong", "tests/unit/sql/test_miscellaneous.py::test_read_csv": "Incorrect handling of missing_is_null in read_csv", "tests/unit/sql/test_wildcard_opts.py::test_select_wildcard_errors": "Raises correctly but with different exception", "tests/unit/streaming/test_streaming_io.py::test_parquet_eq_statistics": "Debug output on stderr doesn't match", "tests/unit/test_cse.py::test_cse_predicate_self_join": "Debug output on stderr doesn't match", "tests/unit/test_empty.py::test_empty_9137": "Mismatching dtypes, needs cudf#15852", + "tests/unit/test_errors.py::test_error_on_empty_group_by": "Incorrect exception raised", # Maybe flaky, order-dependent? 
"tests/unit/test_projections.py::test_schema_full_outer_join_projection_pd_13287": "Order-specific result check, query is correct but in different order", "tests/unit/test_queries.py::test_group_by_agg_equals_zero_3535": "libcudf sums all nulls to null, not zero", @@ -148,7 +176,7 @@ def pytest_configure(config: pytest.Config): def pytest_collection_modifyitems( session: pytest.Session, config: pytest.Config, items: list[pytest.Item] -): +) -> None: """Mark known failing tests.""" if config.getoption("--cudf-polars-no-fallback"): # Don't xfail tests if running without fallback diff --git a/python/cudf_polars/cudf_polars/typing/__init__.py b/python/cudf_polars/cudf_polars/typing/__init__.py index 240b11bdf59..57c5fdaa7cf 100644 --- a/python/cudf_polars/cudf_polars/typing/__init__.py +++ b/python/cudf_polars/cudf_polars/typing/__init__.py @@ -5,20 +5,32 @@ from __future__ import annotations -from collections.abc import Mapping -from typing import TYPE_CHECKING, Literal, Protocol, Union - -import pylibcudf as plc +from collections.abc import Hashable, Mapping +from typing import TYPE_CHECKING, Any, Literal, Protocol, TypeVar, Union from polars.polars import _expr_nodes as pl_expr, _ir_nodes as pl_ir +import pylibcudf as plc + if TYPE_CHECKING: from collections.abc import Callable from typing import TypeAlias import polars as pl -IR: TypeAlias = Union[ + from cudf_polars.dsl import expr, ir, nodebase + +__all__: list[str] = [ + "PolarsIR", + "PolarsExpr", + "NodeTraverser", + "OptimizationArgs", + "GenericTransformer", + "ExprTransformer", + "IRTransformer", +] + +PolarsIR: TypeAlias = Union[ pl_ir.PythonScan, pl_ir.Scan, pl_ir.Cache, @@ -38,7 +50,7 @@ pl_ir.ExtContext, ] -Expr: TypeAlias = Union[ +PolarsExpr: TypeAlias = Union[ pl_expr.Function, pl_expr.Window, pl_expr.Literal, @@ -68,7 +80,7 @@ def set_node(self, n: int) -> None: """Set the current plan node to n.""" ... - def view_current_node(self) -> IR: + def view_current_node(self) -> PolarsIR: """Convert current plan node to python rep.""" ... @@ -80,7 +92,7 @@ def get_dtype(self, n: int) -> pl.DataType: """Get the datatype of the given expression id.""" ... - def view_expression(self, n: int) -> Expr: + def view_expression(self, n: int) -> PolarsExpr: """Convert the given expression to python rep.""" ... @@ -107,3 +119,29 @@ def set_udf( "cluster_with_columns", "no_optimization", ] + + +U_contra = TypeVar("U_contra", bound=Hashable, contravariant=True) +V_co = TypeVar("V_co", covariant=True) +NodeT = TypeVar("NodeT", bound="nodebase.Node[Any]") + + +class GenericTransformer(Protocol[U_contra, V_co]): + """Abstract protocol for recursive visitors.""" + + def __call__(self, __value: U_contra) -> V_co: + """Apply the visitor to the node.""" + ... + + @property + def state(self) -> Mapping[str, Any]: + """Arbitrary immutable state.""" + ... 
+ + +# Quotes to avoid circular import +ExprTransformer: TypeAlias = GenericTransformer["expr.Expr", "expr.Expr"] +"""Protocol for transformation of Expr nodes.""" + +IRTransformer: TypeAlias = GenericTransformer["ir.IR", "ir.IR"] +"""Protocol for transformation of IR nodes.""" diff --git a/python/cudf_polars/cudf_polars/utils/dtypes.py b/python/cudf_polars/cudf_polars/utils/dtypes.py index 4154a404e98..e7ac72df609 100644 --- a/python/cudf_polars/cudf_polars/utils/dtypes.py +++ b/python/cudf_polars/cudf_polars/utils/dtypes.py @@ -8,12 +8,23 @@ from functools import cache import pyarrow as pa -import pylibcudf as plc from typing_extensions import assert_never import polars as pl -__all__ = ["from_polars", "downcast_arrow_lists", "can_cast"] +from pylibcudf.traits import ( + is_floating_point, + is_integral_not_bool, + is_numeric_not_bool, +) + +__all__ = [ + "from_polars", + "downcast_arrow_lists", + "can_cast", + "is_order_preserving_cast", +] +import pylibcudf as plc def downcast_arrow_lists(typ: pa.DataType) -> pa.DataType: @@ -60,10 +71,60 @@ def can_cast(from_: plc.DataType, to: plc.DataType) -> bool: ------- True if casting is supported, False otherwise """ + has_empty = from_.id() == plc.TypeId.EMPTY or to.id() == plc.TypeId.EMPTY return ( - plc.traits.is_fixed_width(to) - and plc.traits.is_fixed_width(from_) - and plc.unary.is_supported_cast(from_, to) + ( + from_ == to + or not has_empty + and ( + plc.traits.is_fixed_width(to) + and plc.traits.is_fixed_width(from_) + and plc.unary.is_supported_cast(from_, to) + ) + ) + or (from_.id() == plc.TypeId.STRING and is_numeric_not_bool(to)) + or (to.id() == plc.TypeId.STRING and is_numeric_not_bool(from_)) + ) + + +def is_order_preserving_cast(from_: plc.DataType, to: plc.DataType) -> bool: + """ + Determine if a cast would preserve the order of the source data. 
+ + Parameters + ---------- + from_ + Source datatype + to + Target datatype + + Returns + ------- + True if the cast is order-preserving, False otherwise + """ + if from_.id() == to.id(): + return True + + if is_integral_not_bool(from_) and is_integral_not_bool(to): + # True if signedness is the same and the target is larger + if plc.traits.is_unsigned(from_) == plc.traits.is_unsigned(to): + if plc.types.size_of(to) >= plc.types.size_of(from_): + return True + elif (plc.traits.is_unsigned(from_) and not plc.traits.is_unsigned(to)) and ( + plc.types.size_of(to) > plc.types.size_of(from_) + ): + # Unsigned to signed is order preserving if target is large enough + # But signed to unsigned is never order preserving due to negative values + return True + elif ( + is_floating_point(from_) + and is_floating_point(to) + and (plc.types.size_of(to) >= plc.types.size_of(from_)) + ): + # True if the target is larger + return True + return (is_integral_not_bool(from_) and is_floating_point(to)) or ( + is_floating_point(from_) and is_integral_not_bool(to) ) diff --git a/python/cudf_polars/cudf_polars/utils/versions.py b/python/cudf_polars/cudf_polars/utils/versions.py index 4a7ad6b3cf2..b08cede8f7f 100644 --- a/python/cudf_polars/cudf_polars/utils/versions.py +++ b/python/cudf_polars/cudf_polars/utils/versions.py @@ -12,11 +12,14 @@ POLARS_VERSION = parse(__version__) -POLARS_VERSION_LT_18 = POLARS_VERSION < parse("1.8") +POLARS_VERSION_LT_111 = POLARS_VERSION < parse("1.11") +POLARS_VERSION_LT_112 = POLARS_VERSION < parse("1.12") +POLARS_VERSION_GT_112 = POLARS_VERSION > parse("1.12") +POLARS_VERSION_LT_113 = POLARS_VERSION < parse("1.13") def _ensure_polars_version(): - if POLARS_VERSION_LT_18: + if POLARS_VERSION_LT_111: raise ImportError( - "cudf_polars requires py-polars v1.8 or greater." + "cudf_polars requires py-polars v1.11 or greater." ) # pragma: no cover diff --git a/python/cudf_polars/docs/overview.md b/python/cudf_polars/docs/overview.md index 103ac1a674e..2231dd34e35 100644 --- a/python/cudf_polars/docs/overview.md +++ b/python/cudf_polars/docs/overview.md @@ -8,17 +8,20 @@ You will need: preferred configuration. Or else, use [rustup](https://www.rust-lang.org/tools/install) 2. A [cudf development - environment](https://github.com/rapidsai/cudf/blob/branch-24.10/CONTRIBUTING.md#setting-up-your-build-environment). + environment](https://github.com/rapidsai/cudf/blob/branch-24.12/CONTRIBUTING.md#setting-up-your-build-environment). The combined devcontainer works, or whatever your favourite approach is. -> ![NOTE] These instructions will get simpler as we merge code in. +:::{note} +These instructions will get simpler as we merge code in. +::: ## Installing polars -`cudf-polars` works with polars >= 1.3, as long as the internal IR -version doesn't get a major version bump. So `pip install polars>=1.3` -should work. For development, if we're adding things to the polars -side of things, we will need to build polars from source: +The `cudf-polars` `pyproject.toml` advertises which polars versions it +works with. So for pure `cudf-polars` development, installing as +normal and satisfying the dependencies in the repository is +sufficient. For development, if we're adding things to the polars side +of things, we will need to build polars from source: ```sh git clone https://github.com/pola-rs/polars @@ -36,7 +39,9 @@ pip install --upgrade uv uv pip install --upgrade -r py-polars/requirements-dev.txt ``` -> ![NOTE] plain `pip install` works fine, but `uv` is _much_ faster! 
+:::{note}
+plain `pip install` works fine, but `uv` is _much_ faster!
+:::
 
 Now we have the necessary machinery to build polars
 
 ```sh
@@ -83,7 +88,7 @@
 representation (IR). Second, an execution phase which executes using our IR.
 
 The translation phase receives a low-level Rust `NodeTraverser`
-object which delivers Python representations of the plan nodes (and
+object that delivers Python representations of the plan nodes (and
 expressions) one at a time. During translation, we endeavour to raise
 `NotImplementedError` for any unsupported functionality. This way, if
 we can't execute something, we just don't modify the logical plan at
@@ -126,7 +131,6 @@
 arguments, at the moment, `raise_on_fail` is also supported, which
 raises, rather than falling back, during translation:
 
 ```python
-
 result = q.collect(engine=pl.GPUEngine(raise_on_fail=True))
 ```
@@ -144,13 +148,77 @@
 changes. We can therefore attempt to detect the IR version
 appropriately. This should be done during IR translation in
 `translate.py`.
 
-## Adding a handler for a new plan node
+
+# IR design
+
+As noted, we translate the polars DSL into our own IR. This is both so
+that we can smooth out minor version differences (advertised by
+`NodeTraverser` version changes) within `cudf-polars`, and so that we
+have the freedom to introduce new IR nodes and rewrite rules as might
+be appropriate for GPU execution.
+
+To that end, we provide facilities for definition of nodes as well as
+writing traversals and rewrite rules. The abstract base class `Node`
+in `dsl/nodebase.py` defines the interface for implementing new nodes,
+and provides many useful default methods. See also the docstrings of
+the `Node` class.
+
+:::{note}
+This generic implementation relies on nodes being treated as
+*immutable*. Do not implement in-place modification of nodes; bad
+things will happen.
+:::
+
+## Defining nodes
+
+A concrete node type (`cudf-polars` has expression nodes, `Expr`,
+and plan nodes, `IR`) should inherit from `Node`. Nodes have
+two types of data:
+
+1. `children`: a tuple (possibly empty) of concrete nodes;
+2. non-child: arbitrary data attached to the node that is _not_ a
+   concrete node.
+
+The base `Node` class requires that one advertise the names of the
+non-child attributes in the `_non_child` class variable. The
+constructor of the concrete node should take its arguments in the
+order `*_non_child` (ordered as the class variable does) and then
+`*children`. For example, the `Sort` node, which sorts a column
+generated by an expression, has this definition:
+
+```python
+class Expr(Node):
+    children: tuple[Expr, ...]
+
+class Sort(Expr):
+    _non_child = ("dtype", "options")
+    children: tuple[Expr]
+    def __init__(self, dtype, options, column: Expr):
+        self.dtype = dtype
+        self.options = options
+        self.children = (column,)
+```
-Plan node definitions live in `cudf_polars/dsl/ir.py`, these are
-`dataclasses` that inherit from the base `IR` node. The evaluation of
-a plan node is done by implementing the `evaluate` method.
+By following this pattern, we get an automatic (caching)
+implementation of `__hash__` and `__eq__`, as well as a useful
+`reconstruct` method that will rebuild the node with new children.
 
-To translate the plan node, add a case handler in `translate_ir` which
+If you want to control the behaviour of `__hash__` and `__eq__` for a
+single node, override (respectively) the `get_hashable` and `is_equal`
+methods.
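+
+As a rough illustration (a sketch only, reusing the simplified `Sort`
+definition above; `dtype`, `options`, and `col` are placeholders, not
+real values):
+
+```python
+col = ...  # some child expression
+s1 = Sort(dtype, options, col)
+s2 = Sort(dtype, options, col)
+assert s1 == s2              # equality derived from _non_child and children
+assert hash(s1) == hash(s2)  # hashing likewise, and the hash is cached
+s3 = s1.reconstruct([col])   # rebuild the node, supplying (new) children
+```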
+
+## Adding new translation rules from the polars IR
+
+### Plan nodes
+
+Plan node definitions live in `cudf_polars/dsl/ir.py`; these all
+inherit from the base `IR` node. The evaluation of a plan node is done
+by implementing the `do_evaluate` method. This method takes in
+the non-child arguments specified in `_non_child_args`, followed by
+pre-evaluated child nodes (`DataFrame` objects). To perform the
+evaluation, one should use the base class (generic) `evaluate` method
+which handles the recursive evaluation of child nodes.
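+
+As a sketch (names here are invented for illustration, following the
+description above; this is not a real node):
+
+```python
+class MyNode(IR):
+    _non_child = ("schema", "param")
+
+    def __init__(self, schema, param, df):
+        self.schema = schema
+        self.param = param
+        self.children = (df,)
+        # Recorded so that the generic evaluate can call do_evaluate
+        # with these arguments followed by the evaluated children.
+        self._non_child_args = (param,)
+
+    @classmethod
+    def do_evaluate(cls, param, df: DataFrame) -> DataFrame:
+        # df is the pre-evaluated child; return a new DataFrame.
+        ...
+```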
+
+To translate the plan node, add a case handler in `translate_ir` that
lives in `cudf_polars/dsl/translate.py`.
 
 As well as child nodes that are plans, most plan nodes contain child
@@ -163,25 +231,12 @@
 translating a `Join` node, the left keys (expressions) should be
 translated with the left input active (and right keys with right
 input). To facilitate this, use the `set_node` context manager.
 
-## Adding a handler for a new expression node
+### Expression nodes
 
Adding a handler for an expression node is very similar to a plan node.
-Expressions are all defined in `cudf_polars/dsl/expr.py` and inherit
-from `Expr`. Unlike plan nodes, these are not `dataclasses`, since it
-is simpler for us to implement efficient hashing, repr, and equality if we
-can write that ourselves.
-
-Every expression consists of two types of data:
-1. child data (other `Expr`s)
-2. non-child data (anything other than an `Expr`)
-The generic implementations of special methods in the base `Expr` base
-class require that the subclasses advertise which arguments to the
-constructor are non-child in a `_non_child` class slot. The
-constructor should then take arguments:
-```python
-def __init__(self, *non_child_data: Any, *children: Expr):
-```
-Read the docstrings in the `Expr` class for more details.
+Expressions are defined in `cudf_polars/dsl/expressions/` and exported
+into the `dsl` namespace via `expr.py`. They inherit
+from `Expr`.
 
Expressions are evaluated by implementing a `do_evaluate` method that
takes a `DataFrame` as context (this provides columns) along with an
@@ -198,24 +253,142 @@
 To simplify state tracking, all columns should be considered immutable
 on construction. This matches the "functional" description coming from
 the logical plan in any case, so is reasonably natural.
 
+## Traversing and transforming nodes
+
+In addition to representing and evaluating nodes, we also provide
+facilities for traversing a tree of nodes and defining transformation
+rules in `dsl/traversal.py`. The simplest is `traversal`, a
+[pre-order](https://en.wikipedia.org/wiki/Tree_traversal) visit of all
+unique nodes in an expression. Use this if you want to know some
+specific thing about an expression. For example, to determine if an
+expression contains a `Literal` node:
+
+```python
+def has_literal(node: Expr) -> bool:
+    return any(isinstance(e, Literal) for e in traversal(node))
+```
+
+It is often convenient to provide (immutable) state to a visitor, as
+well as some facility to perform DAG-aware rewrites (reusing a
+transformation for an expression if we have already seen it). We
+therefore adopt the following pattern of writing DAG-aware visitors.
+Suppose we want a rewrite rule (`rewrite`) between expressions
+(`Expr`) and some new type `T`. We define our general transformation
+function `rewrite` with type `Expr -> (Expr -> T) -> T`:
+
+```python
+from cudf_polars.typing import GenericTransformer
+
+@singledispatch
+def rewrite(e: Expr, rec: GenericTransformer[Expr, T]) -> T:
+    ...
+```
+
+Note in particular that the function to perform the recursion is
+passed as the second argument. Rather than defining methods on each
+node in turn for a particular rewrite rule, we prefer free functions
+and use `functools.singledispatch` to provide dispatching. We now, in
+the usual fashion, register handlers for different expression types.
+To use this function, we need to be able to provide both the
+expression to convert and the recursive function itself. To do this we
+must convert our `rewrite` function into something that only takes a
+single argument (the expression to rewrite), but carries around
+information about how to perform the recursion. To this end, we have
+two utilities in `traversal.py`:
+
+- `make_recursive` and
+- `CachingVisitor`.
+
+These both implement the `GenericTransformer` protocol, and can be
+wrapped around a transformation function like `rewrite` to provide a
+function `Expr -> T`. They also allow us to attach arbitrary
+*immutable* state to our visitor by passing a `state` dictionary. This
+dictionary can then be inspected by the concrete transformation
+function. `make_recursive` is very simple, and provides no caching of
+intermediate results (so any DAGs that are visited will be viewed as
+trees). `CachingVisitor` provides the same interface, but maintains a
+cache of intermediate results, and reuses them if the same expression
+is seen again.
+
+Finally, for writing transformations that take nodes and deliver new
+nodes (e.g. rewrite rules), we have a utility
+`reuse_if_unchanged` that can be used as a base case transformation
+for node-to-node rewrites. It is a depth-first visit that transforms
+children but only returns a new node with new children if the rewrite
+of children returned new nodes.
+
+To see how these pieces fit together, let us consider writing a
+`rename` function that takes an expression (potentially with
+references to columns) along with a mapping defining a renaming
+between (some subset of) column names. The goal is to deliver a new
+expression with appropriate columns renamed.
+
+To start, we define the dispatch function:
+```python
+from collections.abc import Mapping
+from functools import singledispatch
+from cudf_polars.dsl.traversal import (
+    CachingVisitor, make_recursive, reuse_if_unchanged
+)
+from cudf_polars.dsl.expr import Col, Expr
+from cudf_polars.typing import ExprTransformer
+
+
+@singledispatch
+def _rename(e: Expr, rec: ExprTransformer) -> Expr:
+    raise NotImplementedError(f"No handler for {type(e)}")
+```
+then we register specific handlers, first for columns:
+```python
+@_rename.register
+def _(e: Col, rec: ExprTransformer) -> Expr:
+    mapping = rec.state["mapping"]  # state set on rec
+    if e.name in mapping:
+        # If we have a rename, return a new Col reference
+        # with a new name
+        return type(e)(e.dtype, mapping[e.name])
+    return e
+```
+and then for the remaining expressions:
+```python
+_rename.register(Expr)(reuse_if_unchanged)
+```
+
+:::{note}
+In this case, we could have put the generic handler in the `_rename`
+function; however, then we would not get a nice error message if we
+accidentally sent in an object of the incorrect type.
+:::
+
+Finally, we tie everything together with a public function:
+
+```python
+def rename(e: Expr, mapping: Mapping[str, str]) -> Expr:
+    """Rename column references in an expression."""
+    mapper = CachingVisitor(_rename, state={"mapping": mapping})
+    # or
+    # mapper = make_recursive(_rename, state={"mapping": mapping})
+    return mapper(e)
+```
 
 # Containers
 
 Containers should be constructed as relatively lightweight objects
-around their pylibcudf counterparts. We have four (in
+around their pylibcudf counterparts. We have three (in
 `cudf_polars/containers/`):
 
 1. `Scalar` (a wrapper around a pylibcudf `Scalar`)
 2. `Column` (a wrapper around a pylibcudf `Column`)
-3. `NamedColumn` (a `Column` with an additional name)
-4. `DataFrame` (a wrapper around a pylibcudf `Table`)
+3. `DataFrame` (a wrapper around a pylibcudf `Table`)
 
 The interfaces offered by these are somewhat in flux, but broadly
-speaking, a `DataFrame` is just a list of `NamedColumn`s which each
-hold a `Column` plus a string `name`. `NamedColumn`s are only ever
-constructed via `NamedExpr`s, which are the top-level expression node
-that lives inside an `IR` node. This means that the expression
-evaluator never has to concern itself with column names: columns are
-only ever decorated with names when constructing a `DataFrame`.
+speaking, a `DataFrame` is just a mapping from string `name`s to
+`Column`s, and thus also holds a pylibcudf `Table`. Names are only
+attached to `Column`s and hence inserted into `DataFrames` via
+`NamedExpr`s, which are the top-level expression nodes that live
+inside an `IR` node. This means that the expression evaluator never
+has to concern itself with column names: columns are only ever
+decorated with names when constructing a `DataFrame`.
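+
+Concretely (a sketch mirroring the container tests in this PR, using
+`pylibcudf` column factories to build a tiny column):
+
+```python
+import pylibcudf as plc
+from cudf_polars.containers import Column, DataFrame
+
+# A Column wraps a pylibcudf Column and carries an (optional) name;
+# a DataFrame is then a mapping from those names to Columns.
+col = Column(
+    plc.column_factories.make_numeric_column(
+        plc.DataType(plc.TypeId.INT8), 2, plc.MaskState.ALL_VALID
+    ),
+    name="a",
+)
+df = DataFrame([col])
+assert df.column_map["a"] is col
+```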
 
 The columns keep track of metadata (for example, whether or not they
 are sorted). We could imagine tracking more metadata, like minimum and
@@ -285,12 +458,13 @@
 translate it to our intermediate representation (IR), and then execute
 and convert back to polars:
 
 ```python
-from cudf_polars.dsl.translate import translate_ir
+from cudf_polars.dsl.translate import Translator
+import polars as pl
 
 q = ...
 
 # Convert to our IR
-ir = translate_ir(q._ldf.visit())
+ir = Translator(q._ldf.visit(), pl.GPUEngine()).translate_ir()
 
 # DataFrame living on the device
 result = ir.evaluate(cache={})
diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml
index c041744d4b3..1ce4d7b6867 100644
--- a/python/cudf_polars/pyproject.toml
+++ b/python/cudf_polars/pyproject.toml
@@ -19,8 +19,8 @@
 authors = [
 license = { text = "Apache 2.0" }
 requires-python = ">=3.10"
 dependencies = [
-    "polars>=1.8,<1.9",
-    "pylibcudf==24.10.*",
+    "polars>=1.11,<1.15",
+    "pylibcudf==24.12.*,>=0.0.0a0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 classifiers = [
 "Intended Audience :: Developers",
@@ -35,10 +35,14 @@
 classifiers = [
 [project.optional-dependencies]
 test = [
+    "numpy>=1.23,<3.0a0",
     "pytest-cov",
     "pytest-xdist",
     "pytest<8",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
+experimental = [
+    "rapids-dask-dependency==24.12.*,>=0.0.0a0",
+] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 
 [project.urls]
 Homepage = "https://github.com/rapidsai/cudf"
@@ -49,13 +53,28 @@
 license-files = ["LICENSE"]
 
 [tool.setuptools.dynamic]
 version = {file = "cudf_polars/VERSION"}
 
+[tool.pydistcheck]
+select = [
+    "distro-too-large-compressed",
+]
+
+# PyPI limit is 100 MiB; fail CI before we get too close to that
+max_allowed_size_compressed = '75M'
+
 [tool.pytest.ini_options]
+addopts = "--tb=native --strict-config --strict-markers"
+empty_parameter_set_mark = "fail_at_collect"
+filterwarnings = [
+    "error",
+    # https://github.com/rapidsai/build-planning/issues/116
+    "ignore:.*cuda..* module is deprecated.*:DeprecationWarning",
+]
 xfail_strict = true
 
 [tool.coverage.report]
 exclude_also = [
     "if TYPE_CHECKING:",
-    "class .*\\bProtocol\\):",
+    "class .*\\bProtocol(?:\\[[^]]+\\])?\\):",
     "assert_never\\("
 ]
 # The cudf_polars test suite doesn't exercise the plugin, so we omit
@@ -183,7 +202,7 @@
 required-imports = ["from __future__ import annotations"]
 
 [tool.ruff.lint.isort.sections]
 polars = ["polars"]
-rapids = ["rmm", "cudf"]
+rapids = ["rmm", "pylibcudf"]
 
 [tool.ruff.format]
 docstring-code-format = true
diff --git a/python/cudf_polars/tests/containers/test_column.py b/python/cudf_polars/tests/containers/test_column.py
index 19919877f84..95541b4ecc3 100644
--- a/python/cudf_polars/tests/containers/test_column.py
+++ b/python/cudf_polars/tests/containers/test_column.py
@@ -3,13 +3,12 @@
 
 from __future__ import annotations
 
-from functools import partial
-
 import pyarrow
-import pylibcudf as plc
 import pytest
 
-from cudf_polars.containers import Column, NamedColumn
+import pylibcudf as plc
+
+from cudf_polars.containers import Column
 
 
 def test_non_scalar_access_raises():
@@ -55,11 +54,10 @@
 def test_shallow_copy():
 
 @pytest.mark.parametrize("typeid", [plc.TypeId.INT8, plc.TypeId.FLOAT32])
-@pytest.mark.parametrize("constructor", [Column, partial(NamedColumn, name="name")])
-def test_mask_nans(typeid, constructor):
+def test_mask_nans(typeid):
     dtype = plc.DataType(typeid)
     values = pyarrow.array([0, 0, 0], type=plc.interop.to_arrow(dtype))
-    column = constructor(plc.interop.from_arrow(values))
+    column = Column(plc.interop.from_arrow(values))
     masked = column.mask_nans()
     assert column.obj.null_count() == masked.obj.null_count()
diff --git a/python/cudf_polars/tests/containers/test_dataframe.py b/python/cudf_polars/tests/containers/test_dataframe.py
index 39fb44d55a5..83729371a9c 100644
--- a/python/cudf_polars/tests/containers/test_dataframe.py
+++ b/python/cudf_polars/tests/containers/test_dataframe.py
@@ -3,23 +3,26 @@
 
 from __future__ import annotations
 
-import pylibcudf as plc
+import pyarrow as pa
 import pytest
 
 import polars as pl
+from polars.testing.asserts import assert_frame_equal
+
+import pylibcudf as plc
 
-from cudf_polars.containers import DataFrame, NamedColumn
+from cudf_polars.containers import Column, DataFrame
 from cudf_polars.testing.asserts import assert_gpu_result_equal
 
 
 def test_select_missing_raises():
     df = DataFrame(
         [
-            NamedColumn(
+            Column(
                 plc.column_factories.make_numeric_column(
                     plc.DataType(plc.TypeId.INT8), 2, plc.MaskState.ALL_VALID
                 ),
-                "a",
+                name="a",
             )
         ]
     )
@@ -30,17 +33,17 @@
 def test_replace_missing_raises():
     df = DataFrame(
         [
-            NamedColumn(
+            Column(
                 plc.column_factories.make_numeric_column(
                     plc.DataType(plc.TypeId.INT8), 2, plc.MaskState.ALL_VALID
                 ),
-                "a",
+                name="a",
            )
         ]
     )
-    replacement = df.columns[0].copy(new_name="b")
+    replacement = df.column_map["a"].copy().rename("b")
     with pytest.raises(ValueError):
-
df.replace_columns(replacement) + df.with_columns([replacement], replace_only=True) def test_from_table_wrong_names(): @@ -55,14 +58,23 @@ def test_from_table_wrong_names(): DataFrame.from_table(table, ["a", "b"]) +def test_unnamed_column_raise(): + payload = plc.column_factories.make_numeric_column( + plc.DataType(plc.TypeId.INT8), 0, plc.MaskState.ALL_VALID + ) + + with pytest.raises(ValueError): + DataFrame([Column(payload, name="a"), Column(payload)]) + + def test_sorted_like_raises_mismatching_names(): df = DataFrame( [ - NamedColumn( + Column( plc.column_factories.make_numeric_column( plc.DataType(plc.TypeId.INT8), 2, plc.MaskState.ALL_VALID ), - "a", + name="a", ) ] ) @@ -72,11 +84,11 @@ def test_sorted_like_raises_mismatching_names(): def test_shallow_copy(): - column = NamedColumn( + column = Column( plc.column_factories.make_numeric_column( plc.DataType(plc.TypeId.INT8), 2, plc.MaskState.ALL_VALID ), - "a", + name="a", ) column.set_sorted( is_sorted=plc.types.Sorted.YES, @@ -85,13 +97,13 @@ def test_shallow_copy(): ) df = DataFrame([column]) copy = df.copy() - copy.columns[0].set_sorted( + copy.column_map["a"].set_sorted( is_sorted=plc.types.Sorted.NO, order=plc.types.Order.ASCENDING, null_order=plc.types.NullOrder.AFTER, ) - assert df.columns[0].is_sorted == plc.types.Sorted.YES - assert copy.columns[0].is_sorted == plc.types.Sorted.NO + assert df.column_map["a"].is_sorted == plc.types.Sorted.YES + assert copy.column_map["a"].is_sorted == plc.types.Sorted.NO def test_sorted_flags_preserved_empty(): @@ -100,7 +112,7 @@ def test_sorted_flags_preserved_empty(): gf = DataFrame.from_polars(df) - (a,) = gf.columns + a = gf.column_map["a"] assert a.is_sorted == plc.types.Sorted.YES @@ -151,3 +163,24 @@ def test_empty_name_roundtrips_overlap(): def test_empty_name_roundtrips_no_overlap(): df = pl.LazyFrame({"": [1, 2, 3], "b": [4, 5, 6]}) assert_gpu_result_equal(df) + + +@pytest.mark.parametrize( + "arrow_tbl", + [ + pa.table([]), + pa.table({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}), + pa.table({"a": [1, 2, 3]}), + pa.table({"a": [1], "b": [2], "c": [3]}), + pa.table({"a": ["a", "bb", "ccc"]}), + pa.table({"a": [1, 2, None], "b": [None, 3, 4]}), + ], +) +def test_serialization_roundtrip(arrow_tbl): + plc_tbl = plc.interop.from_arrow(arrow_tbl) + df = DataFrame.from_table(plc_tbl, names=arrow_tbl.column_names) + + header, frames = df.serialize() + res = DataFrame.deserialize(header, frames) + + assert_frame_equal(df.to_polars(), res.to_polars()) diff --git a/python/cudf_polars/tests/dsl/test_expr.py b/python/cudf_polars/tests/dsl/test_expr.py index b7d4672daca..de8fec301fe 100644 --- a/python/cudf_polars/tests/dsl/test_expr.py +++ b/python/cudf_polars/tests/dsl/test_expr.py @@ -3,9 +3,10 @@ from __future__ import annotations -import pylibcudf as plc import pytest +import pylibcudf as plc + from cudf_polars.dsl import expr @@ -73,3 +74,24 @@ def test_namedexpr_repr_stable(): b2 = expr.NamedExpr("b1", expr.Col(plc.DataType(plc.TypeId.INT8), "a")) assert repr(b1) == repr(b2) + + +def test_equality_cse(): + dt = plc.DataType(plc.TypeId.INT8) + + def make_expr(n1, n2): + a = expr.Col(plc.DataType(plc.TypeId.INT8), n1) + b = expr.Col(plc.DataType(plc.TypeId.INT8), n2) + + return expr.BinOp(dt, plc.binaryop.BinaryOperator.ADD, a, b) + + e1 = make_expr("a", "b") + e2 = make_expr("a", "b") + e3 = make_expr("a", "c") + + assert e1.children is not e2.children + assert e1 == e2 + assert e1.children is e2.children + assert e1 == e2 + assert e1 != e3 + assert e2 != e3 diff --git 
a/python/cudf_polars/tests/dsl/test_to_ast.py b/python/cudf_polars/tests/dsl/test_to_ast.py new file mode 100644 index 00000000000..60ff7a655e6 --- /dev/null +++ b/python/cudf_polars/tests/dsl/test_to_ast.py @@ -0,0 +1,113 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +import pyarrow as pa +import pytest + +import polars as pl +from polars.testing import assert_frame_equal + +import pylibcudf as plc + +import cudf_polars.dsl.expr as expr_nodes +import cudf_polars.dsl.ir as ir_nodes +from cudf_polars import Translator +from cudf_polars.containers.dataframe import DataFrame, NamedColumn +from cudf_polars.dsl.to_ast import insert_colrefs, to_ast, to_parquet_filter + + +@pytest.fixture(scope="module") +def df(): + return pl.LazyFrame( + { + "c": ["a", "b", "c", "d", "e", "f"], + "a": [1, 2, 3, None, 4, 5], + "b": pl.Series([None, None, 3, float("inf"), 4, 0], dtype=pl.Float64), + "d": [False, True, True, None, False, False], + } + ) + + +@pytest.mark.parametrize( + "expr", + [ + pl.col("a").is_in([0, 1]), + pl.col("a").is_between(0, 2), + (pl.col("a") < pl.col("b")).not_(), + pl.lit(2) > pl.col("a"), + pl.lit(2) >= pl.col("a"), + pl.lit(2) < pl.col("a"), + pl.lit(2) <= pl.col("a"), + pl.lit(0) == pl.col("a"), + pl.lit(1) != pl.col("a"), + (pl.col("b") < pl.lit(2, dtype=pl.Float64).sqrt()), + (pl.col("a") >= pl.lit(2)) & (pl.col("b") > 0), + pl.col("a").is_null(), + pl.col("a").is_not_null(), + pl.col("b").is_finite(), + pytest.param( + pl.col("a").sin(), + marks=pytest.mark.xfail(reason="Need to insert explicit casts"), + ), + pl.col("b").cos(), + pl.col("a").abs().is_between(0, 2), + pl.col("a").ne_missing(pl.lit(None, dtype=pl.Int64)), + [pl.col("a") * 2, pl.col("b") + pl.col("a")], + pl.col("d").not_(), + ], +) +def test_compute_column(expr, df): + q = df.select(expr) + ir = Translator(q._ldf.visit(), pl.GPUEngine()).translate_ir() + + assert isinstance(ir, ir_nodes.Select) + table = ir.children[0].evaluate(cache={}) + name_to_index = {c.name: i for i, c in enumerate(table.columns)} + + def compute_column(e): + e_with_colrefs = insert_colrefs( + e.value, + table_ref=plc.expressions.TableReference.LEFT, + name_to_index=name_to_index, + ) + with pytest.raises(NotImplementedError): + e_with_colrefs.evaluate(table) + ast = to_ast(e_with_colrefs) + if ast is not None: + return NamedColumn( + plc.transform.compute_column(table.table, ast), name=e.name + ) + return e.evaluate(table) + + got = DataFrame(map(compute_column, ir.exprs)).to_polars() + + expect = q.collect() + + assert_frame_equal(expect, got) + + +def test_invalid_colref_construction_raises(): + literal = expr_nodes.Literal( + plc.DataType(plc.TypeId.INT8), pa.scalar(1, type=pa.int8()) + ) + with pytest.raises(TypeError): + expr_nodes.ColRef( + literal.dtype, 0, plc.expressions.TableReference.LEFT, literal + ) + + +def test_to_ast_without_colref_raises(): + col = expr_nodes.Col(plc.DataType(plc.TypeId.INT8), "a") + + with pytest.raises(TypeError): + to_ast(col) + + +def test_to_parquet_filter_with_colref_raises(): + col = expr_nodes.Col(plc.DataType(plc.TypeId.INT8), "a") + colref = expr_nodes.ColRef(col.dtype, 0, plc.expressions.TableReference.LEFT, col) + + with pytest.raises(TypeError): + to_parquet_filter(colref) diff --git a/python/cudf_polars/tests/dsl/test_traversal.py b/python/cudf_polars/tests/dsl/test_traversal.py new file mode 100644 index 00000000000..2f4df9289f8 --- /dev/null +++ 
b/python/cudf_polars/tests/dsl/test_traversal.py @@ -0,0 +1,229 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +from functools import singledispatch + +import polars as pl +from polars.testing import assert_frame_equal + +import pylibcudf as plc + +from cudf_polars import Translator +from cudf_polars.dsl import expr, ir +from cudf_polars.dsl.traversal import ( + CachingVisitor, + make_recursive, + reuse_if_unchanged, + traversal, +) +from cudf_polars.typing import ExprTransformer, IRTransformer + + +def make_expr(dt, n1, n2): + a1 = expr.Col(dt, n1) + a2 = expr.Col(dt, n2) + + return expr.BinOp(dt, plc.binaryop.BinaryOperator.MUL, a1, a2) + + +def test_traversal_unique(): + dt = plc.DataType(plc.TypeId.INT8) + + e1 = make_expr(dt, "a", "a") + unique_exprs = list(traversal(e1)) + + assert len(unique_exprs) == 2 + assert set(unique_exprs) == {expr.Col(dt, "a"), e1} + assert unique_exprs == [e1, expr.Col(dt, "a")] + + e2 = make_expr(dt, "a", "b") + unique_exprs = list(traversal(e2)) + + assert len(unique_exprs) == 3 + assert set(unique_exprs) == {expr.Col(dt, "a"), expr.Col(dt, "b"), e2} + assert unique_exprs == [e2, expr.Col(dt, "a"), expr.Col(dt, "b")] + + e3 = make_expr(dt, "b", "a") + unique_exprs = list(traversal(e3)) + + assert len(unique_exprs) == 3 + assert set(unique_exprs) == {expr.Col(dt, "a"), expr.Col(dt, "b"), e3} + assert unique_exprs == [e3, expr.Col(dt, "b"), expr.Col(dt, "a")] + + +def rename(e, rec): + mapping = rec.state["mapping"] + if isinstance(e, expr.Col) and e.name in mapping: + return type(e)(e.dtype, mapping[e.name]) + return reuse_if_unchanged(e, rec) + + +def test_caching_visitor(): + dt = plc.DataType(plc.TypeId.INT8) + + e1 = make_expr(dt, "a", "b") + + mapper = CachingVisitor(rename, state={"mapping": {"b": "c"}}) + + renamed = mapper(e1) + assert renamed == make_expr(dt, "a", "c") + assert len(mapper.cache) == 3 + + e2 = make_expr(dt, "a", "a") + mapper = CachingVisitor(rename, state={"mapping": {"b": "c"}}) + + renamed = mapper(e2) + assert renamed == make_expr(dt, "a", "a") + assert len(mapper.cache) == 2 + mapper = CachingVisitor(rename, state={"mapping": {"a": "c"}}) + + renamed = mapper(e2) + assert renamed == make_expr(dt, "c", "c") + assert len(mapper.cache) == 2 + + +def test_noop_visitor(): + dt = plc.DataType(plc.TypeId.INT8) + + e1 = make_expr(dt, "a", "b") + + mapper = make_recursive(rename, state={"mapping": {"b": "c"}}) + + renamed = mapper(e1) + assert renamed == make_expr(dt, "a", "c") + + e2 = make_expr(dt, "a", "a") + mapper = make_recursive(rename, state={"mapping": {"b": "c"}}) + + renamed = mapper(e2) + assert renamed == make_expr(dt, "a", "a") + mapper = make_recursive(rename, state={"mapping": {"a": "c"}}) + + renamed = mapper(e2) + assert renamed == make_expr(dt, "c", "c") + + +def test_rewrite_ir_node(): + df = pl.LazyFrame({"a": [1, 2, 1], "b": [1, 3, 4]}) + q = df.group_by("a").agg(pl.col("b").sum()).sort("b") + + orig = Translator(q._ldf.visit(), pl.GPUEngine()).translate_ir() + + new_df = pl.DataFrame({"a": [1, 1, 2], "b": [-1, -2, -4]}) + + def replace_df(node, rec): + if isinstance(node, ir.DataFrameScan): + return ir.DataFrameScan( + node.schema, new_df._df, node.projection, node.predicate + ) + return reuse_if_unchanged(node, rec) + + mapper = CachingVisitor(replace_df) + + new = mapper(orig) + + result = new.evaluate(cache={}).to_polars() + + expect = pl.DataFrame({"a": [2, 1], "b": [-4, -3]}) + + 
assert_frame_equal(result, expect) + + +def test_rewrite_scan_node(tmp_path): + left = pl.LazyFrame({"a": [1, 2, 3], "b": [1, 3, 4]}) + right = pl.DataFrame({"a": [1, 4, 2], "c": [1, 2, 3]}) + + right.write_parquet(tmp_path / "right.pq") + + right_s = pl.scan_parquet(tmp_path / "right.pq") + + q = left.join(right_s, on="a", how="inner") + + def replace_scan(node, rec): + if isinstance(node, ir.Scan): + return ir.DataFrameScan( + node.schema, right._df, node.with_columns, node.predicate + ) + return reuse_if_unchanged(node, rec) + + mapper = CachingVisitor(replace_scan) + + orig = Translator(q._ldf.visit(), pl.GPUEngine()).translate_ir() + new = mapper(orig) + + result = new.evaluate(cache={}).to_polars() + + expect = q.collect() + + assert_frame_equal(result, expect, check_row_order=False) + + +def test_rewrite_names_and_ops(): + df = pl.LazyFrame({"a": [1, 2, 3], "b": [3, 4, 5], "c": [5, 6, 7], "d": [7, 9, 8]}) + + q = df.select(pl.col("a") - (pl.col("b") + pl.col("c") * 2), pl.col("d")).sort("d") + + # We will replace a -> d, c -> d, and addition with multiplication + expect = ( + df.select( + (pl.col("d") - (pl.col("b") * pl.col("d") * 2)).alias("a"), pl.col("d") + ) + .sort("d") + .collect() + ) + + qir = Translator(q._ldf.visit(), pl.GPUEngine()).translate_ir() + + @singledispatch + def _transform(e: expr.Expr, fn: ExprTransformer) -> expr.Expr: + raise NotImplementedError("Unhandled") + + @_transform.register + def _(e: expr.Col, fn: ExprTransformer): + mapping = fn.state["mapping"] + if e.name in mapping: + return type(e)(e.dtype, mapping[e.name]) + return e + + @_transform.register + def _(e: expr.BinOp, fn: ExprTransformer): + if e.op == plc.binaryop.BinaryOperator.ADD: + return type(e)( + e.dtype, plc.binaryop.BinaryOperator.MUL, *map(fn, e.children) + ) + return reuse_if_unchanged(e, fn) + + _transform.register(expr.Expr)(reuse_if_unchanged) + + @singledispatch + def _rewrite(node: ir.IR, fn: IRTransformer) -> ir.IR: + raise NotImplementedError("Unhandled") + + @_rewrite.register + def _(node: ir.Select, fn: IRTransformer): + expr_mapper = fn.state["expr_mapper"] + return type(node)( + node.schema, + [expr.NamedExpr(e.name, expr_mapper(e.value)) for e in node.exprs], + node.should_broadcast, + fn(node.children[0]), + ) + + _rewrite.register(ir.IR)(reuse_if_unchanged) + + rewriter = CachingVisitor( + _rewrite, + state={ + "expr_mapper": CachingVisitor( + _transform, state={"mapping": {"a": "d", "c": "d"}} + ) + }, + ) + + new_ir = rewriter(qir) + + got = new_ir.evaluate(cache={}).to_polars() + + assert_frame_equal(expect, got) diff --git a/python/cudf_polars/tests/experimental/__init__.py b/python/cudf_polars/tests/experimental/__init__.py new file mode 100644 index 00000000000..db71916f3c4 --- /dev/null +++ b/python/cudf_polars/tests/experimental/__init__.py @@ -0,0 +1,8 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 + +"""Testing experimental features.""" + +from __future__ import annotations + +__all__: list[str] = [] diff --git a/python/cudf_polars/tests/experimental/test_dask_serialize.py b/python/cudf_polars/tests/experimental/test_dask_serialize.py new file mode 100644 index 00000000000..e556b7e4445 --- /dev/null +++ b/python/cudf_polars/tests/experimental/test_dask_serialize.py @@ -0,0 +1,40 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +import pyarrow as pa +import pytest +from distributed.protocol import deserialize, serialize + +from polars.testing.asserts import assert_frame_equal + +import pylibcudf as plc + +from cudf_polars.containers import DataFrame +from cudf_polars.experimental.dask_serialize import register + +# Must register serializers before running tests +register() + + +@pytest.mark.parametrize( + "arrow_tbl", + [ + pa.table([]), + pa.table({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}), + pa.table({"a": [1, 2, 3]}), + pa.table({"a": [1], "b": [2], "c": [3]}), + pa.table({"a": ["a", "bb", "ccc"]}), + pa.table({"a": [1, 2, None], "b": [None, 3, 4]}), + ], +) +@pytest.mark.parametrize("protocol", ["cuda", "dask"]) +def test_dask_serialization_roundtrip(arrow_tbl, protocol): + plc_tbl = plc.interop.from_arrow(arrow_tbl) + df = DataFrame.from_table(plc_tbl, names=arrow_tbl.column_names) + + header, frames = serialize(df, on_error="raise", serializers=[protocol]) + res = deserialize(header, frames, deserializers=[protocol]) + + assert_frame_equal(df.to_polars(), res.to_polars()) diff --git a/python/cudf_polars/tests/expressions/test_agg.py b/python/cudf_polars/tests/expressions/test_agg.py index 56055f4c6c2..86cb2352dcc 100644 --- a/python/cudf_polars/tests/expressions/test_agg.py +++ b/python/cudf_polars/tests/expressions/test_agg.py @@ -93,10 +93,10 @@ def test_bool_agg(agg, request): expr = getattr(pl.col("a"), agg)() q = df.select(expr) - assert_gpu_result_equal(q) + assert_gpu_result_equal(q, check_exact=False) -@pytest.mark.parametrize("cum_agg", expr.UnaryFunction._supported_cum_aggs) +@pytest.mark.parametrize("cum_agg", sorted(expr.UnaryFunction._supported_cum_aggs)) def test_cum_agg_reverse_unsupported(cum_agg): df = pl.LazyFrame({"a": [1, 2, 3]}) expr = getattr(pl.col("a"), cum_agg)(reverse=True) diff --git a/python/cudf_polars/tests/expressions/test_casting.py b/python/cudf_polars/tests/expressions/test_casting.py index 3e003054338..0722a0f198a 100644 --- a/python/cudf_polars/tests/expressions/test_casting.py +++ b/python/cudf_polars/tests/expressions/test_casting.py @@ -14,7 +14,7 @@ _supported_dtypes = [(pl.Int8(), pl.Int64())] _unsupported_dtypes = [ - (pl.String(), pl.Int64()), + (pl.Datetime("ns"), pl.Int64()), ] diff --git a/python/cudf_polars/tests/expressions/test_literal.py b/python/cudf_polars/tests/expressions/test_literal.py index ced49bdc254..52bc4a9ac71 100644 --- a/python/cudf_polars/tests/expressions/test_literal.py +++ b/python/cudf_polars/tests/expressions/test_literal.py @@ -2,11 +2,12 @@ # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations -import pylibcudf as plc import pytest import polars as pl +import pylibcudf as plc + from cudf_polars.testing.asserts import ( assert_gpu_result_equal, assert_ir_translation_raises, diff --git a/python/cudf_polars/tests/expressions/test_numeric_binops.py b/python/cudf_polars/tests/expressions/test_numeric_binops.py index 8f68bbc460c..fa1ec3c19e4 100644 --- a/python/cudf_polars/tests/expressions/test_numeric_binops.py +++ b/python/cudf_polars/tests/expressions/test_numeric_binops.py @@ -8,7 +8,6 @@ from cudf_polars.testing.asserts import ( assert_gpu_result_equal, - assert_ir_translation_raises, ) dtypes = [ @@ -114,12 +113,3 @@ def test_binop_with_scalar(left_scalar, right_scalar): q = df.select(lop / rop) assert_gpu_result_equal(q) - - -def test_numeric_to_string_cast_fails(): - df = pl.DataFrame( - {"a": [1, 1, 2, 3, 3, 4, 1], "b": [None, 2, 
3, 4, 5, 6, 7]} - ).lazy() - q = df.select(pl.col("a").cast(pl.String)) - - assert_ir_translation_raises(q, NotImplementedError) diff --git a/python/cudf_polars/tests/expressions/test_sort.py b/python/cudf_polars/tests/expressions/test_sort.py index 76c7648813a..dd080f41483 100644 --- a/python/cudf_polars/tests/expressions/test_sort.py +++ b/python/cudf_polars/tests/expressions/test_sort.py @@ -4,12 +4,13 @@ import itertools -import pylibcudf as plc import pytest import polars as pl -from cudf_polars import translate_ir +import pylibcudf as plc + +from cudf_polars import Translator from cudf_polars.testing.asserts import assert_gpu_result_equal @@ -67,9 +68,9 @@ def test_setsorted(descending, nulls_last, with_nulls): assert_gpu_result_equal(q) - df = translate_ir(q._ldf.visit()).evaluate(cache={}) + df = Translator(q._ldf.visit(), pl.GPUEngine()).translate_ir().evaluate(cache={}) - (a,) = df.columns + a = df.column_map["a"] assert a.is_sorted == plc.types.Sorted.YES null_order = ( diff --git a/python/cudf_polars/tests/expressions/test_stringfunction.py b/python/cudf_polars/tests/expressions/test_stringfunction.py index 4f6850ac977..8d7d970eb07 100644 --- a/python/cudf_polars/tests/expressions/test_stringfunction.py +++ b/python/cudf_polars/tests/expressions/test_stringfunction.py @@ -40,6 +40,79 @@ def ldf(with_nulls): ) +@pytest.fixture(params=[pl.Int8, pl.Int16, pl.Int32, pl.Int64]) +def integer_type(request): + return request.param + + +@pytest.fixture(params=[pl.Float32, pl.Float64]) +def floating_type(request): + return request.param + + +@pytest.fixture(params=[pl.Int8, pl.Int16, pl.Int32, pl.Int64, pl.Float32, pl.Float64]) +def numeric_type(request): + return request.param + + +@pytest.fixture +def str_to_integer_data(with_nulls): + a = ["1", "2", "3", "4", "5", "6"] + if with_nulls: + a[4] = None + return pl.LazyFrame({"a": a}) + + +@pytest.fixture +def str_to_float_data(with_nulls): + a = [ + "1.1", + "2.2", + "3.3", + "4.4", + "5.5", + "6.6", + "inf", + "+inf", + "-inf", + "Inf", + "-Inf", + "nan", + "-1.234", + "2e2", + ] + if with_nulls: + a[4] = None + return pl.LazyFrame({"a": a}) + + +@pytest.fixture +def str_from_integer_data(with_nulls, integer_type): + a = [1, 2, 3, 4, 5, 6] + if with_nulls: + a[4] = None + return pl.LazyFrame({"a": pl.Series(a, dtype=integer_type)}) + + +@pytest.fixture +def str_from_float_data(with_nulls, floating_type): + a = [ + 1.1, + 2.2, + 3.3, + 4.4, + 5.5, + 6.6, + float("inf"), + float("+inf"), + float("-inf"), + float("nan"), + ] + if with_nulls: + a[4] = None + return pl.LazyFrame({"a": pl.Series(a, dtype=floating_type)}) + + slice_cases = [ (1, 3), (0, 3), @@ -337,3 +410,47 @@ def test_unsupported_regex_raises(pattern): q = df.select(pl.col("a").str.contains(pattern, strict=True)) assert_ir_translation_raises(q, NotImplementedError) + + +def test_string_to_integer(str_to_integer_data, integer_type): + query = str_to_integer_data.select(pl.col("a").cast(integer_type)) + assert_gpu_result_equal(query) + + +def test_string_from_integer(str_from_integer_data): + query = str_from_integer_data.select(pl.col("a").cast(pl.String)) + assert_gpu_result_equal(query) + + +def test_string_to_float(str_to_float_data, floating_type): + query = str_to_float_data.select(pl.col("a").cast(floating_type)) + assert_gpu_result_equal(query) + + +def test_string_from_float(request, str_from_float_data): + if str_from_float_data.collect_schema()["a"] == pl.Float32: + # libcudf will return a string representing the precision out to + # a certain number of hardcoded 
decimal places. This results in + # the fractional part being thrown away which causes discrepancies + # for certain numbers. For instance, the float32 representation of + # 1.1 is 1.100000023841858. When cast to a string, this will become + # 1.100000024. But the float64 representation of 1.1 is + # 1.1000000000000000888 which will result in libcudf truncating the + # final value to 1.1. + request.applymarker(pytest.mark.xfail(reason="libcudf truncation")) + query = str_from_float_data.select(pl.col("a").cast(pl.String)) + + # libcudf reads float('inf') -> "inf" + # but polars reads float('inf') -> "Inf" + query = query.select(pl.col("a").str.to_lowercase()) + assert_gpu_result_equal(query) + + +def test_string_to_numeric_invalid(numeric_type): + df = pl.LazyFrame({"a": ["a", "b", "c"]}) + q = df.select(pl.col("a").cast(numeric_type)) + assert_collect_raises( + q, + polars_except=pl.exceptions.InvalidOperationError, + cudf_except=pl.exceptions.ComputeError, + ) diff --git a/python/cudf_polars/tests/test_config.py b/python/cudf_polars/tests/test_config.py index 3c3986be19b..52c5c9894fe 100644 --- a/python/cudf_polars/tests/test_config.py +++ b/python/cudf_polars/tests/test_config.py @@ -10,7 +10,8 @@ import rmm -from cudf_polars.dsl.ir import IR +from cudf_polars.callback import default_memory_resource +from cudf_polars.dsl.ir import DataFrameScan from cudf_polars.testing.asserts import ( assert_gpu_result_equal, assert_ir_translation_raises, @@ -18,10 +19,10 @@ def test_polars_verbose_warns(monkeypatch): - def raise_unimplemented(self): + def raise_unimplemented(self, *args): raise NotImplementedError("We don't support this") - monkeypatch.setattr(IR, "__post_init__", raise_unimplemented) + monkeypatch.setattr(DataFrameScan, "__init__", raise_unimplemented) q = pl.LazyFrame({}) # Ensure that things raise assert_ir_translation_raises(q, NotImplementedError) @@ -30,7 +31,7 @@ def raise_unimplemented(self): pytest.raises(pl.exceptions.ComputeError), pytest.warns( pl.exceptions.PerformanceWarning, - match="Query execution with GPU not supported", + match="Query execution with GPU not possible", ), ): # And ensure that collecting issues the correct warning. 
@@ -58,6 +59,25 @@ def test_invalid_memory_resource_raises(mr):
         q.collect(engine=pl.GPUEngine(memory_resource=mr))
 
 
+@pytest.mark.parametrize("enable_managed_memory", ["1", "0"])
+def test_cudf_polars_enable_disable_managed_memory(monkeypatch, enable_managed_memory):
+    q = pl.LazyFrame({"a": [1, 2, 3]})
+
+    with monkeypatch.context() as monkeycontext:
+        monkeycontext.setenv(
+            "POLARS_GPU_ENABLE_CUDA_MANAGED_MEMORY", enable_managed_memory
+        )
+        result = q.collect(engine=pl.GPUEngine())
+        mr = default_memory_resource(0, bool(enable_managed_memory == "1"))
+        if enable_managed_memory == "1":
+            assert isinstance(mr, rmm.mr.PrefetchResourceAdaptor)
+            assert isinstance(mr.upstream_mr, rmm.mr.PoolMemoryResource)
+        else:
+            assert isinstance(mr, rmm.mr.CudaAsyncMemoryResource)
+        monkeycontext.delenv("POLARS_GPU_ENABLE_CUDA_MANAGED_MEMORY")
+    assert_frame_equal(q.collect(), result)
+
+
 def test_explicit_device_zero():
     q = pl.LazyFrame({"a": [1, 2, 3]})
diff --git a/python/cudf_polars/tests/test_groupby.py b/python/cudf_polars/tests/test_groupby.py
index 74bf8b9e4e2..1e8246496cd 100644
--- a/python/cudf_polars/tests/test_groupby.py
+++ b/python/cudf_polars/tests/test_groupby.py
@@ -4,6 +4,7 @@
 
 import itertools
 
+import numpy as np
 import pytest
 
 import polars as pl
@@ -191,3 +192,24 @@ def test_groupby_literal_in_agg(df, key, expr):
 def test_groupby_unary_non_pointwise_raises(df, expr):
     q = df.group_by("key1").agg(expr)
     assert_ir_translation_raises(q, NotImplementedError)
+
+
+@pytest.mark.parametrize("nrows", [30, 300, 300_000])
+@pytest.mark.parametrize("nkeys", [1, 2, 4])
+def test_groupby_maintain_order_random(nrows, nkeys, with_nulls):
+    key_names = [f"key{key}" for key in range(nkeys)]
+    key_values = [np.random.randint(100, size=nrows) for _ in key_names]
+    value = np.random.randint(-100, 100, size=nrows)
+    df = pl.DataFrame(dict(zip(key_names, key_values, strict=True), value=value))
+    if with_nulls:
+        df = df.with_columns(
+            *(
+                pl.when(pl.col(name) == 1)
+                .then(None)
+                .otherwise(pl.col(name))
+                .alias(name)
+                for name in key_names
+            )
+        )
+    q = df.lazy().group_by(key_names, maintain_order=True).agg(pl.col("value").sum())
+    assert_gpu_result_equal(q)
diff --git a/python/cudf_polars/tests/test_join.py b/python/cudf_polars/tests/test_join.py
index 7d9ec98db97..2fcbbf21f1c 100644
--- a/python/cudf_polars/tests/test_join.py
+++ b/python/cudf_polars/tests/test_join.py
@@ -2,14 +2,18 @@
 # SPDX-License-Identifier: Apache-2.0
 from __future__ import annotations
 
+from contextlib import nullcontext
+
 import pytest
 
 import polars as pl
+from polars.testing import assert_frame_equal
 
 from cudf_polars.testing.asserts import (
     assert_gpu_result_equal,
     assert_ir_translation_raises,
 )
+from cudf_polars.utils.versions import POLARS_VERSION_LT_112, POLARS_VERSION_LT_113
 
 
 @pytest.fixture(params=[False, True], ids=["nulls_not_equal", "nulls_equal"])
@@ -22,6 +26,11 @@ def how(request):
     return request.param
 
 
+@pytest.fixture(params=[None, (1, 5), (1, None), (0, 2), (0, None)])
+def zlice(request):
+    return request.param
+
+
 @pytest.fixture
 def left():
     return pl.LazyFrame(
@@ -37,8 +46,9 @@ def left():
 def right():
     return pl.LazyFrame(
         {
-            "a": [1, 4, 3, 7, None, None],
-            "c": [2, 3, 4, 5, 6, 7],
+            "a": [1, 4, 3, 7, None, None, 1],
+            "c": [2, 3, 4, 5, 6, 7, 8],
+            "d": [6, None, 7, 8, -1, 2, 4],
         }
     )
@@ -70,19 +80,72 @@ def test_coalesce_join(left, right, how, join_nulls, join_expr):
    query = left.join(
        right, on=join_expr, how=how, join_nulls=join_nulls, coalesce=True
    )
-    assert_gpu_result_equal(query,
check_row_order=False) + assert_gpu_result_equal(query, check_row_order=how == "left") -def test_cross_join(left, right): +def test_left_join_with_slice(left, right, join_nulls, zlice): + q = left.join(right, on="a", how="left", join_nulls=join_nulls, coalesce=True) + ctx = nullcontext() + if zlice is not None: + q_expect = q.collect().slice(*zlice) + q = q.slice(*zlice) + if POLARS_VERSION_LT_112 and (zlice == (1, 5) or zlice == (0, 2)): + # https://github.com/pola-rs/polars/issues/19403 + # https://github.com/pola-rs/polars/issues/19405 + ctx = pytest.raises(AssertionError) + assert_frame_equal( + q_expect, q.collect(engine=pl.GPUEngine(raise_on_fail=True)) + ) + + with ctx: + assert_gpu_result_equal(q) + + +def test_cross_join(left, right, zlice): q = left.join(right, how="cross") + if zlice is not None: + q = q.slice(*zlice) assert_gpu_result_equal(q) @pytest.mark.parametrize( - "left_on,right_on", [(pl.col("a"), pl.lit(2)), (pl.lit(2), pl.col("a"))] + "left_on,right_on", + [ + (pl.col("a"), pl.lit(2, dtype=pl.Int64)), + (pl.lit(2, dtype=pl.Int64), pl.col("a")), + ], ) def test_join_literal_key_unsupported(left, right, left_on, right_on): q = left.join(right, left_on=left_on, right_on=right_on, how="inner") assert_ir_translation_raises(q, NotImplementedError) + + +@pytest.mark.parametrize( + "conditions", + [ + [pl.col("a") < pl.col("a_right")], + [pl.col("a_right") <= pl.col("a") * 2], + [pl.col("b") * 2 > pl.col("a_right"), pl.col("a") == pl.col("c_right")], + [pl.col("b") * 2 <= pl.col("a_right"), pl.col("a") < pl.col("c_right")], + pytest.param( + [pl.col("b") <= pl.col("a_right") * 7, pl.col("a") < pl.col("d") * 2], + marks=pytest.mark.xfail( + POLARS_VERSION_LT_113, + reason="https://github.com/pola-rs/polars/issues/19597", + ), + ), + ], +) +def test_join_where(left, right, conditions, zlice): + q = left.join_where(right, *conditions) + + assert_gpu_result_equal(q, check_row_order=False) + + if zlice is not None: + q_len = q.slice(*zlice).select(pl.len()) + # Can't compare result, since row order is not guaranteed and + # therefore we only check the length + + assert_gpu_result_equal(q_len) diff --git a/python/cudf_polars/tests/test_mapfunction.py b/python/cudf_polars/tests/test_mapfunction.py index e895f27f637..63aa1c573a9 100644 --- a/python/cudf_polars/tests/test_mapfunction.py +++ b/python/cudf_polars/tests/test_mapfunction.py @@ -93,16 +93,3 @@ def test_unpivot_defaults(): ) q = df.unpivot(index="d") assert_gpu_result_equal(q) - - -def test_unpivot_unsupported_cast_raises(): - df = pl.LazyFrame( - { - "a": ["x", "y", "z"], - "b": pl.Series([1, 3, 5], dtype=pl.Int16), - } - ) - - q = df.unpivot(["a", "b"]) - - assert_ir_translation_raises(q, NotImplementedError) diff --git a/python/cudf_polars/tests/test_parquet_filters.py b/python/cudf_polars/tests/test_parquet_filters.py new file mode 100644 index 00000000000..5c5f927e4f4 --- /dev/null +++ b/python/cudf_polars/tests/test_parquet_filters.py @@ -0,0 +1,59 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
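+#
+# A sketch of the scenario exercised in this module (values mirror the
+# fixtures below; the snippet is illustrative, not part of the test
+# file): the frame is written with row_group_size=3, so predicates such
+# as `pl.col("a").is_between(0, 2)` can prune whole row groups via their
+# statistics before any data reaches the GPU engine, e.g.
+#
+#   q = pl.scan_parquet("tmp.pq").filter(pl.col("a").is_between(0, 2))
+#   q.collect(engine=pl.GPUEngine(
+#       raise_on_fail=True, parquet_options={"chunked": False}
+#   ))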
+# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +import pytest + +import polars as pl + +from cudf_polars.testing.asserts import assert_gpu_result_equal + + +@pytest.fixture(scope="module") +def df(): + return pl.DataFrame( + { + "c": ["a", "b", "c", "d", "e", "f"], + "a": [1, 2, 3, None, 4, 5], + "b": pl.Series([None, None, 3, float("inf"), 4, 0], dtype=pl.Float64), + "d": [-1, 2, -3, None, 4, -5], + } + ) + + +@pytest.fixture(scope="module") +def pq_file(tmp_path_factory, df): + tmp_path = tmp_path_factory.mktemp("parquet_filter") + df.write_parquet(tmp_path / "tmp.pq", row_group_size=3) + return pl.scan_parquet(tmp_path / "tmp.pq") + + +@pytest.mark.parametrize( + "expr", + [ + pl.col("a").is_in([0, 1]), + pl.col("a").is_between(0, 2), + (pl.col("a") < 2).not_(), + pl.lit(2) > pl.col("a"), + pl.lit(2) >= pl.col("a"), + pl.lit(2) < pl.col("a"), + pl.lit(2) <= pl.col("a"), + pl.lit(0) == pl.col("a"), + pl.lit(1) != pl.col("a"), + pl.col("a") == pl.col("d"), + (pl.col("b") < pl.lit(2, dtype=pl.Float64).sqrt()), + (pl.col("a") >= pl.lit(2)) & (pl.col("b") > 0), + pl.col("b").is_finite(), + pl.col("a").is_null(), + pl.col("a").is_not_null(), + pl.col("a").abs().is_between(0, 2), + pl.col("a").ne_missing(pl.lit(None, dtype=pl.Int64)), + ], +) +@pytest.mark.parametrize("selection", [["c", "b"], ["a"], ["a", "c"], ["b"], "c"]) +@pytest.mark.parametrize("chunked", [False, True], ids=["unchunked", "chunked"]) +def test_scan_by_hand(expr, selection, pq_file, chunked): + q = pq_file.filter(expr).select(*selection) + assert_gpu_result_equal( + q, engine=pl.GPUEngine(raise_on_fail=True, parquet_options={"chunked": chunked}) + ) diff --git a/python/cudf_polars/tests/test_scan.py b/python/cudf_polars/tests/test_scan.py index 792b136acd8..9c58a24c065 100644 --- a/python/cudf_polars/tests/test_scan.py +++ b/python/cudf_polars/tests/test_scan.py @@ -13,18 +13,20 @@ assert_ir_translation_raises, ) +NO_CHUNK_ENGINE = pl.GPUEngine(raise_on_fail=True, parquet_options={"chunked": False}) + @pytest.fixture( params=[(None, None), ("row-index", 0), ("index", 10)], - ids=["no-row-index", "zero-offset-row-index", "offset-row-index"], + ids=["no_row_index", "zero_offset_row_index", "offset_row_index"], ) def row_index(request): return request.param @pytest.fixture( - params=[None, 2, 3], - ids=["all-rows", "n_rows-with-skip", "n_rows-no-skip"], + params=[None, 3], + ids=["all_rows", "some_rows"], ) def n_rows(request): return request.param @@ -51,21 +53,15 @@ def columns(request, row_index): @pytest.fixture( - params=[None, pl.col("c").is_not_null()], ids=["no-mask", "c-is-not-null"] + params=[None, pl.col("c").is_not_null()], ids=["no_mask", "c_is_not_null"] ) def mask(request): return request.param @pytest.fixture( - params=[ - None, - (1, 1), - ], - ids=[ - "no-slice", - "slice-second", - ], + params=[None, (1, 1)], + ids=["no_slice", "slice_second"], ) def slice(request): # For use in testing that we handle @@ -92,12 +88,16 @@ def make_source(df, path, format): ("csv", pl.scan_csv), ("ndjson", pl.scan_ndjson), ("parquet", pl.scan_parquet), + ("chunked_parquet", pl.scan_parquet), ], ) def test_scan( tmp_path, df, format, scan_fn, row_index, n_rows, columns, mask, slice, request ): name, offset = row_index + is_chunked = format == "chunked_parquet" + if is_chunked: + format = "parquet" make_source(df, tmp_path / "file", format) request.applymarker( pytest.mark.xfail( @@ -111,13 +111,15 @@ def test_scan( row_index_offset=offset, n_rows=n_rows, ) + engine = 
pl.GPUEngine(raise_on_fail=True, parquet_options={"chunked": is_chunked}) + if slice is not None: q = q.slice(*slice) if mask is not None: q = q.filter(mask) if columns is not None: q = q.select(*columns) - assert_gpu_result_equal(q) + assert_gpu_result_equal(q, engine=engine) def test_negative_slice_pushdown_raises(tmp_path): @@ -153,7 +155,7 @@ def test_scan_row_index_projected_out(tmp_path): q = pl.scan_parquet(tmp_path / "df.pq").with_row_index().select(pl.col("a")) - assert_gpu_result_equal(q) + assert_gpu_result_equal(q, engine=NO_CHUNK_ENGINE) def test_scan_csv_column_renames_projection_schema(tmp_path): @@ -323,6 +325,56 @@ def test_scan_parquet_only_row_index_raises(df, tmp_path): assert_ir_translation_raises(q, NotImplementedError) +@pytest.fixture( + scope="module", params=["no_slice", "skip_to_end", "skip_partial", "partial"] +) +def chunked_slice(request): + return request.param + + +@pytest.fixture(scope="module") +def large_df(df, tmpdir_factory, chunked_slice): + # Something big enough that we get more than a single chunk, + # empirically determined + df = pl.concat([df] * 1000) + df = pl.concat([df] * 10) + df = pl.concat([df] * 10) + path = str(tmpdir_factory.mktemp("data") / "large.pq") + make_source(df, path, "parquet") + n_rows = len(df) + q = pl.scan_parquet(path) + if chunked_slice == "no_slice": + return q + elif chunked_slice == "skip_to_end": + return q.slice(int(n_rows * 0.6), n_rows) + elif chunked_slice == "skip_partial": + return q.slice(int(n_rows * 0.6), int(n_rows * 0.2)) + else: + return q.slice(0, int(n_rows * 0.6)) + + +@pytest.mark.parametrize( + "chunk_read_limit", [0, 1, 2, 4, 8, 16], ids=lambda x: f"chunk_{x}" +) +@pytest.mark.parametrize( + "pass_read_limit", [0, 1, 2, 4, 8, 16], ids=lambda x: f"pass_{x}" +) +def test_scan_parquet_chunked( + request, chunked_slice, large_df, chunk_read_limit, pass_read_limit +): + assert_gpu_result_equal( + large_df, + engine=pl.GPUEngine( + raise_on_fail=True, + parquet_options={ + "chunked": True, + "chunk_read_limit": chunk_read_limit, + "pass_read_limit": pass_read_limit, + }, + ), + ) + + def test_scan_hf_url_raises(): q = pl.scan_csv("hf://datasets/scikit-learn/iris/Iris.csv") assert_ir_translation_raises(q, NotImplementedError) diff --git a/python/cudf_polars/tests/utils/test_broadcast.py b/python/cudf_polars/tests/utils/test_broadcast.py index 35aaef44e1f..3b3b4f0f8db 100644 --- a/python/cudf_polars/tests/utils/test_broadcast.py +++ b/python/cudf_polars/tests/utils/test_broadcast.py @@ -3,37 +3,39 @@ from __future__ import annotations -import pylibcudf as plc import pytest -from cudf_polars.containers import NamedColumn +import pylibcudf as plc + +from cudf_polars.containers import Column from cudf_polars.dsl.ir import broadcast @pytest.mark.parametrize("target", [4, None]) def test_broadcast_all_scalar(target): columns = [ - NamedColumn( + Column( plc.column_factories.make_numeric_column( plc.DataType(plc.TypeId.INT8), 1, plc.MaskState.ALL_VALID ), - f"col{i}", + name=f"col{i}", ) for i in range(3) ] result = broadcast(*columns, target_length=target) expected = 1 if target is None else target + assert [c.name for c in result] == [f"col{i}" for i in range(3)] assert all(column.obj.size() == expected for column in result) def test_invalid_target_length(): columns = [ - NamedColumn( + Column( plc.column_factories.make_numeric_column( plc.DataType(plc.TypeId.INT8), 4, plc.MaskState.ALL_VALID ), - f"col{i}", + name=f"col{i}", ) for i in range(3) ] @@ -43,11 +45,11 @@ def test_invalid_target_length(): def 
test_broadcast_mismatching_column_lengths(): columns = [ - NamedColumn( + Column( plc.column_factories.make_numeric_column( plc.DataType(plc.TypeId.INT8), i + 1, plc.MaskState.ALL_VALID ), - f"col{i}", + name=f"col{i}", ) for i in range(3) ] @@ -58,16 +60,17 @@ def test_broadcast_mismatching_column_lengths(): @pytest.mark.parametrize("nrows", [0, 5]) def test_broadcast_with_scalars(nrows): columns = [ - NamedColumn( + Column( plc.column_factories.make_numeric_column( plc.DataType(plc.TypeId.INT8), nrows if i == 0 else 1, plc.MaskState.ALL_VALID, ), - f"col{i}", + name=f"col{i}", ) for i in range(3) ] result = broadcast(*columns) + assert [c.name for c in result] == [f"col{i}" for i in range(3)] assert all(column.obj.size() == nrows for column in result) diff --git a/python/cudf_polars/tests/utils/test_dtypes.py b/python/cudf_polars/tests/utils/test_dtypes.py index bbdb4faa256..f63c2079e04 100644 --- a/python/cudf_polars/tests/utils/test_dtypes.py +++ b/python/cudf_polars/tests/utils/test_dtypes.py @@ -7,7 +7,20 @@ import polars as pl -from cudf_polars.utils.dtypes import from_polars +import pylibcudf as plc + +from cudf_polars.utils.dtypes import from_polars, is_order_preserving_cast + +INT8 = plc.DataType(plc.TypeId.INT8) +INT16 = plc.DataType(plc.TypeId.INT16) +INT32 = plc.DataType(plc.TypeId.INT32) +INT64 = plc.DataType(plc.TypeId.INT64) +UINT8 = plc.DataType(plc.TypeId.UINT8) +UINT16 = plc.DataType(plc.TypeId.UINT16) +UINT32 = plc.DataType(plc.TypeId.UINT32) +UINT64 = plc.DataType(plc.TypeId.UINT64) +FLOAT32 = plc.DataType(plc.TypeId.FLOAT32) +FLOAT64 = plc.DataType(plc.TypeId.FLOAT64) @pytest.mark.parametrize( @@ -30,3 +43,19 @@ def test_unhandled_dtype_conversion_raises(pltype): with pytest.raises(NotImplementedError): _ = from_polars(pltype) + + +def test_is_order_preserving_cast(): + assert is_order_preserving_cast(INT8, INT8) # Same type + assert is_order_preserving_cast(INT8, INT16) # Smaller type + assert is_order_preserving_cast(INT8, FLOAT32) # Int to large enough float + assert is_order_preserving_cast(UINT8, UINT16) # Unsigned to larger unsigned + assert is_order_preserving_cast(UINT8, FLOAT32) # Unsigned to large enough float + assert is_order_preserving_cast(FLOAT32, FLOAT64) # Float to larger float + assert is_order_preserving_cast(INT64, FLOAT32) # Int any float + assert is_order_preserving_cast(FLOAT32, INT32) # Float to undersized int + assert is_order_preserving_cast(FLOAT32, INT64) # float to large int + + assert not is_order_preserving_cast(INT16, INT8) # Bigger type + assert not is_order_preserving_cast(INT8, UINT8) # Different signedness + assert not is_order_preserving_cast(FLOAT64, FLOAT32) # Smaller float diff --git a/python/custreamz/README.md b/python/custreamz/README.md index 8da17ef09dc..e81fc35c544 100644 --- a/python/custreamz/README.md +++ b/python/custreamz/README.md @@ -26,7 +26,7 @@ tips_df = consumer.read_gdf(topic="custreamz_tips", partition=0, start=0, end=10000, - message_format="CSV") + message_format="csv") print(tips_df.head()) tips_df['tip_percentage'] = tips_df['tip'] / tips_df['total_bill'] * 100 diff --git a/python/custreamz/custreamz/kafka.py b/python/custreamz/custreamz/kafka.py index 0def0ba746e..166b7d98592 100644 --- a/python/custreamz/custreamz/kafka.py +++ b/python/custreamz/custreamz/kafka.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
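For reference alongside the `test_is_order_preserving_cast` expectations above, here is a standalone sketch that reproduces the same truth table. It is only an illustration consistent with those assertions, not the implementation that lives in `cudf_polars.utils.dtypes.is_order_preserving_cast`:

```python
import pylibcudf as plc

_SIGNED = {plc.TypeId.INT8: 1, plc.TypeId.INT16: 2,
           plc.TypeId.INT32: 4, plc.TypeId.INT64: 8}
_UNSIGNED = {plc.TypeId.UINT8: 1, plc.TypeId.UINT16: 2,
             plc.TypeId.UINT32: 4, plc.TypeId.UINT64: 8}
_FLOATS = {plc.TypeId.FLOAT32: 4, plc.TypeId.FLOAT64: 8}


def order_preserving(frm: plc.DataType, to: plc.DataType) -> bool:
    """Sketch: does casting ``frm`` -> ``to`` preserve sort order?"""
    f, t = frm.id(), to.id()
    if f == t:
        return True
    for family in (_SIGNED, _UNSIGNED):
        if f in family and t in family:
            # Widening within one signedness keeps order.
            return family[t] >= family[f]
    if f in _FLOATS and t in _FLOATS:
        return _FLOATS[t] >= _FLOATS[f]
    # int <-> float: values may round, but rounding is monotonic,
    # so relative order survives in either direction.
    ints = _SIGNED.keys() | _UNSIGNED.keys()
    if f in ints and t in _FLOATS:
        return True
    if f in _FLOATS and t in ints:
        return True
    return False
```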
import confluent_kafka as ck from cudf_kafka._lib.kafka import KafkaDatasource @@ -151,9 +151,14 @@ def read_gdf( "parquet": cudf.io.read_parquet, } - result = cudf_readers[message_format]( - kafka_datasource, engine="cudf", lines=True - ) + if message_format == "json": + result = cudf_readers[message_format]( + kafka_datasource, engine="cudf", lines=True + ) + else: + result = cudf_readers[message_format]( + kafka_datasource, engine="cudf" + ) # Close up the cudf datasource instance # TODO: Ideally the C++ destructor should handle the @@ -288,4 +293,4 @@ def poll(self, timeout=None): (default: infinite (None translated into -1 in the library)). (Seconds) """ - return self.ck.poll(timeout) + return self.ck_consumer.poll(timeout) diff --git a/python/custreamz/custreamz/tests/conftest.py b/python/custreamz/custreamz/tests/conftest.py index 1cda9b71387..c5135bc6414 100644 --- a/python/custreamz/custreamz/tests/conftest.py +++ b/python/custreamz/custreamz/tests/conftest.py @@ -2,6 +2,7 @@ import socket import pytest + from custreamz import kafka diff --git a/python/custreamz/custreamz/tests/test_dataframes.py b/python/custreamz/custreamz/tests/test_dataframes.py index bae4b051cae..8c0130d2818 100644 --- a/python/custreamz/custreamz/tests/test_dataframes.py +++ b/python/custreamz/custreamz/tests/test_dataframes.py @@ -377,24 +377,16 @@ def test_setitem_overwrites(stream): [ ({}, "sum"), ({}, "mean"), - pytest.param({}, "min"), + ({}, "min"), pytest.param( {}, "median", marks=pytest.mark.xfail(reason="Unavailable for rolling objects"), ), - pytest.param({}, "max"), - pytest.param( - {}, - "var", - marks=pytest.mark.xfail(reason="Unavailable for rolling objects"), - ), - pytest.param({}, "count"), - pytest.param( - {"ddof": 0}, - "std", - marks=pytest.mark.xfail(reason="Unavailable for rolling objects"), - ), + ({}, "max"), + ({}, "var"), + ({}, "count"), + ({"ddof": 0}, "std"), pytest.param( {"quantile": 0.5}, "quantile", diff --git a/python/custreamz/pyproject.toml b/python/custreamz/pyproject.toml index 0b98af95ecd..dd67a019c77 100644 --- a/python/custreamz/pyproject.toml +++ b/python/custreamz/pyproject.toml @@ -20,8 +20,8 @@ license = { text = "Apache 2.0" } requires-python = ">=3.10" dependencies = [ "confluent-kafka>=2.5.0,<2.6.0a0", - "cudf==24.10.*", - "cudf_kafka==24.10.*", + "cudf==24.12.*,>=0.0.0a0", + "cudf_kafka==24.12.*,>=0.0.0a0", "streamz", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
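# (Editorial note) The `>=0.0.0a0` lower bound on the cudf / cudf_kafka
# pins above is what lets pip resolve RAPIDS nightly (alpha) builds;
# the `==24.12.*` component still restricts installs to the matching
# release series.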
classifiers = [ @@ -65,57 +65,43 @@ include = [ ] exclude = ["*tests*"] -[tool.isort] -line_length = 79 -multi_line_output = 3 -include_trailing_comma = true -force_grid_wrap = 0 -combine_as_imports = true -order_by_type = true -known_dask = [ - "dask", - "distributed", - "dask_cuda", -] -known_rapids = [ - "rmm", - "cudf", - "dask_cudf", -] -known_first_party = [ - "streamz", -] -default_section = "THIRDPARTY" -sections = [ - "FUTURE", - "STDLIB", - "THIRDPARTY", - "DASK", - "RAPIDS", - "FIRSTPARTY", - "LOCALFOLDER", -] -skip = [ - "thirdparty", - ".eggs", - ".git", - ".hg", - ".mypy_cache", - ".tox", - ".venv", - "_build", - "buck-out", - "build", - "dist", - "__init__.py", +[tool.pydistcheck] +select = [ + "distro-too-large-compressed", ] +# PyPI limit is 100 MiB, fail CI before we get too close to that +max_allowed_size_compressed = '75M' + +[tool.ruff] +extend = "../../pyproject.toml" + +[tool.ruff.lint.isort] +combine-as-imports = true +known-first-party = ["streamz"] +section-order = ["future", "standard-library", "third-party", "dask", "rapids", "first-party", "local-folder"] + +[tool.ruff.lint.isort.sections] +dask = ["dask", "distributed", "dask_cuda"] +rapids = ["rmm", "cudf", "dask_cudf"] + +[tool.ruff.lint.per-file-ignores] +"__init__.py" = ["E402", "F401"] + [tool.pytest.ini_options] +addopts = "--tb=native --strict-config --strict-markers" +empty_parameter_set_mark = "fail_at_collect" filterwarnings = [ "error", + # https://github.com/rapidsai/build-planning/issues/116 + "ignore:.*cuda..* module is deprecated.*:DeprecationWarning", "ignore:unclosed 2024.2.0, we can silence the loud deprecation -# warning before importing `dask.dataframe` (this won't -# do anything for dask==2024.2.0) -config.set({"dataframe.query-planning-warning": False}) +import warnings +from importlib import import_module +from dask import config import dask.dataframe as dd -from dask.dataframe import from_delayed +from dask.dataframe import from_delayed # noqa: E402 -import cudf +import cudf # noqa: E402 -from . import backends -from ._version import __git_commit__, __version__ -from .core import concat, from_cudf, from_dask_dataframe -from .expr import QUERY_PLANNING_ON +from . import backends # noqa: E402, F401 +from ._version import __git_commit__, __version__ # noqa: E402, F401 +from .core import concat, from_cudf, DataFrame, Index, Series # noqa: F401 + +QUERY_PLANNING_ON = dd.DASK_EXPR_ENABLED def read_csv(*args, **kwargs): @@ -38,26 +36,44 @@ def read_parquet(*args, **kwargs): return dd.read_parquet(*args, **kwargs) -def raise_not_implemented_error(attr_name): +def _deprecated_api(old_api, new_api=None, rec=None): def inner_func(*args, **kwargs): + if new_api: + # Use alternative + msg = f"{old_api} is now deprecated. " + msg += rec or f"Please use {new_api} instead." + warnings.warn(msg, FutureWarning) + new_attr = new_api.split(".") + module = import_module(".".join(new_attr[:-1])) + return getattr(module, new_attr[-1])(*args, **kwargs) + + # No alternative - raise an error raise NotImplementedError( - f"Top-level {attr_name} API is not available for dask-expr." + f"{old_api} is no longer supported. " + (rec or "") ) return inner_func if QUERY_PLANNING_ON: - from .expr._collection import DataFrame, Index, Series + from ._expr.expr import _patch_dask_expr + from . 
import io # noqa: F401 - groupby_agg = raise_not_implemented_error("groupby_agg") + groupby_agg = _deprecated_api("dask_cudf.groupby_agg") read_text = DataFrame.read_text - to_orc = raise_not_implemented_error("to_orc") + _patch_dask_expr() else: - from .core import DataFrame, Index, Series - from .groupby import groupby_agg - from .io import read_text, to_orc + from ._legacy.groupby import groupby_agg # noqa: F401 + from ._legacy.io import read_text # noqa: F401 + from . import io # noqa: F401 + + +to_orc = _deprecated_api( + "dask_cudf.to_orc", + new_api="dask_cudf._legacy.io.to_orc", + rec="Please use DataFrame.to_orc instead.", +) __all__ = [ @@ -65,7 +81,6 @@ def inner_func(*args, **kwargs): "Series", "Index", "from_cudf", - "from_dask_dataframe", "concat", "from_delayed", ] diff --git a/python/cudf_kafka/cudf_kafka/tests/pytest.ini b/python/dask_cudf/dask_cudf/_expr/__init__.py similarity index 56% rename from python/cudf_kafka/cudf_kafka/tests/pytest.ini rename to python/dask_cudf/dask_cudf/_expr/__init__.py index 7b0a9f29fb1..3c827d4ff59 100644 --- a/python/cudf_kafka/cudf_kafka/tests/pytest.ini +++ b/python/dask_cudf/dask_cudf/_expr/__init__.py @@ -1,4 +1 @@ # Copyright (c) 2024, NVIDIA CORPORATION. - -[pytest] -addopts = --tb=native diff --git a/python/dask_cudf/dask_cudf/accessors.py b/python/dask_cudf/dask_cudf/_expr/accessors.py similarity index 100% rename from python/dask_cudf/dask_cudf/accessors.py rename to python/dask_cudf/dask_cudf/_expr/accessors.py diff --git a/python/dask_cudf/dask_cudf/expr/_collection.py b/python/dask_cudf/dask_cudf/_expr/collection.py similarity index 88% rename from python/dask_cudf/dask_cudf/expr/_collection.py rename to python/dask_cudf/dask_cudf/_expr/collection.py index 907abaa2bfc..fdf7d8630e9 100644 --- a/python/dask_cudf/dask_cudf/expr/_collection.py +++ b/python/dask_cudf/dask_cudf/_expr/collection.py @@ -34,22 +34,6 @@ class CudfFrameBase(FrameBase): - def to_dask_dataframe(self, **kwargs): - """Create a dask.dataframe object from a dask_cudf object - - WARNING: This API is deprecated, and may not work properly. - Please use `*.to_backend("pandas")` to convert the - underlying data to pandas. - """ - - warnings.warn( - "The `to_dask_dataframe` API is now deprecated. 
" - "Please use `*.to_backend('pandas')` instead.", - FutureWarning, - ) - - return self.to_backend("pandas", **kwargs) - def _prepare_cov_corr(self, min_periods, numeric_only): # Upstream version of this method sets min_periods # to 2 by default (which is not supported by cudf) @@ -94,7 +78,7 @@ def var( def rename_axis( self, mapper=no_default, index=no_default, columns=no_default, axis=0 ): - from dask_cudf.expr._expr import RenameAxisCudf + from dask_cudf._expr.expr import RenameAxisCudf return new_collection( RenameAxisCudf( @@ -136,7 +120,7 @@ def groupby( dropna=None, **kwargs, ): - from dask_cudf.expr._groupby import GroupBy + from dask_cudf._expr.groupby import GroupBy if isinstance(by, FrameBase) and not isinstance(by, DXSeries): raise ValueError( @@ -169,13 +153,16 @@ def groupby( ) def to_orc(self, *args, **kwargs): - return self.to_legacy_dataframe().to_orc(*args, **kwargs) + from dask_cudf._legacy.io import to_orc + + return to_orc(self, *args, **kwargs) + # return self.to_legacy_dataframe().to_orc(*args, **kwargs) @staticmethod def read_text(*args, **kwargs): from dask_expr import from_legacy_dataframe - from dask_cudf.io.text import read_text as legacy_read_text + from dask_cudf._legacy.io.text import read_text as legacy_read_text ddf = legacy_read_text(*args, **kwargs) return from_legacy_dataframe(ddf) @@ -183,19 +170,19 @@ def read_text(*args, **kwargs): class Series(DXSeries, CudfFrameBase): def groupby(self, by, **kwargs): - from dask_cudf.expr._groupby import SeriesGroupBy + from dask_cudf._expr.groupby import SeriesGroupBy return SeriesGroupBy(self, by, **kwargs) @cached_property def list(self): - from dask_cudf.accessors import ListMethods + from dask_cudf._expr.accessors import ListMethods return ListMethods(self) @cached_property def struct(self): - from dask_cudf.accessors import StructMethods + from dask_cudf._expr.accessors import StructMethods return StructMethods(self) diff --git a/python/dask_cudf/dask_cudf/expr/_expr.py b/python/dask_cudf/dask_cudf/_expr/expr.py similarity index 64% rename from python/dask_cudf/dask_cudf/expr/_expr.py rename to python/dask_cudf/dask_cudf/_expr/expr.py index af83a01da98..8b91e53604c 100644 --- a/python/dask_cudf/dask_cudf/expr/_expr.py +++ b/python/dask_cudf/dask_cudf/_expr/expr.py @@ -2,15 +2,16 @@ import functools import dask_expr._shuffle as _shuffle_module -import pandas as pd from dask_expr import new_collection from dask_expr._cumulative import CumulativeBlockwise from dask_expr._expr import Elemwise, Expr, RenameAxis, VarColumns from dask_expr._reductions import Reduction, Var -from dask_expr.io.io import FusedParquetIO -from dask_expr.io.parquet import ReadParquetPyarrowFS -from dask.dataframe.core import is_dataframe_like, make_meta, meta_nonempty +from dask.dataframe.core import ( + is_dataframe_like, + make_meta, + meta_nonempty, +) from dask.dataframe.dispatch import is_categorical_dtype from dask.typing import no_default @@ -21,92 +22,6 @@ ## -class CudfFusedParquetIO(FusedParquetIO): - @staticmethod - def _load_multiple_files( - frag_filters, - columns, - schema, - *to_pandas_args, - ): - import pyarrow as pa - - from dask.base import apply, tokenize - from dask.threaded import get - - token = tokenize(frag_filters, columns, schema) - name = f"pq-file-{token}" - dsk = { - (name, i): ( - CudfReadParquetPyarrowFS._fragment_to_table, - frag, - filter, - columns, - schema, - ) - for i, (frag, filter) in enumerate(frag_filters) - } - dsk[name] = ( - apply, - pa.concat_tables, - [list(dsk.keys())], - 
{"promote_options": "permissive"}, - ) - return CudfReadParquetPyarrowFS._table_to_pandas( - get(dsk, name), - *to_pandas_args, - ) - - -class CudfReadParquetPyarrowFS(ReadParquetPyarrowFS): - @functools.cached_property - def _dataset_info(self): - from dask_cudf.io.parquet import set_object_dtypes_from_pa_schema - - dataset_info = super()._dataset_info - meta_pd = dataset_info["base_meta"] - if isinstance(meta_pd, cudf.DataFrame): - return dataset_info - - # Convert to cudf - # (drop unsupported timezone information) - for k, v in meta_pd.dtypes.items(): - if isinstance(v, pd.DatetimeTZDtype) and v.tz is not None: - meta_pd[k] = meta_pd[k].dt.tz_localize(None) - meta_cudf = cudf.from_pandas(meta_pd) - - # Re-set "object" dtypes to align with pa schema - kwargs = dataset_info.get("kwargs", {}) - set_object_dtypes_from_pa_schema( - meta_cudf, - kwargs.get("schema", None), - ) - - dataset_info["base_meta"] = meta_cudf - self.operands[type(self)._parameters.index("_dataset_info_cache")] = ( - dataset_info - ) - return dataset_info - - @staticmethod - def _table_to_pandas( - table, - index_name, - *args, - ): - df = cudf.DataFrame.from_arrow(table) - if index_name is not None: - df = df.set_index(index_name) - return df - - def _tune_up(self, parent): - if self._fusion_compression_factor >= 1: - return - if isinstance(parent, CudfFusedParquetIO): - return - return parent.substitute(self, CudfFusedParquetIO(self)) - - class RenameAxisCudf(RenameAxis): # TODO: Remove this after rename_axis is supported in cudf # (See: https://github.com/rapidsai/cudf/issues/16895) @@ -160,10 +75,6 @@ def _kwargs(self) -> dict: return {"axis": self.axis, "skipna": self.skipna} -CumulativeBlockwise._args = PatchCumulativeBlockwise._args -CumulativeBlockwise._kwargs = PatchCumulativeBlockwise._kwargs - - # The upstream Var code uses `Series.values`, and relies on numpy # for most of the logic. Unfortunately, cudf -> cupy conversion # is not supported for data containing null values. Therefore, @@ -174,7 +85,13 @@ def _kwargs(self) -> dict: class VarCudf(Reduction): # Uses the parallel version of Welford's online algorithm (Chan '79) # (http://i.stanford.edu/pub/cstr/reports/cs/tr/79/773/CS-TR-79-773.pdf) - _parameters = ["frame", "skipna", "ddof", "numeric_only", "split_every"] + _parameters = [ + "frame", + "skipna", + "ddof", + "numeric_only", + "split_every", + ] _defaults = { "skipna": True, "ddof": 1, @@ -242,7 +159,12 @@ def reduction_aggregate(cls, vals, ddof=1): def _patched_var( - self, axis=0, skipna=True, ddof=1, numeric_only=False, split_every=False + self, + axis=0, + skipna=True, + ddof=1, + numeric_only=False, + split_every=False, ): if axis == 0: if hasattr(self._meta, "to_pandas"): @@ -255,9 +177,6 @@ def _patched_var( raise ValueError(f"axis={axis} not supported. 
Please specify 0 or 1") -Expr.var = _patched_var - - # Temporary work-around for missing cudf + categorical support # See: https://github.com/rapidsai/cudf/issues/11795 # TODO: Fix RepartitionQuantiles and remove this in cudf>24.06 @@ -277,4 +196,15 @@ def _patched_get_divisions(frame, other, *args, **kwargs): return _original_get_divisions(frame, other, *args, **kwargs) -_shuffle_module._get_divisions = _patched_get_divisions +_PATCHED = False + + +def _patch_dask_expr(): + global _PATCHED + + if not _PATCHED: + CumulativeBlockwise._args = PatchCumulativeBlockwise._args + CumulativeBlockwise._kwargs = PatchCumulativeBlockwise._kwargs + Expr.var = _patched_var + _shuffle_module._get_divisions = _patched_get_divisions + _PATCHED = True diff --git a/python/dask_cudf/dask_cudf/_expr/groupby.py b/python/dask_cudf/dask_cudf/_expr/groupby.py new file mode 100644 index 00000000000..0242fac6e72 --- /dev/null +++ b/python/dask_cudf/dask_cudf/_expr/groupby.py @@ -0,0 +1,335 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +import functools + +import pandas as pd +from dask_expr._collection import new_collection +from dask_expr._groupby import ( + DecomposableGroupbyAggregation, + GroupBy as DXGroupBy, + GroupbyAggregation, + SeriesGroupBy as DXSeriesGroupBy, + SingleAggregation, +) +from dask_expr._util import is_scalar + +from dask.dataframe.core import _concat +from dask.dataframe.groupby import Aggregation + +from cudf.core.groupby.groupby import _deprecate_collect + +## +## Fused groupby aggregations +## + + +def _get_spec_info(gb): + if isinstance(gb.arg, (dict, list)): + aggs = gb.arg.copy() + else: + aggs = gb.arg + + if gb._slice and not isinstance(aggs, dict): + aggs = {gb._slice: aggs} + + gb_cols = gb._by_columns + if isinstance(gb_cols, str): + gb_cols = [gb_cols] + columns = [c for c in gb.frame.columns if c not in gb_cols] + if not isinstance(aggs, dict): + aggs = {col: aggs for col in columns} + + # Assert if our output will have a MultiIndex; this will be the case if + # any value in the `aggs` dict is not a string (i.e. multiple/named + # aggregations per column) + str_cols_out = True + aggs_renames = {} + for col in aggs: + if isinstance(aggs[col], str) or callable(aggs[col]): + aggs[col] = [aggs[col]] + elif isinstance(aggs[col], dict): + str_cols_out = False + col_aggs = [] + for k, v in aggs[col].items(): + aggs_renames[col, v] = k + col_aggs.append(v) + aggs[col] = col_aggs + else: + str_cols_out = False + if col in gb_cols: + columns.append(col) + + return { + "aggs": aggs, + "columns": columns, + "str_cols_out": str_cols_out, + "aggs_renames": aggs_renames, + } + + +def _get_meta(gb): + spec_info = gb.spec_info + gb_cols = gb._by_columns + aggs = spec_info["aggs"].copy() + aggs_renames = spec_info["aggs_renames"] + if spec_info["str_cols_out"]: + # Metadata should use `str` for dict values if that is + # what the user originally specified (column names will + # be str, rather than tuples). 
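+        # For example, gb.agg({"x": "sum"}) reaches this point as
+        # aggs = {"x": ["sum"]}; unwrapping the single-element lists
+        # keeps the metadata's column names plain strings ("x") rather
+        # than ("x", "sum") tuples, matching the shape of the user's
+        # original request.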
+ for col in aggs: + aggs[col] = aggs[col][0] + _meta = gb.frame._meta.groupby(gb_cols).agg(aggs) + if aggs_renames: + col_array = [] + agg_array = [] + for col, agg in _meta.columns: + col_array.append(col) + agg_array.append(aggs_renames.get((col, agg), agg)) + _meta.columns = pd.MultiIndex.from_arrays([col_array, agg_array]) + return _meta + + +class DecomposableCudfGroupbyAgg(DecomposableGroupbyAggregation): + sep = "___" + + @functools.cached_property + def spec_info(self): + return _get_spec_info(self) + + @functools.cached_property + def _meta(self): + return _get_meta(self) + + @property + def shuffle_by_index(self): + return False # We always group by column(s) + + @classmethod + def chunk(cls, df, *by, **kwargs): + from dask_cudf._legacy.groupby import _groupby_partition_agg + + return _groupby_partition_agg(df, **kwargs) + + @classmethod + def combine(cls, inputs, **kwargs): + from dask_cudf._legacy.groupby import _tree_node_agg + + return _tree_node_agg(_concat(inputs), **kwargs) + + @classmethod + def aggregate(cls, inputs, **kwargs): + from dask_cudf._legacy.groupby import _finalize_gb_agg + + return _finalize_gb_agg(_concat(inputs), **kwargs) + + @property + def chunk_kwargs(self) -> dict: + dropna = True if self.dropna is None else self.dropna + return { + "gb_cols": self._by_columns, + "aggs": self.spec_info["aggs"], + "columns": self.spec_info["columns"], + "dropna": dropna, + "sort": self.sort, + "sep": self.sep, + } + + @property + def combine_kwargs(self) -> dict: + dropna = True if self.dropna is None else self.dropna + return { + "gb_cols": self._by_columns, + "dropna": dropna, + "sort": self.sort, + "sep": self.sep, + } + + @property + def aggregate_kwargs(self) -> dict: + dropna = True if self.dropna is None else self.dropna + final_columns = self._slice or self._meta.columns + return { + "gb_cols": self._by_columns, + "aggs": self.spec_info["aggs"], + "columns": self.spec_info["columns"], + "final_columns": final_columns, + "as_index": True, + "dropna": dropna, + "sort": self.sort, + "sep": self.sep, + "str_cols_out": self.spec_info["str_cols_out"], + "aggs_renames": self.spec_info["aggs_renames"], + } + + +class CudfGroupbyAgg(GroupbyAggregation): + @functools.cached_property + def spec_info(self): + return _get_spec_info(self) + + @functools.cached_property + def _meta(self): + return _get_meta(self) + + def _lower(self): + return DecomposableCudfGroupbyAgg( + self.frame, + self.arg, + self.observed, + self.dropna, + self.split_every, + self.split_out, + self.sort, + self.shuffle_method, + self._slice, + *self.by, + ) + + +def _maybe_get_custom_expr( + gb, + aggs, + split_every=None, + split_out=None, + shuffle_method=None, + **kwargs, +): + from dask_cudf._legacy.groupby import ( + OPTIMIZED_AGGS, + _aggs_optimized, + _redirect_aggs, + ) + + if kwargs: + # Unsupported key-word arguments + return None + + if not hasattr(gb.obj._meta, "to_pandas"): + # Not cuDF-backed data + return None + + _aggs = _redirect_aggs(aggs) + if not _aggs_optimized(_aggs, OPTIMIZED_AGGS): + # One or more aggregations are unsupported + return None + + return CudfGroupbyAgg( + gb.obj.expr, + _aggs, + gb.observed, + gb.dropna, + split_every, + split_out, + gb.sort, + shuffle_method, + gb._slice, + *gb.by, + ) + + +## +## Custom groupby classes +## + + +class ListAgg(SingleAggregation): + @staticmethod + def groupby_chunk(arg): + return arg.agg(list) + + @staticmethod + def groupby_aggregate(arg): + gb = arg.agg(list) + if gb.ndim > 1: + for col in gb.columns: + gb[col] = 
gb[col].list.concat() + return gb + else: + return gb.list.concat() + + +list_aggregation = Aggregation( + name="list", + chunk=ListAgg.groupby_chunk, + agg=ListAgg.groupby_aggregate, +) + + +def _translate_arg(arg): + # Helper function to translate args so that + # they can be processed correctly by upstream + # dask & dask-expr. Right now, the only necessary + # translation is list aggregations. + if isinstance(arg, dict): + return {k: _translate_arg(v) for k, v in arg.items()} + elif isinstance(arg, list): + return [_translate_arg(x) for x in arg] + elif arg in ("collect", "list", list): + return list_aggregation + else: + return arg + + +# We define our own GroupBy classes in Dask cuDF for +# the following reasons: +# (1) We want to use a custom `aggregate` algorithm +# that performs multiple aggregations on the +# same dataframe partition at once. The upstream +# algorithm breaks distinct aggregations into +# separate tasks. +# (2) We need to work around missing `observed=False` +# support: +# https://github.com/rapidsai/cudf/issues/15173 + + +class GroupBy(DXGroupBy): + def __init__(self, *args, observed=None, **kwargs): + observed = observed if observed is not None else True + super().__init__(*args, observed=observed, **kwargs) + + def __getitem__(self, key): + if is_scalar(key): + return SeriesGroupBy( + self.obj, + by=self.by, + slice=key, + sort=self.sort, + dropna=self.dropna, + observed=self.observed, + ) + g = GroupBy( + self.obj, + by=self.by, + slice=key, + sort=self.sort, + dropna=self.dropna, + observed=self.observed, + group_keys=self.group_keys, + ) + return g + + def collect(self, **kwargs): + _deprecate_collect() + return self._single_agg(ListAgg, **kwargs) + + def aggregate(self, arg, fused=True, **kwargs): + if ( + fused + and (expr := _maybe_get_custom_expr(self, arg, **kwargs)) + is not None + ): + return new_collection(expr) + else: + return super().aggregate(_translate_arg(arg), **kwargs) + + +class SeriesGroupBy(DXSeriesGroupBy): + def __init__(self, *args, observed=None, **kwargs): + observed = observed if observed is not None else True + super().__init__(*args, observed=observed, **kwargs) + + def collect(self, **kwargs): + _deprecate_collect() + return self._single_agg(ListAgg, **kwargs) + + def aggregate(self, arg, **kwargs): + return super().aggregate(_translate_arg(arg), **kwargs) diff --git a/python/cudf_polars/tests/pytest.ini b/python/dask_cudf/dask_cudf/_legacy/__init__.py similarity index 56% rename from python/cudf_polars/tests/pytest.ini rename to python/dask_cudf/dask_cudf/_legacy/__init__.py index 7b0a9f29fb1..3c827d4ff59 100644 --- a/python/cudf_polars/tests/pytest.ini +++ b/python/dask_cudf/dask_cudf/_legacy/__init__.py @@ -1,4 +1 @@ # Copyright (c) 2024, NVIDIA CORPORATION. - -[pytest] -addopts = --tb=native diff --git a/python/dask_cudf/dask_cudf/_legacy/core.py b/python/dask_cudf/dask_cudf/_legacy/core.py new file mode 100644 index 00000000000..d6beb775a5e --- /dev/null +++ b/python/dask_cudf/dask_cudf/_legacy/core.py @@ -0,0 +1,711 @@ +# Copyright (c) 2018-2024, NVIDIA CORPORATION. 
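Editorial aside on the groupby classes defined above: with query planning enabled, a supported aggregation spec is fused into a single `CudfGroupbyAgg` expression, and `"list"`/`"collect"` requests are rewritten by `_translate_arg`. A minimal usage sketch, assuming a GPU and a dask-expr-enabled environment (the data values are illustrative):

```python
import cudf
import dask_cudf

# Small cuDF-backed Dask collection (two partitions).
gdf = cudf.DataFrame({"key": [1, 1, 2], "x": [1.0, 2.0, 3.0]})
ddf = dask_cudf.from_cudf(gdf, npartitions=2)

# A supported spec takes the fused single-pass path (fused=True is the
# default); anything unsupported falls back to the upstream algorithm.
out = ddf.groupby("key").aggregate({"x": ["sum", "max"]})
print(out.compute())
```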
+ +import math +import warnings + +import numpy as np +import pandas as pd +from tlz import partition_all + +from dask import dataframe as dd +from dask.base import normalize_token, tokenize +from dask.dataframe.core import ( + Scalar, + handle_out, + make_meta as dask_make_meta, + map_partitions, +) +from dask.dataframe.utils import raise_on_meta_error +from dask.highlevelgraph import HighLevelGraph +from dask.utils import M, OperatorMethodMixin, apply, derived_from, funcname + +import cudf +from cudf import _lib as libcudf +from cudf.utils.performance_tracking import _dask_cudf_performance_tracking + +from dask_cudf._expr.accessors import ListMethods, StructMethods +from dask_cudf._legacy import sorting +from dask_cudf._legacy.sorting import ( + _deprecate_shuffle_kwarg, + _get_shuffle_method, +) + + +class _Frame(dd.core._Frame, OperatorMethodMixin): + """Superclass for DataFrame and Series + + Parameters + ---------- + dsk : dict + The dask graph to compute this DataFrame + name : str + The key prefix that specifies which keys in the dask comprise this + particular DataFrame / Series + meta : cudf.DataFrame, cudf.Series, or cudf.Index + An empty cudf object with names, dtypes, and indices matching the + expected output. + divisions : tuple of index values + Values along which we partition our blocks on the index + """ + + def _is_partition_type(self, meta): + return isinstance(meta, self._partition_type) + + def __repr__(self): + s = "" + return s % (type(self).__name__, len(self.dask), self.npartitions) + + +normalize_token.register(_Frame, lambda a: a._name) + + +class DataFrame(_Frame, dd.core.DataFrame): + """ + A distributed Dask DataFrame where the backing dataframe is a + :class:`cuDF DataFrame `. + + Typically you would not construct this object directly, but rather + use one of Dask-cuDF's IO routines. + + Most operations on :doc:`Dask DataFrames ` are + supported, with many of the same caveats. 
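+
+    A minimal usage sketch (assumes a GPU is available):
+
+    >>> import cudf  # doctest: +SKIP
+    >>> import dask_cudf  # doctest: +SKIP
+    >>> ddf = dask_cudf.from_cudf(
+    ...     cudf.DataFrame({"a": [3, 1, 2]}), npartitions=2
+    ... )  # doctest: +SKIP
+    >>> ddf.sort_values("a").compute()  # doctest: +SKIP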
+ + """ + + _partition_type = cudf.DataFrame + + @_dask_cudf_performance_tracking + def _assign_column(self, k, v): + def assigner(df, k, v): + out = df.copy() + out[k] = v + return out + + meta = assigner(self._meta, k, dask_make_meta(v)) + return self.map_partitions(assigner, k, v, meta=meta) + + @_dask_cudf_performance_tracking + def apply_rows(self, func, incols, outcols, kwargs=None, cache_key=None): + import uuid + + if kwargs is None: + kwargs = {} + + if cache_key is None: + cache_key = uuid.uuid4() + + def do_apply_rows(df, func, incols, outcols, kwargs): + return df.apply_rows( + func, incols, outcols, kwargs, cache_key=cache_key + ) + + meta = do_apply_rows(self._meta, func, incols, outcols, kwargs) + return self.map_partitions( + do_apply_rows, func, incols, outcols, kwargs, meta=meta + ) + + @_deprecate_shuffle_kwarg + @_dask_cudf_performance_tracking + def merge(self, other, shuffle_method=None, **kwargs): + on = kwargs.pop("on", None) + if isinstance(on, tuple): + on = list(on) + return super().merge( + other, + on=on, + shuffle_method=_get_shuffle_method(shuffle_method), + **kwargs, + ) + + @_deprecate_shuffle_kwarg + @_dask_cudf_performance_tracking + def join(self, other, shuffle_method=None, **kwargs): + # CuDF doesn't support "right" join yet + how = kwargs.pop("how", "left") + if how == "right": + return other.join(other=self, how="left", **kwargs) + + on = kwargs.pop("on", None) + if isinstance(on, tuple): + on = list(on) + return super().join( + other, + how=how, + on=on, + shuffle_method=_get_shuffle_method(shuffle_method), + **kwargs, + ) + + @_deprecate_shuffle_kwarg + @_dask_cudf_performance_tracking + def set_index( + self, + other, + sorted=False, + divisions=None, + shuffle_method=None, + **kwargs, + ): + pre_sorted = sorted + del sorted + + if divisions == "quantile": + warnings.warn( + "Using divisions='quantile' is now deprecated. 
" + "Please raise an issue on github if you believe " + "this feature is necessary.", + FutureWarning, + ) + + if ( + divisions == "quantile" + or isinstance(divisions, (cudf.DataFrame, cudf.Series)) + or ( + isinstance(other, str) + and cudf.api.types.is_string_dtype(self[other].dtype) + ) + ): + # Let upstream-dask handle "pre-sorted" case + if pre_sorted: + return dd.shuffle.set_sorted_index( + self, other, divisions=divisions, **kwargs + ) + + by = other + if not isinstance(other, list): + by = [by] + if len(by) > 1: + raise ValueError("Dask does not support MultiIndex (yet).") + if divisions == "quantile": + divisions = None + + # Use dask_cudf's sort_values + df = self.sort_values( + by, + max_branch=kwargs.get("max_branch", None), + divisions=divisions, + set_divisions=True, + ignore_index=True, + shuffle_method=shuffle_method, + ) + + # Ignore divisions if its a dataframe + if isinstance(divisions, cudf.DataFrame): + divisions = None + + # Set index and repartition + df2 = df.map_partitions( + sorting.set_index_post, + index_name=other, + drop=kwargs.get("drop", True), + column_dtype=df.columns.dtype, + ) + npartitions = kwargs.get("npartitions", self.npartitions) + partition_size = kwargs.get("partition_size", None) + if partition_size: + return df2.repartition(partition_size=partition_size) + if not divisions and df2.npartitions != npartitions: + return df2.repartition(npartitions=npartitions) + if divisions and df2.npartitions != len(divisions) - 1: + return df2.repartition(divisions=divisions) + return df2 + + return super().set_index( + other, + sorted=pre_sorted, + shuffle_method=_get_shuffle_method(shuffle_method), + divisions=divisions, + **kwargs, + ) + + @_deprecate_shuffle_kwarg + @_dask_cudf_performance_tracking + def sort_values( + self, + by, + ignore_index=False, + max_branch=None, + divisions=None, + set_divisions=False, + ascending=True, + na_position="last", + sort_function=None, + sort_function_kwargs=None, + shuffle_method=None, + **kwargs, + ): + if kwargs: + raise ValueError( + f"Unsupported input arguments passed : {list(kwargs.keys())}" + ) + + df = sorting.sort_values( + self, + by, + max_branch=max_branch, + divisions=divisions, + set_divisions=set_divisions, + ignore_index=ignore_index, + ascending=ascending, + na_position=na_position, + shuffle_method=shuffle_method, + sort_function=sort_function, + sort_function_kwargs=sort_function_kwargs, + ) + + if ignore_index: + return df.reset_index(drop=True) + return df + + @_dask_cudf_performance_tracking + def to_parquet(self, path, *args, **kwargs): + """Calls dask.dataframe.io.to_parquet with CudfEngine backend""" + from dask_cudf._legacy.io import to_parquet + + return to_parquet(self, path, *args, **kwargs) + + @_dask_cudf_performance_tracking + def to_orc(self, path, **kwargs): + """Calls dask_cudf._legacy.io.to_orc""" + from dask_cudf._legacy.io import to_orc + + return to_orc(self, path, **kwargs) + + @derived_from(pd.DataFrame) + @_dask_cudf_performance_tracking + def var( + self, + axis=None, + skipna=True, + ddof=1, + split_every=False, + dtype=None, + out=None, + naive=False, + numeric_only=False, + ): + axis = self._validate_axis(axis) + meta = self._meta_nonempty.var( + axis=axis, skipna=skipna, numeric_only=numeric_only + ) + if axis == 1: + result = map_partitions( + M.var, + self, + meta=meta, + token=self._token_prefix + "var", + axis=axis, + skipna=skipna, + ddof=ddof, + numeric_only=numeric_only, + ) + return handle_out(out, result) + elif naive: + return _naive_var(self, meta, skipna, ddof, 
split_every, out) + else: + return _parallel_var(self, meta, skipna, split_every, out) + + @_deprecate_shuffle_kwarg + @_dask_cudf_performance_tracking + def shuffle(self, *args, shuffle_method=None, **kwargs): + """Wraps dask.dataframe DataFrame.shuffle method""" + return super().shuffle( + *args, shuffle_method=_get_shuffle_method(shuffle_method), **kwargs + ) + + @_dask_cudf_performance_tracking + def groupby(self, by=None, **kwargs): + from .groupby import CudfDataFrameGroupBy + + return CudfDataFrameGroupBy(self, by=by, **kwargs) + + +@_dask_cudf_performance_tracking +def sum_of_squares(x): + x = x.astype("f8")._column + outcol = libcudf.reduce.reduce("sum_of_squares", x) + return cudf.Series._from_column(outcol) + + +@_dask_cudf_performance_tracking +def var_aggregate(x2, x, n, ddof): + try: + with warnings.catch_warnings(record=True): + warnings.simplefilter("always") + result = (x2 / n) - (x / n) ** 2 + if ddof != 0: + result = result * n / (n - ddof) + return result + except ZeroDivisionError: + return np.float64(np.nan) + + +@_dask_cudf_performance_tracking +def nlargest_agg(x, **kwargs): + return cudf.concat(x).nlargest(**kwargs) + + +@_dask_cudf_performance_tracking +def nsmallest_agg(x, **kwargs): + return cudf.concat(x).nsmallest(**kwargs) + + +class Series(_Frame, dd.core.Series): + _partition_type = cudf.Series + + @_dask_cudf_performance_tracking + def count(self, split_every=False): + return reduction( + [self], + chunk=M.count, + aggregate=np.sum, + split_every=split_every, + meta="i8", + ) + + @_dask_cudf_performance_tracking + def mean(self, split_every=False): + sum = self.sum(split_every=split_every) + n = self.count(split_every=split_every) + return sum / n + + @derived_from(pd.DataFrame) + @_dask_cudf_performance_tracking + def var( + self, + axis=None, + skipna=True, + ddof=1, + split_every=False, + dtype=None, + out=None, + naive=False, + ): + axis = self._validate_axis(axis) + meta = self._meta_nonempty.var(axis=axis, skipna=skipna) + if axis == 1: + result = map_partitions( + M.var, + self, + meta=meta, + token=self._token_prefix + "var", + axis=axis, + skipna=skipna, + ddof=ddof, + ) + return handle_out(out, result) + elif naive: + return _naive_var(self, meta, skipna, ddof, split_every, out) + else: + return _parallel_var(self, meta, skipna, split_every, out) + + @_dask_cudf_performance_tracking + def groupby(self, *args, **kwargs): + from .groupby import CudfSeriesGroupBy + + return CudfSeriesGroupBy(self, *args, **kwargs) + + @property # type: ignore + @_dask_cudf_performance_tracking + def list(self): + return ListMethods(self) + + @property # type: ignore + @_dask_cudf_performance_tracking + def struct(self): + return StructMethods(self) + + +class Index(Series, dd.core.Index): + _partition_type = cudf.Index # type: ignore + + +@_dask_cudf_performance_tracking +def _naive_var(ddf, meta, skipna, ddof, split_every, out): + num = ddf._get_numeric_data() + x = 1.0 * num.sum(skipna=skipna, split_every=split_every) + x2 = 1.0 * (num**2).sum(skipna=skipna, split_every=split_every) + n = num.count(split_every=split_every) + name = ddf._token_prefix + "var" + result = map_partitions( + var_aggregate, x2, x, n, token=name, meta=meta, ddof=ddof + ) + if isinstance(ddf, DataFrame): + result.divisions = (min(ddf.columns), max(ddf.columns)) + return handle_out(out, result) + + +@_dask_cudf_performance_tracking +def _parallel_var(ddf, meta, skipna, split_every, out): + def _local_var(x, skipna): + if skipna: + n = x.count() + avg = x.mean(skipna=skipna) + else: + # Not 
skipping nulls, so might as well
+            # avoid the full `count` operation
+            n = len(x)
+            avg = x.sum(skipna=skipna) / n
+        m2 = ((x - avg) ** 2).sum(skipna=skipna)
+        return n, avg, m2
+
+    def _aggregate_var(parts):
+        # Merge (count, mean, M2) triples pairwise (Chan et al.'s parallel
+        # variance update): the combined M2 adds a correction term scaled
+        # by the squared difference of the two partial means.
+        n, avg, m2 = parts[0]
+        for i in range(1, len(parts)):
+            n_a, avg_a, m2_a = n, avg, m2
+            n_b, avg_b, m2_b = parts[i]
+            n = n_a + n_b
+            avg = (n_a * avg_a + n_b * avg_b) / n
+            delta = avg_b - avg_a
+            m2 = m2_a + m2_b + delta**2 * n_a * n_b / n
+        return n, avg, m2
+
+    def _finalize_var(vals):
+        n, _, m2 = vals
+        return m2 / (n - 1)
+
+    # Build graph
+    nparts = ddf.npartitions
+    if not split_every:
+        split_every = nparts
+    name = "var-" + tokenize(skipna, split_every, out)
+    local_name = "local-" + name
+    num = ddf._get_numeric_data()
+    dsk = {
+        (local_name, n, 0): (_local_var, (num._name, n), skipna)
+        for n in range(nparts)
+    }
+
+    # Use reduction tree
+    widths = [nparts]
+    while nparts > 1:
+        nparts = math.ceil(nparts / split_every)
+        widths.append(nparts)
+    height = len(widths)
+    for depth in range(1, height):
+        for group in range(widths[depth]):
+            p_max = widths[depth - 1]
+            lstart = split_every * group
+            lstop = min(lstart + split_every, p_max)
+            node_list = [
+                (local_name, p, depth - 1) for p in range(lstart, lstop)
+            ]
+            dsk[(local_name, group, depth)] = (_aggregate_var, node_list)
+    if height == 1:
+        group = depth = 0
+    dsk[(name, 0)] = (_finalize_var, (local_name, group, depth))
+
+    graph = HighLevelGraph.from_collections(name, dsk, dependencies=[num, ddf])
+    result = dd.core.new_dd_object(graph, name, meta, (None, None))
+    if isinstance(ddf, DataFrame):
+        result.divisions = (min(ddf.columns), max(ddf.columns))
+    return handle_out(out, result)
+
+
+@_dask_cudf_performance_tracking
+def _extract_meta(x):
+    """
+    Extract internal cache data (``_meta``) from dask_cudf objects
+    """
+    if isinstance(x, (Scalar, _Frame)):
+        return x._meta
+    elif isinstance(x, list):
+        return [_extract_meta(_x) for _x in x]
+    elif isinstance(x, tuple):
+        return tuple(_extract_meta(_x) for _x in x)
+    elif isinstance(x, dict):
+        return {k: _extract_meta(v) for k, v in x.items()}
+    return x
+
+
+@_dask_cudf_performance_tracking
+def _emulate(func, *args, **kwargs):
+    """
+    Apply a function using args / kwargs. If arguments contain dd.DataFrame /
+    dd.Series, the internal cache (``_meta``) is used for the calculation.
+    """
+    with raise_on_meta_error(funcname(func)):
+        return func(*_extract_meta(args), **_extract_meta(kwargs))
+
+
+@_dask_cudf_performance_tracking
+def align_partitions(args):
+    """Align partitions between dask_cudf objects.
+
+    Note that if all divisions are unknown, but have equal npartitions, then
+    they will be passed through unchanged.
+    """
+    dfs = [df for df in args if isinstance(df, _Frame)]
+    if not dfs:
+        return args
+
+    divisions = dfs[0].divisions
+    if not all(df.divisions == divisions for df in dfs):
+        raise NotImplementedError("Aligning mismatched partitions")
+    return args
+
+
+@_dask_cudf_performance_tracking
+def reduction(
+    args,
+    chunk=None,
+    aggregate=None,
+    combine=None,
+    meta=None,
+    token=None,
+    chunk_kwargs=None,
+    aggregate_kwargs=None,
+    combine_kwargs=None,
+    split_every=None,
+    **kwargs,
+):
+    """Generic tree reduction operation.
+
+    Parameters
+    ----------
+    args :
+        Positional arguments for the `chunk` function. All `dask.dataframe`
+        objects should be partitioned and indexed equivalently.
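+        For example, ``Series.count`` above calls
+        ``reduction([self], chunk=M.count, aggregate=np.sum, ...)``.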
+ chunk : function [block-per-arg] -> block + Function to operate on each block of data + aggregate : function list-of-blocks -> block + Function to operate on the list of results of chunk + combine : function list-of-blocks -> block, optional + Function to operate on intermediate lists of results of chunk + in a tree-reduction. If not provided, defaults to aggregate. + $META + token : str, optional + The name to use for the output keys. + chunk_kwargs : dict, optional + Keywords for the chunk function only. + aggregate_kwargs : dict, optional + Keywords for the aggregate function only. + combine_kwargs : dict, optional + Keywords for the combine function only. + split_every : int, optional + Group partitions into groups of this size while performing a + tree-reduction. If set to False, no tree-reduction will be used, + and all intermediates will be concatenated and passed to ``aggregate``. + Default is 8. + kwargs : + All remaining keywords will be passed to ``chunk``, ``aggregate``, and + ``combine``. + """ + if chunk_kwargs is None: + chunk_kwargs = dict() + if aggregate_kwargs is None: + aggregate_kwargs = dict() + chunk_kwargs.update(kwargs) + aggregate_kwargs.update(kwargs) + + if combine is None: + if combine_kwargs: + raise ValueError("`combine_kwargs` provided with no `combine`") + combine = aggregate + combine_kwargs = aggregate_kwargs + else: + if combine_kwargs is None: + combine_kwargs = dict() + combine_kwargs.update(kwargs) + + if not isinstance(args, (tuple, list)): + args = [args] + + npartitions = {arg.npartitions for arg in args if isinstance(arg, _Frame)} + if len(npartitions) > 1: + raise ValueError("All arguments must have same number of partitions") + npartitions = npartitions.pop() + + if split_every is None: + split_every = 8 + elif split_every is False: + split_every = npartitions + elif split_every < 2 or not isinstance(split_every, int): + raise ValueError("split_every must be an integer >= 2") + + token_key = tokenize( + token or (chunk, aggregate), + meta, + args, + chunk_kwargs, + aggregate_kwargs, + combine_kwargs, + split_every, + ) + + # Chunk + a = f"{token or funcname(chunk)}-chunk-{token_key}" + if len(args) == 1 and isinstance(args[0], _Frame) and not chunk_kwargs: + dsk = { + (a, 0, i): (chunk, key) + for i, key in enumerate(args[0].__dask_keys__()) + } + else: + dsk = { + (a, 0, i): ( + apply, + chunk, + [(x._name, i) if isinstance(x, _Frame) else x for x in args], + chunk_kwargs, + ) + for i in range(args[0].npartitions) + } + + # Combine + b = f"{token or funcname(combine)}-combine-{token_key}" + k = npartitions + depth = 0 + while k > split_every: + for part_i, inds in enumerate(partition_all(split_every, range(k))): + conc = (list, [(a, depth, i) for i in inds]) + dsk[(b, depth + 1, part_i)] = ( + (apply, combine, [conc], combine_kwargs) + if combine_kwargs + else (combine, conc) + ) + k = part_i + 1 + a = b + depth += 1 + + # Aggregate + b = f"{token or funcname(aggregate)}-agg-{token_key}" + conc = (list, [(a, depth, i) for i in range(k)]) + if aggregate_kwargs: + dsk[(b, 0)] = (apply, aggregate, [conc], aggregate_kwargs) + else: + dsk[(b, 0)] = (aggregate, conc) + + if meta is None: + meta_chunk = _emulate(apply, chunk, args, chunk_kwargs) + meta = _emulate(apply, aggregate, [[meta_chunk]], aggregate_kwargs) + meta = dask_make_meta(meta) + + graph = HighLevelGraph.from_collections(b, dsk, dependencies=args) + return dd.core.new_dd_object(graph, b, meta, (None, None)) + + +for name in ( + "add", + "sub", + "mul", + "truediv", + "floordiv", + 
"mod", + "pow", + "radd", + "rsub", + "rmul", + "rtruediv", + "rfloordiv", + "rmod", + "rpow", +): + meth = getattr(cudf.DataFrame, name) + DataFrame._bind_operator_method(name, meth, original=cudf.Series) + + meth = getattr(cudf.Series, name) + Series._bind_operator_method(name, meth, original=cudf.Series) + +for name in ("lt", "gt", "le", "ge", "ne", "eq"): + meth = getattr(cudf.Series, name) + Series._bind_comparison_method(name, meth, original=cudf.Series) diff --git a/python/dask_cudf/dask_cudf/groupby.py b/python/dask_cudf/dask_cudf/_legacy/groupby.py similarity index 99% rename from python/dask_cudf/dask_cudf/groupby.py rename to python/dask_cudf/dask_cudf/_legacy/groupby.py index bbbcde17b51..7e01e91476d 100644 --- a/python/dask_cudf/dask_cudf/groupby.py +++ b/python/dask_cudf/dask_cudf/_legacy/groupby.py @@ -18,7 +18,7 @@ from cudf.core.groupby.groupby import _deprecate_collect from cudf.utils.performance_tracking import _dask_cudf_performance_tracking -from dask_cudf.sorting import _deprecate_shuffle_kwarg +from dask_cudf._legacy.sorting import _deprecate_shuffle_kwarg # aggregations that are dask-cudf optimized OPTIMIZED_AGGS = ( diff --git a/python/dask_cudf/dask_cudf/_legacy/io/__init__.py b/python/dask_cudf/dask_cudf/_legacy/io/__init__.py new file mode 100644 index 00000000000..0421bd755f4 --- /dev/null +++ b/python/dask_cudf/dask_cudf/_legacy/io/__init__.py @@ -0,0 +1,11 @@ +# Copyright (c) 2018-2024, NVIDIA CORPORATION. + +from .csv import read_csv # noqa: F401 +from .json import read_json # noqa: F401 +from .orc import read_orc, to_orc # noqa: F401 +from .text import read_text # noqa: F401 + +try: + from .parquet import read_parquet, to_parquet # noqa: F401 +except ImportError: + pass diff --git a/python/dask_cudf/dask_cudf/_legacy/io/csv.py b/python/dask_cudf/dask_cudf/_legacy/io/csv.py new file mode 100644 index 00000000000..fa5400344f9 --- /dev/null +++ b/python/dask_cudf/dask_cudf/_legacy/io/csv.py @@ -0,0 +1,222 @@ +# Copyright (c) 2020-2023, NVIDIA CORPORATION. + +import os +from glob import glob +from warnings import warn + +from fsspec.utils import infer_compression + +from dask import dataframe as dd +from dask.base import tokenize +from dask.dataframe.io.csv import make_reader +from dask.utils import apply, parse_bytes + +import cudf + + +def read_csv(path, blocksize="default", **kwargs): + """ + Read CSV files into a :class:`.DataFrame`. + + This API parallelizes the :func:`cudf:cudf.read_csv` function in + the following ways: + + It supports loading many files at once using globstrings: + + >>> import dask_cudf + >>> df = dask_cudf.read_csv("myfiles.*.csv") + + In some cases it can break up large files: + + >>> df = dask_cudf.read_csv("largefile.csv", blocksize="256 MiB") + + It can read CSV files from external resources (e.g. S3, HTTP, FTP) + + >>> df = dask_cudf.read_csv("s3://bucket/myfiles.*.csv") + >>> df = dask_cudf.read_csv("https://www.mycloud.com/sample.csv") + + Internally ``read_csv`` uses :func:`cudf:cudf.read_csv` and + supports many of the same keyword arguments with the same + performance guarantees. See the docstring for + :func:`cudf:cudf.read_csv` for more information on available + keyword arguments. + + Parameters + ---------- + path : str, path object, or file-like object + Either a path to a file (a str, :py:class:`pathlib.Path`, or + py._path.local.LocalPath), URL (including http, ftp, and S3 + locations), or any object with a read() method (such as + builtin :py:func:`open` file handler function or + :py:class:`~io.StringIO`). 
+ blocksize : int or str, default "256 MiB" + The target task partition size. If ``None``, a single block + is used for each file. + **kwargs : dict + Passthrough key-word arguments that are sent to + :func:`cudf:cudf.read_csv`. + + Notes + ----- + If any of `skipfooter`/`skiprows`/`nrows` are passed, + `blocksize` will default to None. + + Examples + -------- + >>> import dask_cudf + >>> ddf = dask_cudf.read_csv("sample.csv", usecols=["a", "b"]) + >>> ddf.compute() + a b + 0 1 hi + 1 2 hello + 2 3 ai + + """ + + # Handle `chunksize` deprecation + if "chunksize" in kwargs: + chunksize = kwargs.pop("chunksize", "default") + warn( + "`chunksize` is deprecated and will be removed in the future. " + "Please use `blocksize` instead.", + FutureWarning, + ) + if blocksize == "default": + blocksize = chunksize + + # Set default `blocksize` + if blocksize == "default": + if ( + kwargs.get("skipfooter", 0) != 0 + or kwargs.get("skiprows", 0) != 0 + or kwargs.get("nrows", None) is not None + ): + # Cannot read in blocks if skipfooter, + # skiprows or nrows is passed. + blocksize = None + else: + blocksize = "256 MiB" + + if "://" in str(path): + func = make_reader(cudf.read_csv, "read_csv", "CSV") + return func(path, blocksize=blocksize, **kwargs) + else: + return _internal_read_csv(path=path, blocksize=blocksize, **kwargs) + + +def _internal_read_csv(path, blocksize="256 MiB", **kwargs): + if isinstance(blocksize, str): + blocksize = parse_bytes(blocksize) + + if isinstance(path, list): + filenames = path + elif isinstance(path, str): + filenames = sorted(glob(path)) + elif hasattr(path, "__fspath__"): + filenames = sorted(glob(path.__fspath__())) + else: + raise TypeError(f"Path type not understood:{type(path)}") + + if not filenames: + msg = f"A file in: {filenames} does not exist." 
+        raise FileNotFoundError(msg)
+
+    name = "read-csv-" + tokenize(
+        path, tokenize, **kwargs
+    )  # TODO: get last modified time
+
+    compression = kwargs.get("compression", "infer")
+
+    if compression == "infer":
+        # Infer compression from first path by default
+        compression = infer_compression(filenames[0])
+
+    if compression and blocksize:
+        # compressed CSVs reading must read the entire file
+        kwargs.pop("byte_range", None)
+        warn(
+            "Warning %s compression does not support breaking apart files\n"
+            "Please ensure that each individual file can fit in memory and\n"
+            "use the keyword ``blocksize=None`` to remove this message\n"
+            "Setting ``blocksize=(size of file)``" % compression
+        )
+        blocksize = None
+
+    if blocksize is None:
+        return read_csv_without_blocksize(path, **kwargs)
+
+    # Let dask.dataframe generate meta
+    dask_reader = make_reader(cudf.read_csv, "read_csv", "CSV")
+    kwargs1 = kwargs.copy()
+    usecols = kwargs1.pop("usecols", None)
+    dtype = kwargs1.pop("dtype", None)
+    meta = dask_reader(filenames[0], **kwargs1)._meta
+    names = meta.columns
+    if usecols or dtype:
+        # Regenerate meta with original kwargs if
+        # `usecols` or `dtype` was specified
+        meta = dask_reader(filenames[0], **kwargs)._meta
+
+    dsk = {}
+    i = 0
+    dtypes = meta.dtypes.values
+
+    for fn in filenames:
+        size = os.path.getsize(fn)
+        for start in range(0, size, blocksize):
+            kwargs2 = kwargs.copy()
+            kwargs2["byte_range"] = (
+                start,
+                blocksize,
+            )  # specify which chunk of the file we care about
+            if start != 0:
+                kwargs2["names"] = names  # no header in the middle of the file
+                kwargs2["header"] = None
+            dsk[(name, i)] = (apply, _read_csv, [fn, dtypes], kwargs2)
+
+            i += 1
+
+    divisions = [None] * (len(dsk) + 1)
+    return dd.core.new_dd_object(dsk, name, meta, divisions)
+
+
+def _read_csv(fn, dtypes=None, **kwargs):
+    return cudf.read_csv(fn, **kwargs)
+
+
+def read_csv_without_blocksize(path, **kwargs):
+    """Read entire CSV with optional compression (gzip/zip)
+
+    Parameters
+    ----------
+    path : str
+        path to files (support for glob)
+    """
+    if isinstance(path, list):
+        filenames = path
+    elif isinstance(path, str):
+        filenames = sorted(glob(path))
+    elif hasattr(path, "__fspath__"):
+        filenames = sorted(glob(path.__fspath__()))
+    else:
+        raise TypeError(f"Path type not understood:{type(path)}")
+
+    name = "read-csv-" + tokenize(path, **kwargs)
+
+    meta_kwargs = kwargs.copy()
+    if "skipfooter" in meta_kwargs:
+        meta_kwargs.pop("skipfooter")
+    if "nrows" in meta_kwargs:
+        meta_kwargs.pop("nrows")
+    # Read "head" of first file (first 5 rows).
+    # Convert to empty df for metadata.
+    meta = cudf.read_csv(filenames[0], nrows=5, **meta_kwargs).iloc[:0]
+
+    graph = {
+        (name, i): (apply, cudf.read_csv, [fn], kwargs)
+        for i, fn in enumerate(filenames)
+    }
+
+    divisions = [None] * (len(filenames) + 1)
+
+    return dd.core.new_dd_object(graph, name, meta, divisions)
diff --git a/python/dask_cudf/dask_cudf/_legacy/io/json.py b/python/dask_cudf/dask_cudf/_legacy/io/json.py
new file mode 100644
index 00000000000..98c5ceedb76
--- /dev/null
+++ b/python/dask_cudf/dask_cudf/_legacy/io/json.py
@@ -0,0 +1,209 @@
+# Copyright (c) 2019-2024, NVIDIA CORPORATION.
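+#
+# Illustrative usage of the reader defined below (paths are hypothetical):
+#
+#     import dask_cudf
+#     df = dask_cudf.read_json("data/records-*.json", lines=True)
+#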
+
+from functools import partial
+
+import numpy as np
+from fsspec.core import get_compression, get_fs_token_paths
+
+import dask
+from dask.utils import parse_bytes
+
+import cudf
+from cudf.core.column import as_column
+from cudf.utils.ioutils import _is_local_filesystem
+
+from dask_cudf.backends import _default_backend
+
+
+def _read_json_partition(
+    paths,
+    fs=None,
+    include_path_column=False,
+    path_converter=None,
+    **kwargs,
+):
+    # Transfer all data up front for remote storage
+    sources = (
+        paths
+        if fs is None
+        else fs.cat_ranges(
+            paths,
+            [0] * len(paths),
+            fs.sizes(paths),
+        )
+    )
+
+    if include_path_column:
+        # Add "path" column.
+        # Must iterate over sources sequentially
+        if not isinstance(include_path_column, str):
+            include_path_column = "path"
+        converted_paths = (
+            paths
+            if path_converter is None
+            else [path_converter(path) for path in paths]
+        )
+        dfs = []
+        for i, source in enumerate(sources):
+            df = cudf.read_json(source, **kwargs)
+            df[include_path_column] = as_column(
+                converted_paths[i], length=len(df)
+            )
+            dfs.append(df)
+        return cudf.concat(dfs)
+    else:
+        # Pass sources directly to cudf
+        return cudf.read_json(sources, **kwargs)
+
+
+def read_json(
+    url_path,
+    engine="auto",
+    blocksize=None,
+    orient="records",
+    lines=None,
+    compression="infer",
+    aggregate_files=True,
+    **kwargs,
+):
+    """Read JSON data into a :class:`.DataFrame`.
+
+    This function wraps :func:`dask.dataframe.read_json`, and passes
+    ``engine=partial(cudf.read_json, engine="auto")`` by default.
+
+    Parameters
+    ----------
+    url_path : str, list of str
+        Location to read from. If a string, can include a glob character to
+        find a set of file names.
+        Supports protocol specifications such as ``"s3://"``.
+    engine : str or Callable, default "auto"
+
+        If str, this value will be used as the ``engine`` argument
+        when :func:`cudf.read_json` is used to create each partition.
+        If a :obj:`~collections.abc.Callable`, this value will be used as the
+        underlying function used to create each partition from JSON
+        data. The default value is "auto", so that
+        ``engine=partial(cudf.read_json, engine="auto")`` will be
+        passed to :func:`dask.dataframe.read_json` by default.
+    aggregate_files : bool or int
+        Whether to map multiple files to each output partition. If True,
+        the `blocksize` argument will be used to determine the number of
+        files in each partition. If any one file is larger than `blocksize`,
+        the `aggregate_files` argument will be ignored. If an integer value
+        is specified, the `blocksize` argument will be ignored, and that
+        number of files will be mapped to each partition. Default is True.
+    **kwargs :
+        Keyword arguments to pass through to :func:`dask.dataframe.read_json`.
+
+    Returns
+    -------
+    :class:`.DataFrame`
+
+    Examples
+    --------
+    Load single file
+
+    >>> from dask_cudf import read_json
+    >>> read_json('myfile.json')  # doctest: +SKIP
+
+    Load large line-delimited JSON files using partitions of approx
+    256MB size
+
+    >>> read_json('data/file*.json', blocksize=2**28)  # doctest: +SKIP
+
+    Load nested JSON data
+
+    >>> read_json('myfile.json')  # doctest: +SKIP
+
+    See Also
+    --------
+    dask.dataframe.read_json
+
+    """
+
+    if lines is None:
+        lines = orient == "records"
+    if orient != "records" and lines:
+        raise ValueError(
+            'Line-delimited JSON is only available with orient="records".'
+        )
+    if blocksize and (orient != "records" or not lines):
+        raise ValueError(
+            "JSON file chunking only allowed for JSON-lines "
+            "input (orient='records', lines=True)."
+        )
+
+    inputs = []
+    if (aggregate_files and blocksize) or int(aggregate_files) > 1:
+        # Attempt custom read if we are mapping multiple files
+        # to each output partition. Otherwise, upstream logic
+        # is sufficient.
+
+        storage_options = kwargs.get("storage_options", {})
+        fs, _, paths = get_fs_token_paths(
+            url_path, mode="rb", storage_options=storage_options
+        )
+        if isinstance(aggregate_files, int) and aggregate_files > 1:
+            # Map a static file count to each partition
+            inputs = [
+                paths[offset : offset + aggregate_files]
+                for offset in range(0, len(paths), aggregate_files)
+            ]
+        elif aggregate_files is True and blocksize:
+            # Map files dynamically (using blocksize)
+            file_sizes = fs.sizes(paths)  # NOTE: This can be slow
+            blocksize = parse_bytes(blocksize)
+            if all([file_size <= blocksize for file_size in file_sizes]):
+                counts = np.unique(
+                    np.floor(np.cumsum(file_sizes) / blocksize),
+                    return_counts=True,
+                )[1]
+                offsets = np.concatenate([[0], counts.cumsum()])
+                inputs = [
+                    paths[offsets[i] : offsets[i + 1]]
+                    for i in range(len(offsets) - 1)
+                ]
+
+    if inputs:
+        # Inputs were successfully populated.
+        # Use custom _read_json_partition function
+        # to generate each partition.
+
+        compression = get_compression(
+            url_path[0] if isinstance(url_path, list) else url_path,
+            compression,
+        )
+        _kwargs = dict(
+            orient=orient,
+            lines=lines,
+            compression=compression,
+            include_path_column=kwargs.get("include_path_column", False),
+            path_converter=kwargs.get("path_converter"),
+        )
+        if not _is_local_filesystem(fs):
+            _kwargs["fs"] = fs
+        # TODO: Generate meta more efficiently
+        meta = _read_json_partition(inputs[0][:1], **_kwargs)
+        return dask.dataframe.from_map(
+            _read_json_partition,
+            inputs,
+            meta=meta,
+            **_kwargs,
+        )
+
+    # Fall back to dask.dataframe.read_json
+    return _default_backend(
+        dask.dataframe.read_json,
+        url_path,
+        engine=(
+            partial(cudf.read_json, engine=engine)
+            if isinstance(engine, str)
+            else engine
+        ),
+        blocksize=blocksize,
+        orient=orient,
+        lines=lines,
+        compression=compression,
+        **kwargs,
+    )
diff --git a/python/dask_cudf/dask_cudf/_legacy/io/orc.py b/python/dask_cudf/dask_cudf/_legacy/io/orc.py
new file mode 100644
index 00000000000..bed69f038b0
--- /dev/null
+++ b/python/dask_cudf/dask_cudf/_legacy/io/orc.py
@@ -0,0 +1,199 @@
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
+
+from io import BufferedWriter, IOBase
+
+from fsspec.core import get_fs_token_paths
+from fsspec.utils import stringify_path
+from pyarrow import orc as orc
+
+from dask import dataframe as dd
+from dask.base import tokenize
+from dask.dataframe.io.utils import _get_pyarrow_dtypes
+
+import cudf
+
+
+def _read_orc_stripe(fs, path, stripe, columns, kwargs=None):
+    """Pull out specific columns from specific stripe"""
+    if kwargs is None:
+        kwargs = {}
+    with fs.open(path, "rb") as f:
+        df_stripe = cudf.read_orc(
+            f, stripes=[stripe], columns=columns, **kwargs
+        )
+    return df_stripe
+
+
+def read_orc(path, columns=None, filters=None, storage_options=None, **kwargs):
+    """Read ORC files into a :class:`.DataFrame`.
+
+    Note that this function is mostly borrowed from upstream Dask.
+
+    Parameters
+    ----------
+    path : str or list[str]
+        Location of file(s), which can be a full URL with protocol specifier,
+        and may include glob character if a single string.
+    columns : None or list[str]
+        Columns to load. If None, loads all.
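+        Requested names must be present in the file schema.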
+    filters : None or list of tuple or list of lists of tuples
+        If not None, specifies a filter predicate used to filter out
+        stripes using statistics stored for each stripe in the ORC
+        metadata. Stripes that do not match the given filter predicate
+        are not read. The predicate is expressed in
+        `disjunctive normal form (DNF)
+        <https://en.wikipedia.org/wiki/Disjunctive_normal_form>`__
+        like ``[[('x', '=', 0), ...], ...]``. DNF allows arbitrary
+        boolean logical combinations of single column predicates. The
+        innermost tuples each describe a single column predicate. The
+        list of inner predicates is interpreted as a conjunction
+        (AND), forming a more selective and multiple column predicate.
+        Finally, the outermost list combines these filters as a
+        disjunction (OR). Predicates may also be passed as a list of
+        tuples. This form is interpreted as a single conjunction. To
+        express OR in predicates, one must use the (preferred)
+        notation of list of lists of tuples.
+    storage_options : None or dict
+        Further parameters to pass to the bytes backend.
+
+    See Also
+    --------
+    dask.dataframe.read_orc
+
+    Returns
+    -------
+    dask_cudf.DataFrame
+
+    """
+
+    storage_options = storage_options or {}
+    fs, fs_token, paths = get_fs_token_paths(
+        path, mode="rb", storage_options=storage_options
+    )
+    schema = None
+    nstripes_per_file = []
+    for path in paths:
+        with fs.open(path, "rb") as f:
+            o = orc.ORCFile(f)
+            if schema is None:
+                schema = o.schema
+            elif schema != o.schema:
+                raise ValueError(
+                    "Incompatible schemas while parsing ORC files"
+                )
+            nstripes_per_file.append(o.nstripes)
+    schema = _get_pyarrow_dtypes(schema, categories=None)
+    if columns is not None:
+        ex = set(columns) - set(schema)
+        if ex:
+            raise ValueError(
+                f"Requested columns ({ex}) not in schema ({set(schema)})"
+            )
+    else:
+        columns = list(schema)
+
+    with fs.open(paths[0], "rb") as f:
+        meta = cudf.read_orc(
+            f,
+            stripes=[0] if nstripes_per_file[0] else None,
+            columns=columns,
+            **kwargs,
+        )
+
+    name = "read-orc-" + tokenize(fs_token, path, columns, filters, **kwargs)
+    dsk = {}
+    N = 0
+    for path, n in zip(paths, nstripes_per_file):
+        for stripe in (
+            range(n)
+            if filters is None
+            else cudf.io.orc._filter_stripes(filters, path)
+        ):
+            dsk[(name, N)] = (
+                _read_orc_stripe,
+                fs,
+                path,
+                stripe,
+                columns,
+                kwargs,
+            )
+            N += 1
+
+    divisions = [None] * (len(dsk) + 1)
+    return dd.core.new_dd_object(dsk, name, meta, divisions)
+
+
+def write_orc_partition(df, path, fs, filename, compression="snappy"):
+    full_path = fs.sep.join([path, filename])
+    with fs.open(full_path, mode="wb") as out_file:
+        if not isinstance(out_file, IOBase):
+            out_file = BufferedWriter(out_file)
+        cudf.io.to_orc(df, out_file, compression=compression)
+    return full_path
+
+
+def to_orc(
+    df,
+    path,
+    write_index=True,
+    storage_options=None,
+    compression="snappy",
+    compute=True,
+    **kwargs,
+):
+    """
+    Write a :class:`.DataFrame` to ORC file(s) (one file per partition).
+
+    Parameters
+    ----------
+    df : DataFrame
+    path : str or pathlib.Path
+        Destination directory for data. Prepend with protocol like ``s3://``
+        or ``hdfs://`` for remote data.
+    write_index : boolean, optional
+        Whether or not to write the index. Defaults to True.
+    storage_options : None or dict
+        Further parameters to pass to the bytes backend.
+    compression : string or dict, optional
+    compute : bool, optional
+        If True (default) then the result is computed immediately. If
+        False then a :class:`~dask.delayed.Delayed` object is returned
+        for future computation.
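+
+    Examples
+    --------
+    A minimal sketch (the paths are illustrative):
+
+    >>> import dask_cudf  # doctest: +SKIP
+    >>> df = dask_cudf.read_csv("data/*.csv")  # doctest: +SKIP
+    >>> df.to_orc("output_dir", compression="snappy")  # doctest: +SKIP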
+
+    """
+
+    from dask import compute as dask_compute, delayed
+
+    # TODO: Use upstream dask implementation once available
+    # (see: Dask Issue#5596)
+
+    if hasattr(path, "name"):
+        path = stringify_path(path)
+    fs, _, _ = get_fs_token_paths(
+        path, mode="wb", storage_options=storage_options
+    )
+    # Trim any protocol information from the path before forwarding
+    path = fs._strip_protocol(path)
+
+    if write_index:
+        df = df.reset_index()
+    else:
+        # Not writing index - might as well drop it
+        df = df.reset_index(drop=True)
+
+    fs.mkdirs(path, exist_ok=True)
+
+    # Use df.npartitions to define file-name list
+    filenames = ["part.%i.orc" % i for i in range(df.npartitions)]
+
+    # write parts
+    dwrite = delayed(write_orc_partition)
+    parts = [
+        dwrite(d, path, fs, filename, compression=compression)
+        for d, filename in zip(df.to_delayed(), filenames)
+    ]
+
+    if compute:
+        return dask_compute(*parts)
+
+    return delayed(list)(parts)
diff --git a/python/dask_cudf/dask_cudf/_legacy/io/parquet.py b/python/dask_cudf/dask_cudf/_legacy/io/parquet.py
new file mode 100644
index 00000000000..c0638e4a1c3
--- /dev/null
+++ b/python/dask_cudf/dask_cudf/_legacy/io/parquet.py
@@ -0,0 +1,514 @@
+# Copyright (c) 2019-2024, NVIDIA CORPORATION.
+import itertools
+import warnings
+from functools import partial
+from io import BufferedWriter, BytesIO, IOBase
+
+import numpy as np
+import pandas as pd
+from pyarrow import dataset as pa_ds, parquet as pq
+
+from dask import dataframe as dd
+from dask.dataframe.io.parquet.arrow import ArrowDatasetEngine
+
+try:
+    from dask.dataframe.io.parquet import (
+        create_metadata_file as create_metadata_file_dd,
+    )
+except ImportError:
+    create_metadata_file_dd = None
+
+import cudf
+from cudf.core.column import CategoricalColumn, as_column
+from cudf.io import write_to_dataset
+from cudf.io.parquet import _apply_post_filters, _normalize_filters
+from cudf.utils.dtypes import cudf_dtype_from_pa_type
+
+
+class CudfEngine(ArrowDatasetEngine):
+    @classmethod
+    def _create_dd_meta(cls, dataset_info, **kwargs):
+        # Start with pandas-version of meta
+        meta_pd = super()._create_dd_meta(dataset_info, **kwargs)
+
+        # Convert to cudf
+        # (drop unsupported timezone information)
+        for k, v in meta_pd.dtypes.items():
+            if isinstance(v, pd.DatetimeTZDtype) and v.tz is not None:
+                meta_pd[k] = meta_pd[k].dt.tz_localize(None)
+        meta_cudf = cudf.from_pandas(meta_pd)
+
+        # Re-set "object" dtypes to align with pa schema
+        kwargs = dataset_info.get("kwargs", {})
+        set_object_dtypes_from_pa_schema(
+            meta_cudf,
+            kwargs.get("schema", None),
+        )
+
+        return meta_cudf
+
+    @classmethod
+    def multi_support(cls):
+        # Assert that this class is CudfEngine
+        # and that multi-part reading is supported
+        return cls == CudfEngine
+
+    @classmethod
+    def _read_paths(
+        cls,
+        paths,
+        fs,
+        columns=None,
+        row_groups=None,
+        filters=None,
+        partitions=None,
+        partitioning=None,
+        partition_keys=None,
+        open_file_options=None,
+        dataset_kwargs=None,
+        **kwargs,
+    ):
+        # Simplify row_groups if all None
+        if row_groups == [None for path in paths]:
+            row_groups = None
+
+        # Make sure we read in the columns needed for row-wise
+        # filtering after IO. This means that one or more columns
+        # will be dropped almost immediately after IO. However,
+        # we do NEED these columns for accurate filtering.
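+        # (For example, with filters=[("x", ">", 0)] and columns=["y"],
+        # column "x" must still be read here; names are illustrative.)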
+ filters = _normalize_filters(filters) + projected_columns = None + if columns and filters: + projected_columns = [c for c in columns if c is not None] + columns = sorted( + set(v[0] for v in itertools.chain.from_iterable(filters)) + | set(projected_columns) + ) + + dataset_kwargs = dataset_kwargs or {} + if partitions: + dataset_kwargs["partitioning"] = partitioning or "hive" + + # Use cudf to read in data + try: + df = cudf.read_parquet( + paths, + engine="cudf", + columns=columns, + row_groups=row_groups if row_groups else None, + dataset_kwargs=dataset_kwargs, + categorical_partitions=False, + filesystem=fs, + **kwargs, + ) + except RuntimeError as err: + # TODO: Remove try/except after null-schema issue is resolved + # (See: https://github.com/rapidsai/cudf/issues/12702) + if len(paths) > 1: + df = cudf.concat( + [ + cudf.read_parquet( + path, + engine="cudf", + columns=columns, + row_groups=row_groups[i] if row_groups else None, + dataset_kwargs=dataset_kwargs, + categorical_partitions=False, + filesystem=fs, + **kwargs, + ) + for i, path in enumerate(paths) + ] + ) + else: + raise err + + # Apply filters (if any are defined) + df = _apply_post_filters(df, filters) + + if projected_columns: + # Elements of `projected_columns` may now be in the index. + # We must filter these names from our projection + projected_columns = [ + col for col in projected_columns if col in df._column_names + ] + df = df[projected_columns] + + if partitions and partition_keys is None: + # Use `HivePartitioning` by default + ds = pa_ds.dataset( + paths, + filesystem=fs, + **dataset_kwargs, + ) + frag = next(ds.get_fragments()) + if frag: + # Extract hive-partition keys, and make sure they + # are ordered the same as they are in `partitions` + raw_keys = pa_ds._get_partition_keys(frag.partition_expression) + partition_keys = [ + (hive_part.name, raw_keys[hive_part.name]) + for hive_part in partitions + ] + + if partition_keys: + if partitions is None: + raise ValueError("Must pass partition sets") + + for i, (name, index2) in enumerate(partition_keys): + if len(partitions[i].keys): + # Build a categorical column from `codes` directly + # (since the category is often a larger dtype) + codes = as_column( + partitions[i].keys.get_loc(index2), + length=len(df), + ) + df[name] = CategoricalColumn( + data=None, + size=codes.size, + dtype=cudf.CategoricalDtype( + categories=partitions[i].keys, ordered=False + ), + offset=codes.offset, + children=(codes,), + ) + elif name not in df.columns: + # Add non-categorical partition column + df[name] = as_column(index2, length=len(df)) + + return df + + @classmethod + def read_partition( + cls, + fs, + pieces, + columns, + index, + categories=(), + partitions=(), + filters=None, + partitioning=None, + schema=None, + open_file_options=None, + **kwargs, + ): + if columns is not None: + columns = [c for c in columns] + if isinstance(index, list): + columns += index + + dataset_kwargs = kwargs.get("dataset", {}) + partitioning = partitioning or dataset_kwargs.get("partitioning", None) + if isinstance(partitioning, dict): + partitioning = pa_ds.partitioning(**partitioning) + + # Check if we are actually selecting any columns + read_columns = columns + if schema and columns: + ignored = set(schema.names) - set(columns) + if not ignored: + read_columns = None + + if not isinstance(pieces, list): + pieces = [pieces] + + # Extract supported kwargs from `kwargs` + read_kwargs = kwargs.get("read", {}) + read_kwargs.update(open_file_options or {}) + check_file_size = 
read_kwargs.pop("check_file_size", None)
+
+        # Wrap reading logic in a `try` block so that we can
+        # inform the user that the `read_parquet` partition
+        # size is too large for the available memory
+        try:
+            # Assume multi-piece read
+            paths = []
+            rgs = []
+            last_partition_keys = None
+            dfs = []
+
+            for i, piece in enumerate(pieces):
+                (path, row_group, partition_keys) = piece
+                row_group = None if row_group == [None] else row_group
+
+                # File-size check to help "protect" users from change
+                # to up-stream `split_row_groups` default. We only
+                # check the file size if this partition corresponds
+                # to a full file, and `check_file_size` is defined
+                if check_file_size and len(pieces) == 1 and row_group is None:
+                    file_size = fs.size(path)
+                    if file_size > check_file_size:
+                        warnings.warn(
+                            f"A large parquet file ({file_size}B) is being "
+                            f"used to create a DataFrame partition in "
+                            f"read_parquet. This may cause out of memory "
+                            f"exceptions in operations downstream. See the "
+                            f"notes on split_row_groups in the read_parquet "
+                            f"documentation. Setting split_row_groups "
+                            f"explicitly will silence this warning."
+                        )
+
+                if i > 0 and partition_keys != last_partition_keys:
+                    dfs.append(
+                        cls._read_paths(
+                            paths,
+                            fs,
+                            columns=read_columns,
+                            row_groups=rgs if rgs else None,
+                            filters=filters,
+                            partitions=partitions,
+                            partitioning=partitioning,
+                            partition_keys=last_partition_keys,
+                            dataset_kwargs=dataset_kwargs,
+                            **read_kwargs,
+                        )
+                    )
+                    paths = []
+                    rgs = []
+                    last_partition_keys = None
+                paths.append(path)
+                rgs.append(
+                    [row_group]
+                    if not isinstance(row_group, list)
+                    and row_group is not None
+                    else row_group
+                )
+                last_partition_keys = partition_keys
+
+            dfs.append(
+                cls._read_paths(
+                    paths,
+                    fs,
+                    columns=read_columns,
+                    row_groups=rgs if rgs else None,
+                    filters=filters,
+                    partitions=partitions,
+                    partitioning=partitioning,
+                    partition_keys=last_partition_keys,
+                    dataset_kwargs=dataset_kwargs,
+                    **read_kwargs,
+                )
+            )
+            df = cudf.concat(dfs) if len(dfs) > 1 else dfs[0]
+
+            # Re-set "object" dtypes to align with pa schema
+            set_object_dtypes_from_pa_schema(df, schema)
+
+            if index and (index[0] in df.columns):
+                df = df.set_index(index[0])
+            elif index is False and df.index.names != [None]:
+                # If index=False, we shouldn't have a named index
+                df.reset_index(inplace=True)
+
+        except MemoryError as err:
+            raise MemoryError(
+                "Parquet data was larger than the available GPU memory!\n\n"
+                "See the notes on split_row_groups in the read_parquet "
+                "documentation.\n\n"
+                "Original Error: " + str(err)
+            )
+
+        return df
+
+    @staticmethod
+    def write_partition(
+        df,
+        path,
+        fs,
+        filename,
+        partition_on,
+        return_metadata,
+        fmd=None,
+        compression="snappy",
+        index_cols=None,
+        **kwargs,
+    ):
+        preserve_index = False
+        if len(index_cols) and set(index_cols).issubset(set(df.columns)):
+            df.set_index(index_cols, drop=True, inplace=True)
+            preserve_index = True
+        if partition_on:
+            md = write_to_dataset(
+                df=df,
+                root_path=path,
+                compression=compression,
+                filename=filename,
+                partition_cols=partition_on,
+                fs=fs,
+                preserve_index=preserve_index,
+                return_metadata=return_metadata,
+                statistics=kwargs.get("statistics", "ROWGROUP"),
+                int96_timestamps=kwargs.get("int96_timestamps", False),
+                row_group_size_bytes=kwargs.get("row_group_size_bytes", None),
+                row_group_size_rows=kwargs.get("row_group_size_rows", None),
+                max_page_size_bytes=kwargs.get("max_page_size_bytes", None),
+                max_page_size_rows=kwargs.get("max_page_size_rows", None),
storage_options=kwargs.get("storage_options", None), + ) + else: + with fs.open(fs.sep.join([path, filename]), mode="wb") as out_file: + if not isinstance(out_file, IOBase): + out_file = BufferedWriter(out_file) + md = df.to_parquet( + path=out_file, + engine=kwargs.get("engine", "cudf"), + index=kwargs.get("index", None), + partition_cols=kwargs.get("partition_cols", None), + partition_file_name=kwargs.get( + "partition_file_name", None + ), + partition_offsets=kwargs.get("partition_offsets", None), + statistics=kwargs.get("statistics", "ROWGROUP"), + int96_timestamps=kwargs.get("int96_timestamps", False), + row_group_size_bytes=kwargs.get( + "row_group_size_bytes", None + ), + row_group_size_rows=kwargs.get( + "row_group_size_rows", None + ), + storage_options=kwargs.get("storage_options", None), + metadata_file_path=filename if return_metadata else None, + ) + # Return the schema needed to write the metadata + if return_metadata: + return [{"meta": md}] + else: + return [] + + @staticmethod + def write_metadata(parts, fmd, fs, path, append=False, **kwargs): + if parts: + # Aggregate metadata and write to _metadata file + metadata_path = fs.sep.join([path, "_metadata"]) + _meta = [] + if append and fmd is not None: + # Convert to bytes: + if isinstance(fmd, pq.FileMetaData): + with BytesIO() as myio: + fmd.write_metadata_file(myio) + myio.seek(0) + fmd = np.frombuffer(myio.read(), dtype="uint8") + _meta = [fmd] + _meta.extend([parts[i][0]["meta"] for i in range(len(parts))]) + _meta = ( + cudf.io.merge_parquet_filemetadata(_meta) + if len(_meta) > 1 + else _meta[0] + ) + with fs.open(metadata_path, "wb") as fil: + fil.write(memoryview(_meta)) + + @classmethod + def collect_file_metadata(cls, path, fs, file_path): + with fs.open(path, "rb") as f: + meta = pq.ParquetFile(f).metadata + if file_path: + meta.set_file_path(file_path) + with BytesIO() as myio: + meta.write_metadata_file(myio) + myio.seek(0) + meta = np.frombuffer(myio.read(), dtype="uint8") + return meta + + @classmethod + def aggregate_metadata(cls, meta_list, fs, out_path): + meta = ( + cudf.io.merge_parquet_filemetadata(meta_list) + if len(meta_list) > 1 + else meta_list[0] + ) + if out_path: + metadata_path = fs.sep.join([out_path, "_metadata"]) + with fs.open(metadata_path, "wb") as fil: + fil.write(memoryview(meta)) + return None + else: + return meta + + +def set_object_dtypes_from_pa_schema(df, schema): + # Simple utility to modify cudf DataFrame + # "object" dtypes to agree with a specific + # pyarrow schema. + if schema: + for col_name, col in df._data.items(): + if col_name is None: + # Pyarrow cannot handle `None` as a field name. + # However, this should be a simple range index that + # we can ignore anyway + continue + typ = cudf_dtype_from_pa_type(schema.field(col_name).type) + if ( + col_name in schema.names + and not isinstance(typ, (cudf.ListDtype, cudf.StructDtype)) + and isinstance(col, cudf.core.column.StringColumn) + ): + df._data[col_name] = col.astype(typ) + + +def read_parquet(path, columns=None, **kwargs): + """ + Read parquet files into a :class:`.DataFrame`. + + Calls :func:`dask.dataframe.read_parquet` with ``engine=CudfEngine`` + to coordinate the execution of :func:`cudf.read_parquet`, and to + ultimately create a :class:`.DataFrame` collection. + + See the :func:`dask.dataframe.read_parquet` documentation for + all available options. 
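+
+    By default, a warning is emitted when a single file larger than
+    roughly 0.5 GB on disk would be mapped to one partition and neither
+    ``split_row_groups`` nor ``chunksize`` is specified; the threshold
+    can be adjusted (or disabled) with the ``check_file_size`` argument.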
+
+    Examples
+    --------
+    >>> from dask_cudf import read_parquet
+    >>> df = read_parquet("/path/to/dataset/")  # doctest: +SKIP
+
+    When dealing with one or more large parquet files having an
+    in-memory footprint >15% of device memory, the ``split_row_groups``
+    argument should be used to map Parquet **row-groups** to DataFrame
+    partitions (instead of **files** to partitions). For example, the
+    following code will map each row-group to a distinct partition:
+
+    >>> df = read_parquet(..., split_row_groups=True)  # doctest: +SKIP
+
+    To map **multiple** row-groups to each partition, an integer can be
+    passed to ``split_row_groups`` to specify the **maximum** number of
+    row-groups allowed in each output partition:
+
+    >>> df = read_parquet(..., split_row_groups=10)  # doctest: +SKIP
+
+    See Also
+    --------
+    cudf.read_parquet
+    dask.dataframe.read_parquet
+    """
+    if isinstance(columns, str):
+        columns = [columns]
+
+    # Set "check_file_size" option to determine whether we
+    # should check the parquet-file size. This check is meant
+    # to "protect" users from `split_row_groups` default changes
+    check_file_size = kwargs.pop("check_file_size", 500_000_000)
+    if (
+        check_file_size
+        and ("split_row_groups" not in kwargs)
+        and ("chunksize" not in kwargs)
+    ):
+        # User is not specifying `split_row_groups` or `chunksize`,
+        # so we should warn them if/when a file is ~>0.5GB on disk.
+        # They can set `split_row_groups` explicitly to silence/skip
+        # this check
+        if "read" not in kwargs:
+            kwargs["read"] = {}
+        kwargs["read"]["check_file_size"] = check_file_size
+
+    return dd.read_parquet(path, columns=columns, engine=CudfEngine, **kwargs)
+
+
+to_parquet = partial(dd.to_parquet, engine=CudfEngine)
+
+if create_metadata_file_dd is None:
+    create_metadata_file = create_metadata_file_dd
+else:
+    create_metadata_file = partial(create_metadata_file_dd, engine=CudfEngine)
diff --git a/python/dask_cudf/dask_cudf/_legacy/io/text.py b/python/dask_cudf/dask_cudf/_legacy/io/text.py
new file mode 100644
index 00000000000..9cdb7c5220b
--- /dev/null
+++ b/python/dask_cudf/dask_cudf/_legacy/io/text.py
@@ -0,0 +1,54 @@
+# Copyright (c) 2022-2024, NVIDIA CORPORATION.
+
+import os
+from glob import glob
+
+import dask.dataframe as dd
+from dask.base import tokenize
+from dask.utils import apply, parse_bytes
+
+import cudf
+
+
+def read_text(path, chunksize="256 MiB", **kwargs):
+    if isinstance(chunksize, str):
+        chunksize = parse_bytes(chunksize)
+
+    if isinstance(path, list):
+        filenames = path
+    elif isinstance(path, str):
+        filenames = sorted(glob(path))
+    elif hasattr(path, "__fspath__"):
+        filenames = sorted(glob(path.__fspath__()))
+    else:
+        raise TypeError(f"Path type not understood:{type(path)}")
+
+    if not filenames:
+        msg = f"A file in: {filenames} does not exist."
+        raise FileNotFoundError(msg)
+
+    name = "read-text-" + tokenize(path, tokenize, **kwargs)
+
+    if chunksize:
+        dsk = {}
+        i = 0
+        for fn in filenames:
+            size = os.path.getsize(fn)
+            for start in range(0, size, chunksize):
+                kwargs1 = kwargs.copy()
+                kwargs1["byte_range"] = (
+                    start,
+                    chunksize,
+                )  # specify which chunk of the file we care about
+
+                dsk[(name, i)] = (apply, cudf.read_text, [fn], kwargs1)
+                i += 1
+    else:
+        dsk = {
+            (name, i): (apply, cudf.read_text, [fn], kwargs)
+            for i, fn in enumerate(filenames)
+        }
+
+    meta = cudf.Series([], dtype="O")
+    divisions = [None] * (len(dsk) + 1)
+    return dd.core.new_dd_object(dsk, name, meta, divisions)
diff --git a/python/dask_cudf/dask_cudf/sorting.py b/python/dask_cudf/dask_cudf/_legacy/sorting.py
similarity index 100%
rename from python/dask_cudf/dask_cudf/sorting.py
rename to python/dask_cudf/dask_cudf/_legacy/sorting.py
diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py
index bead964a0ef..9c5d5523019 100644
--- a/python/dask_cudf/dask_cudf/backends.py
+++ b/python/dask_cudf/dask_cudf/backends.py
@@ -46,7 +46,7 @@
 from cudf.api.types import is_string_dtype
 from cudf.utils.performance_tracking import _dask_cudf_performance_tracking
 
-from .core import DataFrame, Index, Series
+from ._legacy.core import DataFrame, Index, Series
 
 get_parallel_type.register(cudf.DataFrame, lambda _: DataFrame)
 get_parallel_type.register(cudf.Series, lambda _: Series)
@@ -574,7 +574,7 @@ class CudfBackendEntrypoint(DataFrameBackendEntrypoint):
     >>> with dask.config.set({"dataframe.backend": "cudf"}):
     ...     ddf = dd.from_dict({"a": range(10)})
     >>> type(ddf)
-    <class 'dask_cudf.core.DataFrame'>
+    <class 'dask_cudf._legacy.core.DataFrame'>
     """
 
     @classmethod
@@ -610,7 +610,7 @@ def from_dict(
 
     @staticmethod
     def read_parquet(*args, engine=None, **kwargs):
-        from dask_cudf.io.parquet import CudfEngine
+        from dask_cudf._legacy.io.parquet import CudfEngine
 
         _raise_unsupported_parquet_kwargs(**kwargs)
         return _default_backend(
@@ -622,19 +622,19 @@ def read_parquet(*args, engine=None, **kwargs):
 
     @staticmethod
     def read_json(*args, **kwargs):
-        from dask_cudf.io.json import read_json
+        from dask_cudf._legacy.io.json import read_json
 
         return read_json(*args, **kwargs)
 
     @staticmethod
     def read_orc(*args, **kwargs):
-        from dask_cudf.io import read_orc
+        from dask_cudf._legacy.io import read_orc
 
         return read_orc(*args, **kwargs)
 
     @staticmethod
     def read_csv(*args, **kwargs):
-        from dask_cudf.io import read_csv
+        from dask_cudf._legacy.io import read_csv
 
         return read_csv(*args, **kwargs)
 
@@ -674,7 +674,7 @@ class CudfDXBackendEntrypoint(DataFrameBackendEntrypoint):
     def to_backend(data, **kwargs):
         import dask_expr as dx
 
-        from dask_cudf.expr._expr import ToCudfBackend
+        from dask_cudf._expr.expr import ToCudfBackend
 
         return dx.new_collection(ToCudfBackend(data, kwargs))
 
@@ -700,140 +700,10 @@ def from_dict(
         )
 
     @staticmethod
-    def read_parquet(path, *args, filesystem="fsspec", engine=None, **kwargs):
-        import dask_expr as dx
-        import fsspec
-
-        if (
-            isinstance(filesystem, fsspec.AbstractFileSystem)
-            or isinstance(filesystem, str)
-            and filesystem.lower() == "fsspec"
-        ):
-            # Default "fsspec" filesystem
-            from dask_cudf.io.parquet import CudfEngine
+    def read_parquet(*args, **kwargs):
+        from dask_cudf.io.parquet import read_parquet as read_parquet_expr
 
-            _raise_unsupported_parquet_kwargs(**kwargs)
-            return _default_backend(
-                dx.read_parquet,
-                path,
-                *args,
-                filesystem=filesystem,
-                engine=CudfEngine,
-                **kwargs,
-            )
-
-        else:
-            # EXPERIMENTAL filesystem="arrow" support.
- # This code path uses PyArrow for IO, which is only - # beneficial for remote storage (e.g. S3) - - from fsspec.utils import stringify_path - from pyarrow import fs as pa_fs - - # CudfReadParquetPyarrowFS requires import of distributed beforehand - # (See: https://github.com/dask/dask/issues/11352) - import distributed # noqa: F401 - from dask.core import flatten - from dask.dataframe.utils import pyarrow_strings_enabled - - from dask_cudf.expr._expr import CudfReadParquetPyarrowFS - - if args: - raise ValueError(f"Unexpected positional arguments: {args}") - - if not ( - isinstance(filesystem, pa_fs.FileSystem) - or isinstance(filesystem, str) - and filesystem.lower() in ("arrow", "pyarrow") - ): - raise ValueError(f"Unexpected filesystem value: {filesystem}.") - - if not PYARROW_GE_15: - raise NotImplementedError( - "Experimental Arrow filesystem support requires pyarrow>=15" - ) - - if not isinstance(path, str): - path = stringify_path(path) - - # Extract kwargs - columns = kwargs.pop("columns", None) - filters = kwargs.pop("filters", None) - categories = kwargs.pop("categories", None) - index = kwargs.pop("index", None) - storage_options = kwargs.pop("storage_options", None) - dtype_backend = kwargs.pop("dtype_backend", None) - calculate_divisions = kwargs.pop("calculate_divisions", False) - ignore_metadata_file = kwargs.pop("ignore_metadata_file", False) - metadata_task_size = kwargs.pop("metadata_task_size", None) - split_row_groups = kwargs.pop("split_row_groups", "infer") - blocksize = kwargs.pop("blocksize", "default") - aggregate_files = kwargs.pop("aggregate_files", None) - parquet_file_extension = kwargs.pop( - "parquet_file_extension", (".parq", ".parquet", ".pq") - ) - arrow_to_pandas = kwargs.pop("arrow_to_pandas", None) - open_file_options = kwargs.pop("open_file_options", None) - - # Validate and normalize kwargs - kwargs["dtype_backend"] = dtype_backend - if arrow_to_pandas is not None: - raise ValueError( - "arrow_to_pandas not supported for the 'cudf' backend." - ) - if open_file_options is not None: - raise ValueError( - "The open_file_options argument is no longer supported " - "by the 'cudf' backend." - ) - if filters is not None: - for filter in flatten(filters, container=list): - _, op, val = filter - if op == "in" and not isinstance(val, (set, list, tuple)): - raise TypeError( - "Value of 'in' filter must be a list, set or tuple." - ) - if metadata_task_size is not None: - raise NotImplementedError( - "metadata_task_size is not supported when using the pyarrow filesystem." - ) - if split_row_groups != "infer": - raise NotImplementedError( - "split_row_groups is not supported when using the pyarrow filesystem." - ) - if parquet_file_extension != (".parq", ".parquet", ".pq"): - raise NotImplementedError( - "parquet_file_extension is not supported when using the pyarrow filesystem." - ) - if blocksize is not None and blocksize != "default": - warnings.warn( - "blocksize is not supported when using the pyarrow filesystem." - "blocksize argument will be ignored." - ) - if aggregate_files is not None: - warnings.warn( - "aggregate_files is not supported when using the pyarrow filesystem. " - "Please use the 'dataframe.parquet.minimum-partition-size' config." - "aggregate_files argument will be ignored." 
-            )
-
-            return dx.new_collection(
-                CudfReadParquetPyarrowFS(
-                    path,
-                    columns=dx._util._convert_to_list(columns),
-                    filters=filters,
-                    categories=categories,
-                    index=index,
-                    calculate_divisions=calculate_divisions,
-                    storage_options=storage_options,
-                    filesystem=filesystem,
-                    ignore_metadata_file=ignore_metadata_file,
-                    arrow_to_pandas=arrow_to_pandas,
-                    pyarrow_strings_enabled=pyarrow_strings_enabled(),
-                    kwargs=kwargs,
-                    _series=isinstance(columns, str),
-                )
-            )
+        return read_parquet_expr(*args, **kwargs)
 
     @staticmethod
     def read_csv(
@@ -862,7 +732,7 @@ def read_csv(
 
     @staticmethod
     def read_json(*args, **kwargs):
-        from dask_cudf.io.json import read_json as read_json_impl
+        from dask_cudf._legacy.io.json import read_json as read_json_impl
 
         return read_json_impl(*args, **kwargs)
 
@@ -870,14 +740,7 @@ def read_json(*args, **kwargs):
     def read_orc(*args, **kwargs):
         from dask_expr import from_legacy_dataframe
 
-        from dask_cudf.io.orc import read_orc as legacy_read_orc
+        from dask_cudf._legacy.io.orc import read_orc as legacy_read_orc
 
         ddf = legacy_read_orc(*args, **kwargs)
         return from_legacy_dataframe(ddf)
-
-
-# Import/register cudf-specific classes for dask-expr
-try:
-    import dask_cudf.expr  # noqa: F401
-except ImportError:
-    pass
diff --git a/python/dask_cudf/dask_cudf/core.py b/python/dask_cudf/dask_cudf/core.py
index 3181c8d69ec..7d6d5c05cbe 100644
--- a/python/dask_cudf/dask_cudf/core.py
+++ b/python/dask_cudf/dask_cudf/core.py
@@ -1,705 +1,25 @@
-# Copyright (c) 2018-2024, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
-import math
 import textwrap
-import warnings
-
-import numpy as np
-import pandas as pd
-from tlz import partition_all
-
-from dask import dataframe as dd
-from dask.base import normalize_token, tokenize
-from dask.dataframe.core import (
-    Scalar,
-    handle_out,
-    make_meta as dask_make_meta,
-    map_partitions,
-)
-from dask.dataframe.utils import raise_on_meta_error
-from dask.highlevelgraph import HighLevelGraph
-from dask.utils import M, OperatorMethodMixin, apply, derived_from, funcname
+import dask.dataframe as dd
+from dask.tokenize import tokenize
 
 import cudf
-from cudf import _lib as libcudf
 from cudf.utils.performance_tracking import _dask_cudf_performance_tracking
 
-from dask_cudf import sorting
-from dask_cudf.accessors import ListMethods, StructMethods
-from dask_cudf.sorting import _deprecate_shuffle_kwarg, _get_shuffle_method
-
-
-class _Frame(dd.core._Frame, OperatorMethodMixin):
-    """Superclass for DataFrame and Series
-
-    Parameters
-    ----------
-    dsk : dict
-        The dask graph to compute this DataFrame
-    name : str
-        The key prefix that specifies which keys in the dask comprise this
-        particular DataFrame / Series
-    meta : cudf.DataFrame, cudf.Series, or cudf.Index
-        An empty cudf object with names, dtypes, and indices matching the
-        expected output.
-    divisions : tuple of index values
-        Values along which we partition our blocks on the index
-    """
-
-    def _is_partition_type(self, meta):
-        return isinstance(meta, self._partition_type)
-
-    def __repr__(self):
-        s = "<dask_cudf.%s | %d tasks | %d npartitions>"
-        return s % (type(self).__name__, len(self.dask), self.npartitions)
-
-    @_dask_cudf_performance_tracking
-    def to_dask_dataframe(self, **kwargs):
-        """Create a dask.dataframe object from a dask_cudf object
-
-        WARNING: This API is deprecated, and may not work properly
-        when query-planning is active. Please use `*.to_backend("pandas")`
-        to convert the underlying data to pandas.
-        """
-
-        warnings.warn(
-            "The `to_dask_dataframe` API is now deprecated. "
-            "Please use `*.to_backend('pandas')` instead.",
-            FutureWarning,
-        )
-
-        return self.to_backend("pandas", **kwargs)
-
-
-concat = dd.concat
-
-
-normalize_token.register(_Frame, lambda a: a._name)
-
-
-class DataFrame(_Frame, dd.core.DataFrame):
-    """
-    A distributed Dask DataFrame where the backing dataframe is a
-    :class:`cuDF DataFrame <cudf.DataFrame>`.
-
-    Typically you would not construct this object directly, but rather
-    use one of Dask-cuDF's IO routines.
-
-    Most operations on :doc:`Dask DataFrames <dask:dataframe>` are
-    supported, with many of the same caveats.
-
-    """
-
-    _partition_type = cudf.DataFrame
-
-    @_dask_cudf_performance_tracking
-    def _assign_column(self, k, v):
-        def assigner(df, k, v):
-            out = df.copy()
-            out[k] = v
-            return out
-
-        meta = assigner(self._meta, k, dask_make_meta(v))
-        return self.map_partitions(assigner, k, v, meta=meta)
-
-    @_dask_cudf_performance_tracking
-    def apply_rows(self, func, incols, outcols, kwargs=None, cache_key=None):
-        import uuid
-
-        if kwargs is None:
-            kwargs = {}
-
-        if cache_key is None:
-            cache_key = uuid.uuid4()
-
-        def do_apply_rows(df, func, incols, outcols, kwargs):
-            return df.apply_rows(
-                func, incols, outcols, kwargs, cache_key=cache_key
-            )
-
-        meta = do_apply_rows(self._meta, func, incols, outcols, kwargs)
-        return self.map_partitions(
-            do_apply_rows, func, incols, outcols, kwargs, meta=meta
-        )
-
-    @_deprecate_shuffle_kwarg
-    @_dask_cudf_performance_tracking
-    def merge(self, other, shuffle_method=None, **kwargs):
-        on = kwargs.pop("on", None)
-        if isinstance(on, tuple):
-            on = list(on)
-        return super().merge(
-            other,
-            on=on,
-            shuffle_method=_get_shuffle_method(shuffle_method),
-            **kwargs,
-        )
-
-    @_deprecate_shuffle_kwarg
-    @_dask_cudf_performance_tracking
-    def join(self, other, shuffle_method=None, **kwargs):
-        # CuDF doesn't support "right" join yet
-        how = kwargs.pop("how", "left")
-        if how == "right":
-            return other.join(other=self, how="left", **kwargs)
-
-        on = kwargs.pop("on", None)
-        if isinstance(on, tuple):
-            on = list(on)
-        return super().join(
-            other,
-            how=how,
-            on=on,
-            shuffle_method=_get_shuffle_method(shuffle_method),
-            **kwargs,
-        )
-
-    @_deprecate_shuffle_kwarg
-    @_dask_cudf_performance_tracking
-    def set_index(
-        self,
-        other,
-        sorted=False,
-        divisions=None,
-        shuffle_method=None,
-        **kwargs,
-    ):
-        pre_sorted = sorted
-        del sorted
-
-        if divisions == "quantile":
-            warnings.warn(
-                "Using divisions='quantile' is now deprecated. "
" - "Please raise an issue on github if you believe " - "this feature is necessary.", - FutureWarning, - ) - - if ( - divisions == "quantile" - or isinstance(divisions, (cudf.DataFrame, cudf.Series)) - or ( - isinstance(other, str) - and cudf.api.types.is_string_dtype(self[other].dtype) - ) - ): - # Let upstream-dask handle "pre-sorted" case - if pre_sorted: - return dd.shuffle.set_sorted_index( - self, other, divisions=divisions, **kwargs - ) - - by = other - if not isinstance(other, list): - by = [by] - if len(by) > 1: - raise ValueError("Dask does not support MultiIndex (yet).") - if divisions == "quantile": - divisions = None - - # Use dask_cudf's sort_values - df = self.sort_values( - by, - max_branch=kwargs.get("max_branch", None), - divisions=divisions, - set_divisions=True, - ignore_index=True, - shuffle_method=shuffle_method, - ) - - # Ignore divisions if its a dataframe - if isinstance(divisions, cudf.DataFrame): - divisions = None - - # Set index and repartition - df2 = df.map_partitions( - sorting.set_index_post, - index_name=other, - drop=kwargs.get("drop", True), - column_dtype=df.columns.dtype, - ) - npartitions = kwargs.get("npartitions", self.npartitions) - partition_size = kwargs.get("partition_size", None) - if partition_size: - return df2.repartition(partition_size=partition_size) - if not divisions and df2.npartitions != npartitions: - return df2.repartition(npartitions=npartitions) - if divisions and df2.npartitions != len(divisions) - 1: - return df2.repartition(divisions=divisions) - return df2 - - return super().set_index( - other, - sorted=pre_sorted, - shuffle_method=_get_shuffle_method(shuffle_method), - divisions=divisions, - **kwargs, - ) - - @_deprecate_shuffle_kwarg - @_dask_cudf_performance_tracking - def sort_values( - self, - by, - ignore_index=False, - max_branch=None, - divisions=None, - set_divisions=False, - ascending=True, - na_position="last", - sort_function=None, - sort_function_kwargs=None, - shuffle_method=None, - **kwargs, - ): - if kwargs: - raise ValueError( - f"Unsupported input arguments passed : {list(kwargs.keys())}" - ) - - df = sorting.sort_values( - self, - by, - max_branch=max_branch, - divisions=divisions, - set_divisions=set_divisions, - ignore_index=ignore_index, - ascending=ascending, - na_position=na_position, - shuffle_method=shuffle_method, - sort_function=sort_function, - sort_function_kwargs=sort_function_kwargs, - ) - - if ignore_index: - return df.reset_index(drop=True) - return df - - @_dask_cudf_performance_tracking - def to_parquet(self, path, *args, **kwargs): - """Calls dask.dataframe.io.to_parquet with CudfEngine backend""" - from dask_cudf.io import to_parquet - - return to_parquet(self, path, *args, **kwargs) - - @_dask_cudf_performance_tracking - def to_orc(self, path, **kwargs): - """Calls dask_cudf.io.to_orc""" - from dask_cudf.io import to_orc - - return to_orc(self, path, **kwargs) - - @derived_from(pd.DataFrame) - @_dask_cudf_performance_tracking - def var( - self, - axis=None, - skipna=True, - ddof=1, - split_every=False, - dtype=None, - out=None, - naive=False, - numeric_only=False, - ): - axis = self._validate_axis(axis) - meta = self._meta_nonempty.var( - axis=axis, skipna=skipna, numeric_only=numeric_only - ) - if axis == 1: - result = map_partitions( - M.var, - self, - meta=meta, - token=self._token_prefix + "var", - axis=axis, - skipna=skipna, - ddof=ddof, - numeric_only=numeric_only, - ) - return handle_out(out, result) - elif naive: - return _naive_var(self, meta, skipna, ddof, split_every, out) - 
else: - return _parallel_var(self, meta, skipna, split_every, out) - - @_deprecate_shuffle_kwarg - @_dask_cudf_performance_tracking - def shuffle(self, *args, shuffle_method=None, **kwargs): - """Wraps dask.dataframe DataFrame.shuffle method""" - return super().shuffle( - *args, shuffle_method=_get_shuffle_method(shuffle_method), **kwargs - ) - - @_dask_cudf_performance_tracking - def groupby(self, by=None, **kwargs): - from .groupby import CudfDataFrameGroupBy - - return CudfDataFrameGroupBy(self, by=by, **kwargs) - - -@_dask_cudf_performance_tracking -def sum_of_squares(x): - x = x.astype("f8")._column - outcol = libcudf.reduce.reduce("sum_of_squares", x) - return cudf.Series._from_column(outcol) - - -@_dask_cudf_performance_tracking -def var_aggregate(x2, x, n, ddof): - try: - with warnings.catch_warnings(record=True): - warnings.simplefilter("always") - result = (x2 / n) - (x / n) ** 2 - if ddof != 0: - result = result * n / (n - ddof) - return result - except ZeroDivisionError: - return np.float64(np.nan) - - -@_dask_cudf_performance_tracking -def nlargest_agg(x, **kwargs): - return cudf.concat(x).nlargest(**kwargs) - - -@_dask_cudf_performance_tracking -def nsmallest_agg(x, **kwargs): - return cudf.concat(x).nsmallest(**kwargs) - - -class Series(_Frame, dd.core.Series): - _partition_type = cudf.Series - - @_dask_cudf_performance_tracking - def count(self, split_every=False): - return reduction( - [self], - chunk=M.count, - aggregate=np.sum, - split_every=split_every, - meta="i8", - ) - - @_dask_cudf_performance_tracking - def mean(self, split_every=False): - sum = self.sum(split_every=split_every) - n = self.count(split_every=split_every) - return sum / n - - @derived_from(pd.DataFrame) - @_dask_cudf_performance_tracking - def var( - self, - axis=None, - skipna=True, - ddof=1, - split_every=False, - dtype=None, - out=None, - naive=False, - ): - axis = self._validate_axis(axis) - meta = self._meta_nonempty.var(axis=axis, skipna=skipna) - if axis == 1: - result = map_partitions( - M.var, - self, - meta=meta, - token=self._token_prefix + "var", - axis=axis, - skipna=skipna, - ddof=ddof, - ) - return handle_out(out, result) - elif naive: - return _naive_var(self, meta, skipna, ddof, split_every, out) - else: - return _parallel_var(self, meta, skipna, split_every, out) - - @_dask_cudf_performance_tracking - def groupby(self, *args, **kwargs): - from .groupby import CudfSeriesGroupBy - - return CudfSeriesGroupBy(self, *args, **kwargs) - - @property # type: ignore - @_dask_cudf_performance_tracking - def list(self): - return ListMethods(self) - - @property # type: ignore - @_dask_cudf_performance_tracking - def struct(self): - return StructMethods(self) - - -class Index(Series, dd.core.Index): - _partition_type = cudf.Index # type: ignore - - -@_dask_cudf_performance_tracking -def _naive_var(ddf, meta, skipna, ddof, split_every, out): - num = ddf._get_numeric_data() - x = 1.0 * num.sum(skipna=skipna, split_every=split_every) - x2 = 1.0 * (num**2).sum(skipna=skipna, split_every=split_every) - n = num.count(split_every=split_every) - name = ddf._token_prefix + "var" - result = map_partitions( - var_aggregate, x2, x, n, token=name, meta=meta, ddof=ddof - ) - if isinstance(ddf, DataFrame): - result.divisions = (min(ddf.columns), max(ddf.columns)) - return handle_out(out, result) - - -@_dask_cudf_performance_tracking -def _parallel_var(ddf, meta, skipna, split_every, out): - def _local_var(x, skipna): - if skipna: - n = x.count() - avg = x.mean(skipna=skipna) - else: - # Not skipping nulls, so 
might as well - # avoid the full `count` operation - n = len(x) - avg = x.sum(skipna=skipna) / n - m2 = ((x - avg) ** 2).sum(skipna=skipna) - return n, avg, m2 - - def _aggregate_var(parts): - n, avg, m2 = parts[0] - for i in range(1, len(parts)): - n_a, avg_a, m2_a = n, avg, m2 - n_b, avg_b, m2_b = parts[i] - n = n_a + n_b - avg = (n_a * avg_a + n_b * avg_b) / n - delta = avg_b - avg_a - m2 = m2_a + m2_b + delta**2 * n_a * n_b / n - return n, avg, m2 - - def _finalize_var(vals): - n, _, m2 = vals - return m2 / (n - 1) - - # Build graph - nparts = ddf.npartitions - if not split_every: - split_every = nparts - name = "var-" + tokenize(skipna, split_every, out) - local_name = "local-" + name - num = ddf._get_numeric_data() - dsk = { - (local_name, n, 0): (_local_var, (num._name, n), skipna) - for n in range(nparts) - } - - # Use reduction tree - widths = [nparts] - while nparts > 1: - nparts = math.ceil(nparts / split_every) - widths.append(nparts) - height = len(widths) - for depth in range(1, height): - for group in range(widths[depth]): - p_max = widths[depth - 1] - lstart = split_every * group - lstop = min(lstart + split_every, p_max) - node_list = [ - (local_name, p, depth - 1) for p in range(lstart, lstop) - ] - dsk[(local_name, group, depth)] = (_aggregate_var, node_list) - if height == 1: - group = depth = 0 - dsk[(name, 0)] = (_finalize_var, (local_name, group, depth)) - - graph = HighLevelGraph.from_collections(name, dsk, dependencies=[num, ddf]) - result = dd.core.new_dd_object(graph, name, meta, (None, None)) - if isinstance(ddf, DataFrame): - result.divisions = (min(ddf.columns), max(ddf.columns)) - return handle_out(out, result) - - -@_dask_cudf_performance_tracking -def _extract_meta(x): - """ - Extract internal cache data (``_meta``) from dask_cudf objects - """ - if isinstance(x, (Scalar, _Frame)): - return x._meta - elif isinstance(x, list): - return [_extract_meta(_x) for _x in x] - elif isinstance(x, tuple): - return tuple(_extract_meta(_x) for _x in x) - elif isinstance(x, dict): - return {k: _extract_meta(v) for k, v in x.items()} - return x - - -@_dask_cudf_performance_tracking -def _emulate(func, *args, **kwargs): - """ - Apply a function using args / kwargs. If arguments contain dd.DataFrame / - dd.Series, using internal cache (``_meta``) for calculation - """ - with raise_on_meta_error(funcname(func)): - return func(*_extract_meta(args), **_extract_meta(kwargs)) - - -@_dask_cudf_performance_tracking -def align_partitions(args): - """Align partitions between dask_cudf objects. - - Note that if all divisions are unknown, but have equal npartitions, then - they will be passed through unchanged. - """ - dfs = [df for df in args if isinstance(df, _Frame)] - if not dfs: - return args - - divisions = dfs[0].divisions - if not all(df.divisions == divisions for df in dfs): - raise NotImplementedError("Aligning mismatched partitions") - return args - - -@_dask_cudf_performance_tracking -def reduction( - args, - chunk=None, - aggregate=None, - combine=None, - meta=None, - token=None, - chunk_kwargs=None, - aggregate_kwargs=None, - combine_kwargs=None, - split_every=None, - **kwargs, -): - """Generic tree reduction operation. - - Parameters - ---------- - args : - Positional arguments for the `chunk` function. All `dask.dataframe` - objects should be partitioned and indexed equivalently. 
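An aside on the `_aggregate_var` helper above: it merges per-partition moments (count, mean, and m2, the sum of squared deviations) with the standard pairwise-combination formula. A quick self-contained NumPy check of that arithmetic, not part of the diff; `local_var` mirrors the `skipna=False` branch of `_local_var`:

import numpy as np

def local_var(x):
    # Per-chunk statistics, as in _local_var (skipna=False branch).
    n = len(x)
    avg = x.sum() / n
    m2 = ((x - avg) ** 2).sum()
    return n, avg, m2

def combine(a, b):
    # Pairwise merge, same arithmetic as one step of _aggregate_var.
    n_a, avg_a, m2_a = a
    n_b, avg_b, m2_b = b
    n = n_a + n_b
    avg = (n_a * avg_a + n_b * avg_b) / n
    delta = avg_b - avg_a
    m2 = m2_a + m2_b + delta**2 * n_a * n_b / n
    return n, avg, m2

x = np.random.default_rng(42).normal(size=1_000)
n, _, m2 = combine(local_var(x[:400]), local_var(x[400:]))
assert np.isclose(m2 / (n - 1), x.var(ddof=1))  # same result as _finalize_var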
- chunk : function [block-per-arg] -> block - Function to operate on each block of data - aggregate : function list-of-blocks -> block - Function to operate on the list of results of chunk - combine : function list-of-blocks -> block, optional - Function to operate on intermediate lists of results of chunk - in a tree-reduction. If not provided, defaults to aggregate. - $META - token : str, optional - The name to use for the output keys. - chunk_kwargs : dict, optional - Keywords for the chunk function only. - aggregate_kwargs : dict, optional - Keywords for the aggregate function only. - combine_kwargs : dict, optional - Keywords for the combine function only. - split_every : int, optional - Group partitions into groups of this size while performing a - tree-reduction. If set to False, no tree-reduction will be used, - and all intermediates will be concatenated and passed to ``aggregate``. - Default is 8. - kwargs : - All remaining keywords will be passed to ``chunk``, ``aggregate``, and - ``combine``. - """ - if chunk_kwargs is None: - chunk_kwargs = dict() - if aggregate_kwargs is None: - aggregate_kwargs = dict() - chunk_kwargs.update(kwargs) - aggregate_kwargs.update(kwargs) - - if combine is None: - if combine_kwargs: - raise ValueError("`combine_kwargs` provided with no `combine`") - combine = aggregate - combine_kwargs = aggregate_kwargs - else: - if combine_kwargs is None: - combine_kwargs = dict() - combine_kwargs.update(kwargs) - - if not isinstance(args, (tuple, list)): - args = [args] - - npartitions = {arg.npartitions for arg in args if isinstance(arg, _Frame)} - if len(npartitions) > 1: - raise ValueError("All arguments must have same number of partitions") - npartitions = npartitions.pop() - - if split_every is None: - split_every = 8 - elif split_every is False: - split_every = npartitions - elif split_every < 2 or not isinstance(split_every, int): - raise ValueError("split_every must be an integer >= 2") - - token_key = tokenize( - token or (chunk, aggregate), - meta, - args, - chunk_kwargs, - aggregate_kwargs, - combine_kwargs, - split_every, +# This module provides backward compatibility for legacy import patterns. 
+if dd.DASK_EXPR_ENABLED: + from dask_cudf._expr.collection import ( # noqa: E402 + DataFrame, + Index, + Series, ) +else: + from dask_cudf._legacy.core import DataFrame, Index, Series # noqa: F401 - # Chunk - a = f"{token or funcname(chunk)}-chunk-{token_key}" - if len(args) == 1 and isinstance(args[0], _Frame) and not chunk_kwargs: - dsk = { - (a, 0, i): (chunk, key) - for i, key in enumerate(args[0].__dask_keys__()) - } - else: - dsk = { - (a, 0, i): ( - apply, - chunk, - [(x._name, i) if isinstance(x, _Frame) else x for x in args], - chunk_kwargs, - ) - for i in range(args[0].npartitions) - } - # Combine - b = f"{token or funcname(combine)}-combine-{token_key}" - k = npartitions - depth = 0 - while k > split_every: - for part_i, inds in enumerate(partition_all(split_every, range(k))): - conc = (list, [(a, depth, i) for i in inds]) - dsk[(b, depth + 1, part_i)] = ( - (apply, combine, [conc], combine_kwargs) - if combine_kwargs - else (combine, conc) - ) - k = part_i + 1 - a = b - depth += 1 - - # Aggregate - b = f"{token or funcname(aggregate)}-agg-{token_key}" - conc = (list, [(a, depth, i) for i in range(k)]) - if aggregate_kwargs: - dsk[(b, 0)] = (apply, aggregate, [conc], aggregate_kwargs) - else: - dsk[(b, 0)] = (aggregate, conc) - - if meta is None: - meta_chunk = _emulate(apply, chunk, args, chunk_kwargs) - meta = _emulate(apply, aggregate, [[meta_chunk]], aggregate_kwargs) - meta = dask_make_meta(meta) - - graph = HighLevelGraph.from_collections(b, dsk, dependencies=args) - return dd.core.new_dd_object(graph, b, meta, (None, None)) +concat = dd.concat # noqa: F401 @_dask_cudf_performance_tracking @@ -744,59 +64,3 @@ def from_cudf(data, npartitions=None, chunksize=None, sort=True, name=None): # since dask-expr does not provide a docstring for from_pandas. + textwrap.dedent(dd.from_pandas.__doc__ or "") ) - - -@_dask_cudf_performance_tracking -def from_dask_dataframe(df): - """ - Convert a Dask :class:`dask.dataframe.DataFrame` to a Dask-cuDF - one. - - WARNING: This API is deprecated, and may not work properly - when query-planning is active. Please use `*.to_backend("cudf")` - to convert the underlying data to cudf. - - Parameters - ---------- - df : dask.dataframe.DataFrame - The Dask dataframe to convert - - Returns - ------- - dask_cudf.DataFrame : A new Dask collection backed by cuDF objects - """ - - warnings.warn( - "The `from_dask_dataframe` API is now deprecated. " - "Please use `*.to_backend('cudf')` instead.", - FutureWarning, - ) - - return df.to_backend("cudf") - - -for name in ( - "add", - "sub", - "mul", - "truediv", - "floordiv", - "mod", - "pow", - "radd", - "rsub", - "rmul", - "rtruediv", - "rfloordiv", - "rmod", - "rpow", -): - meth = getattr(cudf.DataFrame, name) - DataFrame._bind_operator_method(name, meth, original=cudf.Series) - - meth = getattr(cudf.Series, name) - Series._bind_operator_method(name, meth, original=cudf.Series) - -for name in ("lt", "gt", "le", "ge", "ne", "eq"): - meth = getattr(cudf.Series, name) - Series._bind_comparison_method(name, meth, original=cudf.Series) diff --git a/python/dask_cudf/dask_cudf/expr/__init__.py b/python/dask_cudf/dask_cudf/expr/__init__.py deleted file mode 100644 index a76b655ef42..00000000000 --- a/python/dask_cudf/dask_cudf/expr/__init__.py +++ /dev/null @@ -1,25 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. - -from dask import config - -# Check if dask-dataframe is using dask-expr. 
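With `from_dask_dataframe` removed here and `to_pandas` deprecated earlier in this file, backend conversion is the one supported path in both directions, exactly as the removed warnings recommend. A minimal migration sketch, assuming a CUDA-capable environment with dask-cudf installed:

import pandas as pd
import dask.dataframe as dd

ddf = dd.from_pandas(pd.DataFrame({"a": [1, 2, 3]}), npartitions=2)

gdf = ddf.to_backend("cudf")    # replaces dask_cudf.from_dask_dataframe(ddf)
pdf = gdf.to_backend("pandas")  # replaces the deprecated gdf.to_pandas()
print(pdf.compute())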
-# For dask>=2024.3.0, a null value will default to True -QUERY_PLANNING_ON = config.get("dataframe.query-planning", None) is not False - -# Register custom expressions and collections -if QUERY_PLANNING_ON: - # Broadly avoid "p2p" and "disk" defaults for now - config.set({"dataframe.shuffle.method": "tasks"}) - - try: - import dask_cudf.expr._collection - import dask_cudf.expr._expr - - except ImportError as err: - # Dask *should* raise an error before this. - # However, we can still raise here to be certain. - raise RuntimeError( - "Failed to register the 'cudf' backend for dask-expr." - " Please make sure you have dask-expr installed.\n" - f"Error Message: {err}" - ) diff --git a/python/dask_cudf/dask_cudf/expr/_groupby.py b/python/dask_cudf/dask_cudf/expr/_groupby.py deleted file mode 100644 index 65688115b59..00000000000 --- a/python/dask_cudf/dask_cudf/expr/_groupby.py +++ /dev/null @@ -1,106 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. - -from dask_expr._groupby import ( - GroupBy as DXGroupBy, - SeriesGroupBy as DXSeriesGroupBy, - SingleAggregation, -) -from dask_expr._util import is_scalar - -from dask.dataframe.groupby import Aggregation - -from cudf.core.groupby.groupby import _deprecate_collect - -## -## Custom groupby classes -## - - -class ListAgg(SingleAggregation): - @staticmethod - def groupby_chunk(arg): - return arg.agg(list) - - @staticmethod - def groupby_aggregate(arg): - gb = arg.agg(list) - if gb.ndim > 1: - for col in gb.columns: - gb[col] = gb[col].list.concat() - return gb - else: - return gb.list.concat() - - -list_aggregation = Aggregation( - name="list", - chunk=ListAgg.groupby_chunk, - agg=ListAgg.groupby_aggregate, -) - - -def _translate_arg(arg): - # Helper function to translate args so that - # they can be processed correctly by upstream - # dask & dask-expr. Right now, the only necessary - # translation is list aggregations. - if isinstance(arg, dict): - return {k: _translate_arg(v) for k, v in arg.items()} - elif isinstance(arg, list): - return [_translate_arg(x) for x in arg] - elif arg in ("collect", "list", list): - return list_aggregation - else: - return arg - - -# TODO: These classes are mostly a work-around for missing -# `observed=False` support. 
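The `_translate_arg` helper deleted above (presumably relocated rather than dropped, given the new `dask_cudf._expr` package) recursively rewrites every nested occurrence of "collect", "list", or the `list` builtin into the custom list aggregation before the spec reaches dask-expr. A standalone rendering of that recursion, with a plain string sentinel standing in for the `Aggregation` object (illustrative values only):

LIST_AGG = "<list_aggregation>"  # sentinel for the Aggregation defined above

def translate_arg(arg):
    # Same recursion as the removed _translate_arg.
    if isinstance(arg, dict):
        return {k: translate_arg(v) for k, v in arg.items()}
    elif isinstance(arg, list):
        return [translate_arg(x) for x in arg]
    elif arg in ("collect", "list", list):
        return LIST_AGG
    else:
        return arg

print(translate_arg({"x": ["sum", "collect"], "y": "list"}))
# {'x': ['sum', '<list_aggregation>'], 'y': '<list_aggregation>'}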
-# See: https://github.com/rapidsai/cudf/issues/15173 - - -class GroupBy(DXGroupBy): - def __init__(self, *args, observed=None, **kwargs): - observed = observed if observed is not None else True - super().__init__(*args, observed=observed, **kwargs) - - def __getitem__(self, key): - if is_scalar(key): - return SeriesGroupBy( - self.obj, - by=self.by, - slice=key, - sort=self.sort, - dropna=self.dropna, - observed=self.observed, - ) - g = GroupBy( - self.obj, - by=self.by, - slice=key, - sort=self.sort, - dropna=self.dropna, - observed=self.observed, - group_keys=self.group_keys, - ) - return g - - def collect(self, **kwargs): - _deprecate_collect() - return self._single_agg(ListAgg, **kwargs) - - def aggregate(self, arg, **kwargs): - return super().aggregate(_translate_arg(arg), **kwargs) - - -class SeriesGroupBy(DXSeriesGroupBy): - def __init__(self, *args, observed=None, **kwargs): - observed = observed if observed is not None else True - super().__init__(*args, observed=observed, **kwargs) - - def collect(self, **kwargs): - _deprecate_collect() - return self._single_agg(ListAgg, **kwargs) - - def aggregate(self, arg, **kwargs): - return super().aggregate(_translate_arg(arg), **kwargs) diff --git a/python/dask_cudf/dask_cudf/io/__init__.py b/python/dask_cudf/dask_cudf/io/__init__.py index 76bb2ea99b4..212951336c9 100644 --- a/python/dask_cudf/dask_cudf/io/__init__.py +++ b/python/dask_cudf/dask_cudf/io/__init__.py @@ -1,11 +1,37 @@ -# Copyright (c) 2018-2022, NVIDIA CORPORATION. +# Copyright (c) 2024, NVIDIA CORPORATION. -from .csv import read_csv -from .json import read_json -from .orc import read_orc, to_orc -from .text import read_text +from dask_cudf import _deprecated_api, QUERY_PLANNING_ON -try: - from .parquet import read_parquet, to_parquet -except ImportError: - pass +from . import csv, orc, json, parquet, text # noqa: F401 + + +read_csv = _deprecated_api( + "dask_cudf.io.read_csv", new_api="dask_cudf.read_csv" +) +read_json = _deprecated_api( + "dask_cudf.io.read_json", new_api="dask_cudf.read_json" +) +read_orc = _deprecated_api( + "dask_cudf.io.read_orc", new_api="dask_cudf.read_orc" +) +to_orc = _deprecated_api( + "dask_cudf.io.to_orc", + new_api="dask_cudf._legacy.io.to_orc", + rec="Please use the DataFrame.to_orc method instead.", +) +read_text = _deprecated_api( + "dask_cudf.io.read_text", new_api="dask_cudf.read_text" +) +if QUERY_PLANNING_ON: + read_parquet = parquet.read_parquet +else: + read_parquet = _deprecated_api( + "The legacy dask_cudf.io.read_parquet API", + new_api="dask_cudf.read_parquet", + rec="", + ) +to_parquet = _deprecated_api( + "dask_cudf.io.to_parquet", + new_api="dask_cudf._legacy.io.parquet.to_parquet", + rec="Please use the DataFrame.to_parquet method instead.", +) diff --git a/python/dask_cudf/dask_cudf/io/csv.py b/python/dask_cudf/dask_cudf/io/csv.py index fa5400344f9..b22b31a591f 100644 --- a/python/dask_cudf/dask_cudf/io/csv.py +++ b/python/dask_cudf/dask_cudf/io/csv.py @@ -1,222 +1,8 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2024, NVIDIA CORPORATION. -import os -from glob import glob -from warnings import warn +from dask_cudf import _deprecated_api -from fsspec.utils import infer_compression - -from dask import dataframe as dd -from dask.base import tokenize -from dask.dataframe.io.csv import make_reader -from dask.utils import apply, parse_bytes - -import cudf - - -def read_csv(path, blocksize="default", **kwargs): - """ - Read CSV files into a :class:`.DataFrame`. 
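The `_deprecated_api` helper used throughout the new IO shims is defined at the package level and is not shown in this diff. Judging only from its call sites here (an `old_api` label, an optional `new_api` dotted path, an optional `rec` recommendation string), a plausible minimal shape is the sketch below; the real implementation may differ:

import warnings
from importlib import import_module

def deprecated_api(old_api, new_api=None, rec=None):
    # Hypothetical reconstruction of dask_cudf._deprecated_api, inferred
    # from its call sites in this diff; not the actual implementation.
    def wrapper(*args, **kwargs):
        if new_api:
            msg = f"{old_api} is now deprecated. "
            msg += rec or f"Please use {new_api} instead."
            warnings.warn(msg, FutureWarning)
            module_name, _, attr = new_api.rpartition(".")
            return getattr(import_module(module_name), attr)(*args, **kwargs)
        raise NotImplementedError(f"{old_api} is no longer supported. {rec or ''}")
    return wrapper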
- - This API parallelizes the :func:`cudf:cudf.read_csv` function in - the following ways: - - It supports loading many files at once using globstrings: - - >>> import dask_cudf - >>> df = dask_cudf.read_csv("myfiles.*.csv") - - In some cases it can break up large files: - - >>> df = dask_cudf.read_csv("largefile.csv", blocksize="256 MiB") - - It can read CSV files from external resources (e.g. S3, HTTP, FTP) - - >>> df = dask_cudf.read_csv("s3://bucket/myfiles.*.csv") - >>> df = dask_cudf.read_csv("https://www.mycloud.com/sample.csv") - - Internally ``read_csv`` uses :func:`cudf:cudf.read_csv` and - supports many of the same keyword arguments with the same - performance guarantees. See the docstring for - :func:`cudf:cudf.read_csv` for more information on available - keyword arguments. - - Parameters - ---------- - path : str, path object, or file-like object - Either a path to a file (a str, :py:class:`pathlib.Path`, or - py._path.local.LocalPath), URL (including http, ftp, and S3 - locations), or any object with a read() method (such as - builtin :py:func:`open` file handler function or - :py:class:`~io.StringIO`). - blocksize : int or str, default "256 MiB" - The target task partition size. If ``None``, a single block - is used for each file. - **kwargs : dict - Passthrough key-word arguments that are sent to - :func:`cudf:cudf.read_csv`. - - Notes - ----- - If any of `skipfooter`/`skiprows`/`nrows` are passed, - `blocksize` will default to None. - - Examples - -------- - >>> import dask_cudf - >>> ddf = dask_cudf.read_csv("sample.csv", usecols=["a", "b"]) - >>> ddf.compute() - a b - 0 1 hi - 1 2 hello - 2 3 ai - - """ - - # Handle `chunksize` deprecation - if "chunksize" in kwargs: - chunksize = kwargs.pop("chunksize", "default") - warn( - "`chunksize` is deprecated and will be removed in the future. " - "Please use `blocksize` instead.", - FutureWarning, - ) - if blocksize == "default": - blocksize = chunksize - - # Set default `blocksize` - if blocksize == "default": - if ( - kwargs.get("skipfooter", 0) != 0 - or kwargs.get("skiprows", 0) != 0 - or kwargs.get("nrows", None) is not None - ): - # Cannot read in blocks if skipfooter, - # skiprows or nrows is passed. - blocksize = None - else: - blocksize = "256 MiB" - - if "://" in str(path): - func = make_reader(cudf.read_csv, "read_csv", "CSV") - return func(path, blocksize=blocksize, **kwargs) - else: - return _internal_read_csv(path=path, blocksize=blocksize, **kwargs) - - -def _internal_read_csv(path, blocksize="256 MiB", **kwargs): - if isinstance(blocksize, str): - blocksize = parse_bytes(blocksize) - - if isinstance(path, list): - filenames = path - elif isinstance(path, str): - filenames = sorted(glob(path)) - elif hasattr(path, "__fspath__"): - filenames = sorted(glob(path.__fspath__())) - else: - raise TypeError(f"Path type not understood:{type(path)}") - - if not filenames: - msg = f"A file in: {filenames} does not exist." 
- raise FileNotFoundError(msg) - - name = "read-csv-" + tokenize( - path, tokenize, **kwargs - ) # TODO: get last modified time - - compression = kwargs.get("compression", "infer") - - if compression == "infer": - # Infer compression from first path by default - compression = infer_compression(filenames[0]) - - if compression and blocksize: - # compressed CSVs reading must read the entire file - kwargs.pop("byte_range", None) - warn( - "Warning %s compression does not support breaking apart files\n" - "Please ensure that each individual file can fit in memory and\n" - "use the keyword ``blocksize=None to remove this message``\n" - "Setting ``blocksize=(size of file)``" % compression - ) - blocksize = None - - if blocksize is None: - return read_csv_without_blocksize(path, **kwargs) - - # Let dask.dataframe generate meta - dask_reader = make_reader(cudf.read_csv, "read_csv", "CSV") - kwargs1 = kwargs.copy() - usecols = kwargs1.pop("usecols", None) - dtype = kwargs1.pop("dtype", None) - meta = dask_reader(filenames[0], **kwargs1)._meta - names = meta.columns - if usecols or dtype: - # Regenerate meta with original kwargs if - # `usecols` or `dtype` was specified - meta = dask_reader(filenames[0], **kwargs)._meta - - dsk = {} - i = 0 - dtypes = meta.dtypes.values - - for fn in filenames: - size = os.path.getsize(fn) - for start in range(0, size, blocksize): - kwargs2 = kwargs.copy() - kwargs2["byte_range"] = ( - start, - blocksize, - ) # specify which chunk of the file we care about - if start != 0: - kwargs2["names"] = names # no header in the middle of the file - kwargs2["header"] = None - dsk[(name, i)] = (apply, _read_csv, [fn, dtypes], kwargs2) - - i += 1 - - divisions = [None] * (len(dsk) + 1) - return dd.core.new_dd_object(dsk, name, meta, divisions) - - -def _read_csv(fn, dtypes=None, **kwargs): - return cudf.read_csv(fn, **kwargs) - - -def read_csv_without_blocksize(path, **kwargs): - """Read entire CSV with optional compression (gzip/zip) - - Parameters - ---------- - path : str - path to files (support for glob) - """ - if isinstance(path, list): - filenames = path - elif isinstance(path, str): - filenames = sorted(glob(path)) - elif hasattr(path, "__fspath__"): - filenames = sorted(glob(path.__fspath__())) - else: - raise TypeError(f"Path type not understood:{type(path)}") - - name = "read-csv-" + tokenize(path, **kwargs) - - meta_kwargs = kwargs.copy() - if "skipfooter" in meta_kwargs: - meta_kwargs.pop("skipfooter") - if "nrows" in meta_kwargs: - meta_kwargs.pop("nrows") - # Read "head" of first file (first 5 rows). - # Convert to empty df for metadata. - meta = cudf.read_csv(filenames[0], nrows=5, **meta_kwargs).iloc[:0] - - graph = { - (name, i): (apply, cudf.read_csv, [fn], kwargs) - for i, fn in enumerate(filenames) - } - - divisions = [None] * (len(filenames) + 1) - - return dd.core.new_dd_object(graph, name, meta, divisions) +read_csv = _deprecated_api( + "dask_cudf.io.csv.read_csv", + new_api="dask_cudf.read_csv", +) diff --git a/python/dask_cudf/dask_cudf/io/json.py b/python/dask_cudf/dask_cudf/io/json.py index 98c5ceedb76..8f85ea54c0a 100644 --- a/python/dask_cudf/dask_cudf/io/json.py +++ b/python/dask_cudf/dask_cudf/io/json.py @@ -1,209 +1,8 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2024, NVIDIA CORPORATION. 
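The byte-range chunking in the removed `_internal_read_csv` above remains a useful recipe outside the library: one task per `byte_range` window of the file, with the header's column names re-applied to every window after the first. A rough equivalent with `dask.delayed`, assuming a local, uncompressed `data.csv` (hypothetical input) and a GPU-enabled cudf install:

import os

import cudf
import dask.dataframe as dd
from dask import delayed
from dask.utils import parse_bytes

path, blocksize = "data.csv", parse_bytes("256 MiB")  # hypothetical input
meta = cudf.read_csv(path, nrows=5).iloc[:0]  # empty frame for metadata

parts = []
for start in range(0, os.path.getsize(path), blocksize):
    kwargs = {"byte_range": (start, blocksize)}
    if start != 0:
        # No header mid-file: reuse the column names from `meta`.
        kwargs.update(names=list(meta.columns), header=None)
    parts.append(delayed(cudf.read_csv)(path, **kwargs))

ddf = dd.from_delayed(parts, meta=meta)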
-from functools import partial +from dask_cudf import _deprecated_api -import numpy as np -from fsspec.core import get_compression, get_fs_token_paths - -import dask -from dask.utils import parse_bytes - -import cudf -from cudf.core.column import as_column -from cudf.utils.ioutils import _is_local_filesystem - -from dask_cudf.backends import _default_backend - - -def _read_json_partition( - paths, - fs=None, - include_path_column=False, - path_converter=None, - **kwargs, -): - # Transfer all data up front for remote storage - sources = ( - paths - if fs is None - else fs.cat_ranges( - paths, - [0] * len(paths), - fs.sizes(paths), - ) - ) - - if include_path_column: - # Add "path" column. - # Must iterate over sources sequentially - if not isinstance(include_path_column, str): - include_path_column = "path" - converted_paths = ( - paths - if path_converter is None - else [path_converter(path) for path in paths] - ) - dfs = [] - for i, source in enumerate(sources): - df = cudf.read_json(source, **kwargs) - df[include_path_column] = as_column( - converted_paths[i], length=len(df) - ) - dfs.append(df) - return cudf.concat(dfs) - else: - # Pass sources directly to cudf - return cudf.read_json(sources, **kwargs) - - -def read_json( - url_path, - engine="auto", - blocksize=None, - orient="records", - lines=None, - compression="infer", - aggregate_files=True, - **kwargs, -): - """Read JSON data into a :class:`.DataFrame`. - - This function wraps :func:`dask.dataframe.read_json`, and passes - ``engine=partial(cudf.read_json, engine="auto")`` by default. - - Parameters - ---------- - url_path : str, list of str - Location to read from. If a string, can include a glob character to - find a set of file names. - Supports protocol specifications such as ``"s3://"``. - engine : str or Callable, default "auto" - - If str, this value will be used as the ``engine`` argument - when :func:`cudf.read_json` is used to create each partition. - If a :obj:`~collections.abc.Callable`, this value will be used as the - underlying function used to create each partition from JSON - data. The default value is "auto", so that - ``engine=partial(cudf.read_json, engine="auto")`` will be - passed to :func:`dask.dataframe.read_json` by default. - aggregate_files : bool or int - Whether to map multiple files to each output partition. If True, - the `blocksize` argument will be used to determine the number of - files in each partition. If any one file is larger than `blocksize`, - the `aggregate_files` argument will be ignored. If an integer value - is specified, the `blocksize` argument will be ignored, and that - number of files will be mapped to each partition. Default is True. - **kwargs : - Key-word arguments to pass through to :func:`dask.dataframe.read_json`. - - Returns - ------- - :class:`.DataFrame` - - Examples - -------- - Load single file - - >>> from dask_cudf import read_json - >>> read_json('myfile.json') # doctest: +SKIP - - Load large line-delimited JSON files using partitions of approx - 256MB size - - >>> read_json('data/file*.csv', blocksize=2**28) # doctest: +SKIP - - Load nested JSON data - - >>> read_json('myfile.json') # doctest: +SKIP - - See Also - -------- - dask.dataframe.read_json - - """ - - if lines is None: - lines = orient == "records" - if orient != "records" and lines: - raise ValueError( - 'Line-delimited JSON is only available with orient="records".' 
- ) - if blocksize and (orient != "records" or not lines): - raise ValueError( - "JSON file chunking only allowed for JSON-lines" - "input (orient='records', lines=True)." - ) - - inputs = [] - if aggregate_files and blocksize or int(aggregate_files) > 1: - # Attempt custom read if we are mapping multiple files - # to each output partition. Otherwise, upstream logic - # is sufficient. - - storage_options = kwargs.get("storage_options", {}) - fs, _, paths = get_fs_token_paths( - url_path, mode="rb", storage_options=storage_options - ) - if isinstance(aggregate_files, int) and aggregate_files > 1: - # Map a static file count to each partition - inputs = [ - paths[offset : offset + aggregate_files] - for offset in range(0, len(paths), aggregate_files) - ] - elif aggregate_files is True and blocksize: - # Map files dynamically (using blocksize) - file_sizes = fs.sizes(paths) # NOTE: This can be slow - blocksize = parse_bytes(blocksize) - if all([file_size <= blocksize for file_size in file_sizes]): - counts = np.unique( - np.floor(np.cumsum(file_sizes) / blocksize), - return_counts=True, - )[1] - offsets = np.concatenate([[0], counts.cumsum()]) - inputs = [ - paths[offsets[i] : offsets[i + 1]] - for i in range(len(offsets) - 1) - ] - - if inputs: - # Inputs were successfully populated. - # Use custom _read_json_partition function - # to generate each partition. - - compression = get_compression( - url_path[0] if isinstance(url_path, list) else url_path, - compression, - ) - _kwargs = dict( - orient=orient, - lines=lines, - compression=compression, - include_path_column=kwargs.get("include_path_column", False), - path_converter=kwargs.get("path_converter"), - ) - if not _is_local_filesystem(fs): - _kwargs["fs"] = fs - # TODO: Generate meta more efficiently - meta = _read_json_partition(inputs[0][:1], **_kwargs) - return dask.dataframe.from_map( - _read_json_partition, - inputs, - meta=meta, - **_kwargs, - ) - - # Fall back to dask.dataframe.read_json - return _default_backend( - dask.dataframe.read_json, - url_path, - engine=( - partial(cudf.read_json, engine=engine) - if isinstance(engine, str) - else engine - ), - blocksize=blocksize, - orient=orient, - lines=lines, - compression=compression, - **kwargs, - ) +read_json = _deprecated_api( + "dask_cudf.io.json.read_json", + new_api="dask_cudf.read_json", +) diff --git a/python/dask_cudf/dask_cudf/io/orc.py b/python/dask_cudf/dask_cudf/io/orc.py index bed69f038b0..5219cdacc31 100644 --- a/python/dask_cudf/dask_cudf/io/orc.py +++ b/python/dask_cudf/dask_cudf/io/orc.py @@ -1,199 +1,13 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from io import BufferedWriter, IOBase - -from fsspec.core import get_fs_token_paths -from fsspec.utils import stringify_path -from pyarrow import orc as orc - -from dask import dataframe as dd -from dask.base import tokenize -from dask.dataframe.io.utils import _get_pyarrow_dtypes - -import cudf - - -def _read_orc_stripe(fs, path, stripe, columns, kwargs=None): - """Pull out specific columns from specific stripe""" - if kwargs is None: - kwargs = {} - with fs.open(path, "rb") as f: - df_stripe = cudf.read_orc( - f, stripes=[stripe], columns=columns, **kwargs - ) - return df_stripe - - -def read_orc(path, columns=None, filters=None, storage_options=None, **kwargs): - """Read ORC files into a :class:`.DataFrame`. - - Note that this function is mostly borrowed from upstream Dask. 
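The dynamic file-to-partition branch of the removed `read_json` above packs whole files into partitions by binning their cumulative sizes into `blocksize` buckets. The NumPy arithmetic is compact enough to check by hand with made-up sizes:

import numpy as np

file_sizes = [100, 100, 100, 200, 50]  # bytes; hypothetical, all <= blocksize
blocksize = 250

# Bin index of each file by cumulative size, then count files per bin.
counts = np.unique(
    np.floor(np.cumsum(file_sizes) / blocksize), return_counts=True
)[1]
offsets = np.concatenate([[0], counts.cumsum()])
groups = [
    list(range(offsets[i], offsets[i + 1])) for i in range(len(offsets) - 1)
]
print(groups)  # [[0, 1], [2], [3, 4]] -> file indices per output partition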
- - Parameters - ---------- - path : str or list[str] - Location of file(s), which can be a full URL with protocol specifier, - and may include glob character if a single string. - columns : None or list[str] - Columns to load. If None, loads all. - filters : None or list of tuple or list of lists of tuples - If not None, specifies a filter predicate used to filter out - row groups using statistics stored for each row group as - Parquet metadata. Row groups that do not match the given - filter predicate are not read. The predicate is expressed in - `disjunctive normal form (DNF) - `__ - like ``[[('x', '=', 0), ...], ...]``. DNF allows arbitrary - boolean logical combinations of single column predicates. The - innermost tuples each describe a single column predicate. The - list of inner predicates is interpreted as a conjunction - (AND), forming a more selective and multiple column predicate. - Finally, the outermost list combines these filters as a - disjunction (OR). Predicates may also be passed as a list of - tuples. This form is interpreted as a single conjunction. To - express OR in predicates, one must use the (preferred) - notation of list of lists of tuples. - storage_options : None or dict - Further parameters to pass to the bytes backend. - - See Also - -------- - dask.dataframe.read_orc - - Returns - ------- - dask_cudf.DataFrame - - """ - - storage_options = storage_options or {} - fs, fs_token, paths = get_fs_token_paths( - path, mode="rb", storage_options=storage_options - ) - schema = None - nstripes_per_file = [] - for path in paths: - with fs.open(path, "rb") as f: - o = orc.ORCFile(f) - if schema is None: - schema = o.schema - elif schema != o.schema: - raise ValueError( - "Incompatible schemas while parsing ORC files" - ) - nstripes_per_file.append(o.nstripes) - schema = _get_pyarrow_dtypes(schema, categories=None) - if columns is not None: - ex = set(columns) - set(schema) - if ex: - raise ValueError( - f"Requested columns ({ex}) not in schema ({set(schema)})" - ) - else: - columns = list(schema) - - with fs.open(paths[0], "rb") as f: - meta = cudf.read_orc( - f, - stripes=[0] if nstripes_per_file[0] else None, - columns=columns, - **kwargs, - ) - - name = "read-orc-" + tokenize(fs_token, path, columns, filters, **kwargs) - dsk = {} - N = 0 - for path, n in zip(paths, nstripes_per_file): - for stripe in ( - range(n) - if filters is None - else cudf.io.orc._filter_stripes(filters, path) - ): - dsk[(name, N)] = ( - _read_orc_stripe, - fs, - path, - stripe, - columns, - kwargs, - ) - N += 1 - - divisions = [None] * (len(dsk) + 1) - return dd.core.new_dd_object(dsk, name, meta, divisions) - - -def write_orc_partition(df, path, fs, filename, compression="snappy"): - full_path = fs.sep.join([path, filename]) - with fs.open(full_path, mode="wb") as out_file: - if not isinstance(out_file, IOBase): - out_file = BufferedWriter(out_file) - cudf.io.to_orc(df, out_file, compression=compression) - return full_path - - -def to_orc( - df, - path, - write_index=True, - storage_options=None, - compression="snappy", - compute=True, - **kwargs, -): - """ - Write a :class:`.DataFrame` to ORC file(s) (one file per partition). - - Parameters - ---------- - df : DataFrame - path : str or pathlib.Path - Destination directory for data. Prepend with protocol like ``s3://`` - or ``hdfs://`` for remote data. - write_index : boolean, optional - Whether or not to write the index. Defaults to True. - storage_options : None or dict - Further parameters to pass to the bytes backend. 
- compression : string or dict, optional - compute : bool, optional - If True (default) then the result is computed immediately. If - False then a :class:`~dask.delayed.Delayed` object is returned - for future computation. - - """ - - from dask import compute as dask_compute, delayed - - # TODO: Use upstream dask implementation once available - # (see: Dask Issue#5596) - - if hasattr(path, "name"): - path = stringify_path(path) - fs, _, _ = get_fs_token_paths( - path, mode="wb", storage_options=storage_options - ) - # Trim any protocol information from the path before forwarding - path = fs._strip_protocol(path) - - if write_index: - df = df.reset_index() - else: - # Not writing index - might as well drop it - df = df.reset_index(drop=True) - - fs.mkdirs(path, exist_ok=True) - - # Use i_offset and df.npartitions to define file-name list - filenames = ["part.%i.orc" % i for i in range(df.npartitions)] - - # write parts - dwrite = delayed(write_orc_partition) - parts = [ - dwrite(d, path, fs, filename, compression=compression) - for d, filename in zip(df.to_delayed(), filenames) - ] - - if compute: - return dask_compute(*parts) - - return delayed(list)(parts) +# Copyright (c) 2024, NVIDIA CORPORATION. + +from dask_cudf import _deprecated_api + +read_orc = _deprecated_api( + "dask_cudf.io.orc.read_orc", + new_api="dask_cudf.read_orc", +) +to_orc = _deprecated_api( + "dask_cudf.io.orc.to_orc", + new_api="dask_cudf._legacy.io.orc.to_orc", + rec="Please use the DataFrame.to_orc method instead.", +) diff --git a/python/dask_cudf/dask_cudf/io/parquet.py b/python/dask_cudf/dask_cudf/io/parquet.py index a781b8242fe..bbedd046760 100644 --- a/python/dask_cudf/dask_cudf/io/parquet.py +++ b/python/dask_cudf/dask_cudf/io/parquet.py @@ -1,35 +1,262 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2024, NVIDIA CORPORATION. 
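A closing note on the ORC reader removed above: its stripe-per-task pattern translates directly to `dask.delayed` if it is ever needed outside the library. A rough sketch, assuming a hypothetical local `part.orc` and a GPU-enabled cudf; `stripes=[s]` mirrors the call in the removed `_read_orc_stripe`:

import cudf
import dask.dataframe as dd
from dask import delayed
from pyarrow import orc

path = "part.orc"  # hypothetical input
nstripes = orc.ORCFile(path).nstripes

# One task per ORC stripe, mirroring the removed _read_orc_stripe.
parts = [delayed(cudf.read_orc)(path, stripes=[s]) for s in range(nstripes)]
meta = cudf.read_orc(path, stripes=[0]).iloc[:0]
ddf = dd.from_delayed(parts, meta=meta)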
+ +from __future__ import annotations + +import functools import itertools +import math +import os import warnings -from functools import partial -from io import BufferedWriter, BytesIO, IOBase from typing import TYPE_CHECKING, Any import numpy as np import pandas as pd -from pyarrow import dataset as pa_ds, parquet as pq +from dask_expr._expr import Elemwise +from dask_expr._util import _convert_to_list +from dask_expr.io.io import FusedIO, FusedParquetIO +from dask_expr.io.parquet import ( + FragmentWrapper, + ReadParquetFSSpec, + ReadParquetPyarrowFS, +) + +from dask._task_spec import Task +from dask.dataframe.io.parquet.arrow import _filters_to_expression +from dask.dataframe.io.parquet.core import ParquetFunctionWrapper +from dask.tokenize import tokenize +from dask.utils import parse_bytes -from dask import dataframe as dd -from dask.dataframe.io.parquet.arrow import ArrowDatasetEngine +import cudf -try: - from dask.dataframe.io.parquet import ( - create_metadata_file as create_metadata_file_dd, - ) -except ImportError: - create_metadata_file_dd = None +from dask_cudf import QUERY_PLANNING_ON, _deprecated_api -import cudf -from cudf.core.column import CategoricalColumn, as_column -from cudf.io import write_to_dataset -from cudf.io.parquet import _apply_post_filters, _normalize_filters -from cudf.utils.dtypes import cudf_dtype_from_pa_type +# Dask-expr imports CudfEngine from this module +from dask_cudf._legacy.io.parquet import CudfEngine # noqa: F401 +if TYPE_CHECKING: + from collections.abc import MutableMapping -class CudfEngine(ArrowDatasetEngine): - @classmethod - def _create_dd_meta(cls, dataset_info, **kwargs): - # Start with pandas-version of meta - meta_pd = super()._create_dd_meta(dataset_info, **kwargs) + +_DEVICE_SIZE_CACHE: int | None = None + + +def _get_device_size(): + try: + # Use PyNVML to find the worker device size. + import pynvml + + pynvml.nvmlInit() + index = os.environ.get("CUDA_VISIBLE_DEVICES", "0").split(",")[0] + if index and not index.isnumeric(): + # This means index is UUID. This works for both MIG and non-MIG device UUIDs. + handle = pynvml.nvmlDeviceGetHandleByUUID(str.encode(index)) + else: + # This is a device index + handle = pynvml.nvmlDeviceGetHandleByIndex(int(index)) + return pynvml.nvmlDeviceGetMemoryInfo(handle).total + + except ValueError: + # Fall back to a conservative 8GiB default + return 8 * 1024**3 + + +def _normalize_blocksize(fraction: float = 0.03125): + # Set the blocksize to fraction * <device-size>. + # We use the smallest worker device to set <device-size>. + # (Default blocksize is 1/32 * <device-size>) + global _DEVICE_SIZE_CACHE + + if _DEVICE_SIZE_CACHE is None: + try: + # Check distributed workers (if a client exists) + from distributed import get_client + + client = get_client() + # TODO: Check "GPU" worker resources only. + # Depends on (https://github.com/rapidsai/dask-cuda/pull/1401) + device_size = min(client.run(_get_device_size).values()) + except (ImportError, ValueError): + device_size = _get_device_size() + _DEVICE_SIZE_CACHE = device_size + + return int(_DEVICE_SIZE_CACHE * fraction) + + +class NoOp(Elemwise): + # Workaround - Always wrap read_parquet operations + # in a NoOp to trigger tune_up optimizations. 
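Returning briefly to `_normalize_blocksize` above: with the default `fraction=0.03125`, the reader targets 1/32 of the smallest visible device. A back-of-the-envelope check with the device size injected directly (hypothetical 40 GiB GPU; no PyNVML needed):

def normalize_blocksize(device_size, fraction=0.03125):
    # Same arithmetic as _normalize_blocksize, minus the device discovery.
    return int(device_size * fraction)

gib = 1024**3
print(normalize_blocksize(40 * gib) / gib)  # 1.25 -> a "1.25 GiB" blocksize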
+ _parameters = ["frame"] + _is_length_preserving = True + _projection_passthrough = True + _filter_passthrough = True + _preserves_partitioning_information = True + + @staticmethod + def operation(x): + return x + + +class CudfReadParquetFSSpec(ReadParquetFSSpec): + _STATS_CACHE: MutableMapping[str, Any] = {} + + def approx_statistics(self): + # Use a few files to approximate column-size statistics + key = tokenize(self._dataset_info["ds"].files[:10], self.filters) + try: + return self._STATS_CACHE[key] + + except KeyError: + # Account for filters + ds_filters = None + if self.filters is not None: + ds_filters = _filters_to_expression(self.filters) + + # Use average total_uncompressed_size of three files + n_sample = 3 + column_sizes = {} + for i, frag in enumerate( + self._dataset_info["ds"].get_fragments(ds_filters) + ): + md = frag.metadata + for rg in range(md.num_row_groups): + row_group = md.row_group(rg) + for col in range(row_group.num_columns): + column = row_group.column(col) + name = column.path_in_schema + if name not in column_sizes: + column_sizes[name] = np.zeros( + n_sample, dtype="int64" + ) + column_sizes[name][i] += column.total_uncompressed_size + if (i + 1) >= n_sample: + break + + # Reorganize stats to look like arrow-fs version + self._STATS_CACHE[key] = { + "columns": [ + { + "path_in_schema": name, + "total_uncompressed_size": np.mean(sizes), + } + for name, sizes in column_sizes.items() + ] + } + return self._STATS_CACHE[key] + + @functools.cached_property + def _fusion_compression_factor(self): + # Disable fusion when blocksize=None + if self.blocksize is None: + return 1 + + # At this point, we *may* have used `blockwise` + # already to split or aggregate files. We don't + # *know* if the current partitions correspond to + # individual/full files, multiple/aggregated files + # or partial/split files. + # + # Therefore, we need to use the statistics from + # a few files to estimate the current partition + # size. This size should be similar to `blocksize` + # *if* aggregate_files is True or if the files + # are *smaller* than `blocksize`. + + # Step 1: Sample statistics + approx_stats = self.approx_statistics() + projected_size, original_size = 0, 0 + col_op = self.operand("columns") or self.columns + for col in approx_stats["columns"]: + original_size += col["total_uncompressed_size"] + if col["path_in_schema"] in col_op or ( + (split_name := col["path_in_schema"].split(".")) + and split_name[0] in col_op + ): + projected_size += col["total_uncompressed_size"] + if original_size < 1 or projected_size < 1: + return 1 + + # Step 2: Estimate the correction factor + # (Correct for possible pre-optimization fusion/splitting) + blocksize = parse_bytes(self.blocksize) + if original_size > blocksize: + # Input files are bigger than blocksize + # and we already split these large files. + # (correction_factor > 1) + correction_factor = original_size / blocksize + elif self.aggregate_files: + # Input files are smaller than blocksize + # and we already aggregate small files. + # (correction_factor == 1) + correction_factor = 1 + else: + # Input files are smaller than blocksize + # but we haven't aggregate small files yet. + # (correction_factor < 1) + correction_factor = original_size / blocksize + + # Step 3. 
Estimate column-projection factor + if self.operand("columns") is None: + projection_factor = 1 + else: + projection_factor = projected_size / original_size + + return max(projection_factor * correction_factor, 0.001) + + def _tune_up(self, parent): + if self._fusion_compression_factor >= 1: + return + if isinstance(parent, FusedIO): + return + return parent.substitute(self, CudfFusedIO(self)) + + +class CudfReadParquetPyarrowFS(ReadParquetPyarrowFS): + _parameters = [ + "path", + "columns", + "filters", + "categories", + "index", + "storage_options", + "filesystem", + "blocksize", + "ignore_metadata_file", + "calculate_divisions", + "arrow_to_pandas", + "pyarrow_strings_enabled", + "kwargs", + "_partitions", + "_series", + "_dataset_info_cache", + ] + _defaults = { + "columns": None, + "filters": None, + "categories": None, + "index": None, + "storage_options": None, + "filesystem": None, + "blocksize": "256 MiB", + "ignore_metadata_file": True, + "calculate_divisions": False, + "arrow_to_pandas": None, + "pyarrow_strings_enabled": True, + "kwargs": None, + "_partitions": None, + "_series": False, + "_dataset_info_cache": None, + } + + @functools.cached_property + def _dataset_info(self): + from dask_cudf._legacy.io.parquet import ( + set_object_dtypes_from_pa_schema, + ) + + dataset_info = super()._dataset_info + meta_pd = dataset_info["base_meta"] + if isinstance(meta_pd, cudf.DataFrame): + return dataset_info # Convert to cudf # (drop unsupported timezone information) @@ -45,37 +272,56 @@ def _create_dd_meta(cls, dataset_info, **kwargs): kwargs.get("schema", None), ) - return meta_cudf + dataset_info["base_meta"] = meta_cudf + self.operands[type(self)._parameters.index("_dataset_info_cache")] = ( + dataset_info + ) + return dataset_info - @classmethod - def multi_support(cls): - # Assert that this class is CudfEngine - # and that multi-part reading is supported - return cls == CudfEngine + @staticmethod + def _table_to_pandas(table, index_name): + if isinstance(table, cudf.DataFrame): + df = table + else: + df = cudf.DataFrame.from_arrow(table) + if index_name is not None: + return df.set_index(index_name) + return df - @classmethod - def _read_paths( - cls, - paths, - fs, - columns=None, - row_groups=None, - filters=None, - partitions=None, - partitioning=None, - partition_keys=None, - open_file_options=None, - dataset_kwargs=None, - **kwargs, + @staticmethod + def _fragments_to_cudf_dataframe( + fragment_wrappers, + filters, + columns, + schema, ): - # Simplify row_groups if all None - if row_groups == [None for path in paths]: - row_groups = None + from dask.dataframe.io.utils import _is_local_fs + + from cudf.io.parquet import _apply_post_filters, _normalize_filters + + if not isinstance(fragment_wrappers, list): + fragment_wrappers = [fragment_wrappers] + + filesystem = None + paths, row_groups = [], [] + for fw in fragment_wrappers: + frag = fw.fragment if isinstance(fw, FragmentWrapper) else fw + paths.append(frag.path) + row_groups.append( + [rg.id for rg in frag.row_groups] if frag.row_groups else None + ) + if filesystem is None: + filesystem = frag.filesystem + + if _is_local_fs(filesystem): + filesystem = None + else: + from fsspec.implementations.arrow import ArrowFSWrapper + + filesystem = ArrowFSWrapper(filesystem) + protocol = filesystem.protocol + paths = [f"{protocol}://{path}" for path in paths] - # Make sure we read in the columns needed for row-wise - # filtering after IO. This means that one or more columns - # will be dropped almost immediately after IO. 
However, - # we do NEED these columns for accurate filtering. filters = _normalize_filters(filters) projected_columns = None if columns and filters: @@ -85,46 +331,19 @@ def _read_paths( | set(projected_columns) ) - dataset_kwargs = dataset_kwargs or {} - dataset_kwargs["partitioning"] = partitioning or "hive" + if row_groups == [None for path in paths]: + row_groups = None - # Use cudf to read in data - try: - df = cudf.read_parquet( - paths, - engine="cudf", - columns=columns, - row_groups=row_groups if row_groups else None, - dataset_kwargs=dataset_kwargs, - categorical_partitions=False, - filesystem=fs, - **kwargs, - ) - except RuntimeError as err: - # TODO: Remove try/except after null-schema issue is resolved - # (See: https://github.com/rapidsai/cudf/issues/12702) - if len(paths) > 1: - df = cudf.concat( - [ - cudf.read_parquet( - path, - engine="cudf", - columns=columns, - row_groups=row_groups[i] if row_groups else None, - dataset_kwargs=dataset_kwargs, - categorical_partitions=False, - filesystem=fs, - **kwargs, - ) - for i, path in enumerate(paths) - ] - ) - else: - raise err + df = cudf.read_parquet( + paths, + columns=columns, + filters=filters, + row_groups=row_groups, + dataset_kwargs={"schema": schema}, + ) # Apply filters (if any are defined) df = _apply_post_filters(df, filters) - if projected_columns: # Elements of `projected_columns` may now be in the index. # We must filter these names from our projection @@ -133,375 +352,493 @@ def _read_paths( ] df = df[projected_columns] - if partitions and partition_keys is None: - # Use `HivePartitioning` by default - ds = pa_ds.dataset( - paths, - filesystem=fs, - **dataset_kwargs, + # TODO: Deal with hive partitioning. + # Note that ReadParquetPyarrowFS does NOT support this yet anyway. + return df + + @functools.cached_property + def _use_device_io(self): + from dask.dataframe.io.utils import _is_local_fs + + # Use host for remote filesystem only + # (Unless we are using kvikio-S3) + return _is_local_fs(self.fs) or ( + self.fs.type_name == "s3" and cudf.get_option("kvikio_remote_io") + ) + + def _filtered_task(self, name, index: int): + columns = self.columns.copy() + index_name = self.index.name + if self.index is not None: + index_name = self.index.name + schema = self._dataset_info["schema"].remove_metadata() + if index_name: + if columns is None: + columns = list(schema.names) + columns.append(index_name) + + frag_to_table = self._fragment_to_table + if self._use_device_io: + frag_to_table = self._fragments_to_cudf_dataframe + + return Task( + name, + self._table_to_pandas, + Task( + None, + frag_to_table, + fragment_wrapper=FragmentWrapper( + self.fragments[index], filesystem=self.fs + ), + filters=self.filters, + columns=columns, + schema=schema, + ), + index_name=index_name, + ) + + @property + def _fusion_compression_factor(self): + blocksize = self.blocksize + if blocksize is None: + return 1 + elif blocksize == "default": + blocksize = "256MiB" + + projected_size = 0 + approx_stats = self.approx_statistics() + col_op = self.operand("columns") or self.columns + for col in approx_stats["columns"]: + if col["path_in_schema"] in col_op or ( + (split_name := col["path_in_schema"].split(".")) + and split_name[0] in col_op + ): + projected_size += col["total_uncompressed_size"] + + if projected_size < 1: + return 1 + + aggregate_files = max(1, int(parse_bytes(blocksize) / projected_size)) + return max(1 / aggregate_files, 0.001) + + def _tune_up(self, parent): + if self._fusion_compression_factor >= 1: + return + fused_cls = 
( + CudfFusedParquetIO + if self._use_device_io + else CudfFusedParquetIOHost + ) + if isinstance(parent, fused_cls): + return + return parent.substitute(self, fused_cls(self)) + + +class CudfFusedIO(FusedIO): + def _task(self, name, index: int): + expr = self.operand("_expr") + bucket = self._fusion_buckets[index] + io_func = expr._filtered_task(name, 0).func + if not isinstance( + io_func, ParquetFunctionWrapper + ) or io_func.common_kwargs.get("partitions", None): + # Just use "simple" fusion if we have an unexpected + # callable, or we are dealing with hive partitioning. + return Task( + name, + cudf.concat, + [expr._filtered_task(name, i) for i in bucket], ) - frag = next(ds.get_fragments()) - if frag: - # Extract hive-partition keys, and make sure they - # are ordered the same as they are in `partitions` - raw_keys = pa_ds._get_partition_keys(frag.partition_expression) - partition_keys = [ - (hive_part.name, raw_keys[hive_part.name]) - for hive_part in partitions - ] - if partition_keys: - if partitions is None: - raise ValueError("Must pass partition sets") - - for i, (name, index2) in enumerate(partition_keys): - if len(partitions[i].keys): - # Build a categorical column from `codes` directly - # (since the category is often a larger dtype) - codes = as_column( - partitions[i].keys.get_loc(index2), - length=len(df), - ) - df[name] = CategoricalColumn( - data=None, - size=codes.size, - dtype=cudf.CategoricalDtype( - categories=partitions[i].keys, ordered=False - ), - offset=codes.offset, - children=(codes,), - ) - elif name not in df.columns: - # Add non-categorical partition column - df[name] = as_column(index2, length=len(df)) + pieces = [] + for i in bucket: + piece = expr._filtered_task(name, i).args[0] + if isinstance(piece, list): + pieces.extend(piece) + else: + pieces.append(piece) + return Task(name, io_func, pieces) + - return df +class CudfFusedParquetIO(FusedParquetIO): + @functools.cached_property + def _fusion_buckets(self): + partitions = self.operand("_expr")._partitions + npartitions = len(partitions) + + step = math.ceil(1 / self.operand("_expr")._fusion_compression_factor) + + # TODO: Heuristic to limit fusion should probably + # account for the number of workers. For now, just + # limiting fusion to 100 partitions at once. 
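+        # Worked example (hypothetical number): if the wrapped expression
+        # reports _fusion_compression_factor = 0.125, then
+        # step = ceil(1 / 0.125) = 8 partitions are fused into each bucket,
+        # subject to the 100-partition cap applied below.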
+ step = min(step, 100) + + buckets = [ + partitions[i : i + step] for i in range(0, npartitions, step) + ] + return buckets @classmethod - def read_partition( + def _load_multiple_files( cls, - fs, - pieces, + frag_filters, columns, - index, - categories=(), - partitions=(), - filters=None, - partitioning=None, - schema=None, - open_file_options=None, - **kwargs, + schema, + **to_pandas_kwargs, ): - if columns is not None: - columns = [c for c in columns] - if isinstance(index, list): - columns += index - - dataset_kwargs = kwargs.get("dataset", {}) - partitioning = partitioning or dataset_kwargs.get("partitioning", None) - if isinstance(partitioning, dict): - partitioning = pa_ds.partitioning(**partitioning) - - # Check if we are actually selecting any columns - read_columns = columns - if schema and columns: - ignored = set(schema.names) - set(columns) - if not ignored: - read_columns = None - - if not isinstance(pieces, list): - pieces = [pieces] - - # Extract supported kwargs from `kwargs` - read_kwargs = kwargs.get("read", {}) - read_kwargs.update(open_file_options or {}) - check_file_size = read_kwargs.pop("check_file_size", None) - - # Wrap reading logic in a `try` block so that we can - # inform the user that the `read_parquet` partition - # size is too large for the available memory - try: - # Assume multi-piece read - paths = [] - rgs = [] - last_partition_keys = None - dfs = [] - - for i, piece in enumerate(pieces): - (path, row_group, partition_keys) = piece - row_group = None if row_group == [None] else row_group - - # File-size check to help "protect" users from change - # to up-stream `split_row_groups` default. We only - # check the file size if this partition corresponds - # to a full file, and `check_file_size` is defined - if check_file_size and len(pieces) == 1 and row_group is None: - file_size = fs.size(path) - if file_size > check_file_size: - warnings.warn( - f"A large parquet file ({file_size}B) is being " - f"used to create a DataFrame partition in " - f"read_parquet. This may cause out of memory " - f"exceptions in operations downstream. See the " - f"notes on split_row_groups in the read_parquet " - f"documentation. Setting split_row_groups " - f"explicitly will silence this warning." 
- ) - - if i > 0 and partition_keys != last_partition_keys: - dfs.append( - cls._read_paths( - paths, - fs, - columns=read_columns, - row_groups=rgs if rgs else None, - filters=filters, - partitions=partitions, - partitioning=partitioning, - partition_keys=last_partition_keys, - dataset_kwargs=dataset_kwargs, - **read_kwargs, - ) - ) - paths = [] - rgs = [] - last_partition_keys = None - paths.append(path) - rgs.append( - [row_group] - if not isinstance(row_group, list) - and row_group is not None - else row_group - ) - last_partition_keys = partition_keys - - dfs.append( - cls._read_paths( - paths, - fs, - columns=read_columns, - row_groups=rgs if rgs else None, - filters=filters, - partitions=partitions, - partitioning=partitioning, - partition_keys=last_partition_keys, - dataset_kwargs=dataset_kwargs, - **read_kwargs, - ) - ) - df = cudf.concat(dfs) if len(dfs) > 1 else dfs[0] - - # Re-set "object" dtypes align with pa schema - set_object_dtypes_from_pa_schema(df, schema) - - if index and (index[0] in df.columns): - df = df.set_index(index[0]) - elif index is False and df.index.names != [None]: - # If index=False, we shouldn't have a named index - df.reset_index(inplace=True) - - except MemoryError as err: - raise MemoryError( - "Parquet data was larger than the available GPU memory!\n\n" - "See the notes on split_row_groups in the read_parquet " - "documentation.\n\n" - "Original Error: " + str(err) - ) - raise err + frag_to_table = CudfReadParquetPyarrowFS._fragments_to_cudf_dataframe + return CudfReadParquetPyarrowFS._table_to_pandas( + frag_to_table( + [frag[0] for frag in frag_filters], + frag_filters[0][1], # TODO: Check for consistent filters? + columns, + schema, + ), + **to_pandas_kwargs, + ) - return df - @staticmethod - def write_partition( - df, - path, - fs, - filename, - partition_on, - return_metadata, - fmd=None, - compression="snappy", - index_cols=None, - **kwargs, +class CudfFusedParquetIOHost(CudfFusedParquetIO): + @classmethod + def _load_multiple_files( + cls, + frag_filters, + columns, + schema, + **to_pandas_kwargs, ): - preserve_index = False - if len(index_cols) and set(index_cols).issubset(set(df.columns)): - df.set_index(index_cols, drop=True, inplace=True) - preserve_index = True - if partition_on: - md = write_to_dataset( - df=df, - root_path=path, - compression=compression, - filename=filename, - partition_cols=partition_on, - fs=fs, - preserve_index=preserve_index, - return_metadata=return_metadata, - statistics=kwargs.get("statistics", "ROWGROUP"), - int96_timestamps=kwargs.get("int96_timestamps", False), - row_group_size_bytes=kwargs.get("row_group_size_bytes", None), - row_group_size_rows=kwargs.get("row_group_size_rows", None), - max_page_size_bytes=kwargs.get("max_page_size_bytes", None), - max_page_size_rows=kwargs.get("max_page_size_rows", None), - storage_options=kwargs.get("storage_options", None), - ) - else: - with fs.open(fs.sep.join([path, filename]), mode="wb") as out_file: - if not isinstance(out_file, IOBase): - out_file = BufferedWriter(out_file) - md = df.to_parquet( - path=out_file, - engine=kwargs.get("engine", "cudf"), - index=kwargs.get("index", None), - partition_cols=kwargs.get("partition_cols", None), - partition_file_name=kwargs.get( - "partition_file_name", None - ), - partition_offsets=kwargs.get("partition_offsets", None), - statistics=kwargs.get("statistics", "ROWGROUP"), - int96_timestamps=kwargs.get("int96_timestamps", False), - row_group_size_bytes=kwargs.get( - "row_group_size_bytes", None - ), - 
row_group_size_rows=kwargs.get( - "row_group_size_rows", None - ), - storage_options=kwargs.get("storage_options", None), - metadata_file_path=filename if return_metadata else None, - ) - # Return the schema needed to write the metadata - if return_metadata: - return [{"meta": md}] - else: - return [] - - @staticmethod - def write_metadata(parts, fmd, fs, path, append=False, **kwargs): - if parts: - # Aggregate metadata and write to _metadata file - metadata_path = fs.sep.join([path, "_metadata"]) - _meta = [] - if append and fmd is not None: - _meta = [fmd] - _meta.extend([parts[i][0]["meta"] for i in range(len(parts))]) - _meta = ( - cudf.io.merge_parquet_filemetadata(_meta) - if len(_meta) > 1 - else _meta[0] + import pyarrow as pa + + from dask.base import apply, tokenize + from dask.threaded import get + + token = tokenize(frag_filters, columns, schema) + name = f"pq-file-{token}" + dsk = { + (name, i): ( + CudfReadParquetPyarrowFS._fragment_to_table, + frag, + filter, + columns, + schema, ) - with fs.open(metadata_path, "wb") as fil: - fil.write(memoryview(_meta)) - - @classmethod - def collect_file_metadata(cls, path, fs, file_path): - with fs.open(path, "rb") as f: - meta = pq.ParquetFile(f).metadata - if file_path: - meta.set_file_path(file_path) - with BytesIO() as myio: - meta.write_metadata_file(myio) - myio.seek(0) - meta = np.frombuffer(myio.read(), dtype="uint8") - return meta + for i, (frag, filter) in enumerate(frag_filters) + } + dsk[name] = ( + apply, + pa.concat_tables, + [list(dsk.keys())], + {"promote_options": "permissive"}, + ) - @classmethod - def aggregate_metadata(cls, meta_list, fs, out_path): - meta = ( - cudf.io.merge_parquet_filemetadata(meta_list) - if len(meta_list) > 1 - else meta_list[0] + return CudfReadParquetPyarrowFS._table_to_pandas( + get(dsk, name), + **to_pandas_kwargs, ) - if out_path: - metadata_path = fs.sep.join([out_path, "_metadata"]) - with fs.open(metadata_path, "wb") as fil: - fil.write(memoryview(meta)) - return None - else: - return meta - - -def set_object_dtypes_from_pa_schema(df, schema): - # Simple utility to modify cudf DataFrame - # "object" dtypes to agree with a specific - # pyarrow schema. - if schema: - for col_name, col in df._data.items(): - if col_name is None: - # Pyarrow cannot handle `None` as a field name. - # However, this should be a simple range index that - # we can ignore anyway - continue - typ = cudf_dtype_from_pa_type(schema.field(col_name).type) - if ( - col_name in schema.names - and not isinstance(typ, (cudf.ListDtype, cudf.StructDtype)) - and isinstance(col, cudf.core.column.StringColumn) - ): - df._data[col_name] = col.astype(typ) -def read_parquet(path, columns=None, **kwargs): +def read_parquet_expr( + path, + *args, + columns=None, + filters=None, + categories=None, + index=None, + storage_options=None, + dtype_backend=None, + calculate_divisions=False, + ignore_metadata_file=False, + metadata_task_size=None, + split_row_groups="infer", + blocksize="default", + aggregate_files=None, + parquet_file_extension=(".parq", ".parquet", ".pq"), + filesystem="fsspec", + engine=None, + arrow_to_pandas=None, + open_file_options=None, + **kwargs, +): + """ + Read a Parquet file into a Dask-cuDF DataFrame. + + This reads a directory of Parquet data into a DataFrame collection. + Partitioning behavior mostly depends on the ``blocksize`` argument. + + .. note:: + Dask may automatically resize partitions at optimization time. + Please set ``blocksize=None`` to disable this behavior in Dask cuDF. 
+ (NOTE: This will not disable fusion for the "pandas" backend) + + .. note:: + Specifying ``filesystem="arrow"`` leverages a complete reimplementation of + the Parquet reader that is solely based on PyArrow. It is faster than the + legacy implementation in some cases, but doesn't yet support all features. + + Parameters + ---------- + path : str or list + Source directory for data, or path(s) to individual parquet files. + Prefix with a protocol like ``s3://`` to read from alternative + filesystems. To read from multiple files you can pass a globstring or a + list of paths, with the caveat that they must all have the same + protocol. + columns : str or list, default None + Field name(s) to read in as columns in the output. By default all + non-index fields will be read (as determined by the pandas parquet + metadata, if present). Provide a single field name instead of a list to + read in the data as a Series. + filters : Union[List[Tuple[str, str, Any]], List[List[Tuple[str, str, Any]]]], default None + List of filters to apply, like ``[[('col1', '==', 0), ...], ...]``. + Using this argument will result in row-wise filtering of the final partitions. + + Predicates can be expressed in disjunctive normal form (DNF). This means that + the inner-most tuple describes a single column predicate. These inner predicates + are combined with an AND conjunction into a larger predicate. The outer-most + list then combines all of the combined filters with an OR disjunction. + + Predicates can also be expressed as a ``List[Tuple]``. These are evaluated + as an AND conjunction. To express OR in predicates, one must use the + (preferred for "pyarrow") ``List[List[Tuple]]`` notation. + index : str, list or False, default None + Field name(s) to use as the output frame index. By default will be + inferred from the pandas parquet file metadata, if present. Use ``False`` + to read all fields as columns. + categories : list or dict, default None + For any fields listed here, if the parquet encoding is Dictionary, + the column will be created with dtype category. Use only if it is + guaranteed that the column is encoded as dictionary in all row-groups. + If a list, assumes up to 2**16-1 labels; if a dict, specify the number + of labels expected; if None, will load categories automatically for + data written by dask, not otherwise. + storage_options : dict, default None + Key/value pairs to be passed on to the file-system backend, if any. + Note that the default file-system backend can be configured with the + ``filesystem`` argument, described below. + calculate_divisions : bool, default False + Whether to use min/max statistics from the footer metadata (or global + ``_metadata`` file) to calculate divisions for the output DataFrame + collection. Divisions will not be calculated if statistics are missing. + This option will be ignored if ``index`` is not specified and there is + no physical index column specified in the custom "pandas" Parquet + metadata. Note that ``calculate_divisions=True`` may be extremely slow + when no global ``_metadata`` file is present, especially when reading + from remote storage. Set this to ``True`` only when known divisions + are needed for your workload (see :ref:`dataframe-design-partitions`). + ignore_metadata_file : bool, default False + Whether to ignore the global ``_metadata`` file (when one is present). + If ``True``, or if the global ``_metadata`` file is missing, the parquet + metadata may be gathered and processed in parallel. 
Parallel metadata + processing is currently supported for ``ArrowDatasetEngine`` only. + metadata_task_size : int, default configurable + If parquet metadata is processed in parallel (see ``ignore_metadata_file`` + description above), this argument can be used to specify the number of + dataset files to be processed by each task in the Dask graph. If this + argument is set to ``0``, parallel metadata processing will be disabled. + The default values for local and remote filesystems can be specified + with the "metadata-task-size-local" and "metadata-task-size-remote" + config fields, respectively (see "dataframe.parquet"). + split_row_groups : 'infer', 'adaptive', bool, or int, default 'infer' + WARNING: The ``split_row_groups`` argument is now deprecated, please use + ``blocksize`` instead. + + blocksize : int, float or str, default 'default' + The desired size of each output ``DataFrame`` partition in terms of total + (uncompressed) parquet storage space. This argument may be used to split + large files or aggregate small files into the same partition. Use ``None`` + for a simple 1:1 mapping between files and partitions. Use a float value + less than 1.0 to specify the fractional size of the partitions with + respect to the total memory of the first NVIDIA GPU on your machine. + Default is 1/32 the total memory of a single GPU. + aggregate_files : bool or str, default None + WARNING: The behavior of ``aggregate_files=True`` is now obsolete + when query-planning is enabled (the default). Small files are now + aggregated automatically according to the ``blocksize`` setting. + Please expect this argument to be deprecated in a future release. + + WARNING: Passing a string argument to ``aggregate_files`` will result + in experimental behavior that may be removed at any time. + + parquet_file_extension: str, tuple[str], or None, default (".parq", ".parquet", ".pq") + A file extension or an iterable of extensions to use when discovering + parquet files in a directory. Files that don't match these extensions + will be ignored. This argument only applies when ``paths`` corresponds + to a directory and no ``_metadata`` file is present (or + ``ignore_metadata_file=True``). Passing in ``parquet_file_extension=None`` + will treat all files in the directory as parquet files. + + The purpose of this argument is to ensure that the engine will ignore + unsupported metadata files (like Spark's '_SUCCESS' and 'crc' files). + It may be necessary to change this argument if the data files in your + parquet dataset do not end in ".parq", ".parquet", or ".pq". + filesystem: "fsspec", "arrow", or fsspec.AbstractFileSystem backend to use. + dataset: dict, default None + Dictionary of options to use when creating a ``pyarrow.dataset.Dataset`` object. + These options may include a "filesystem" key to configure the desired + file-system backend. However, the top-level ``filesystem`` argument will always + take precedence. + + **Note**: The ``dataset`` options may include a "partitioning" key. + However, since ``pyarrow.dataset.Partitioning`` + objects cannot be serialized, the value can be a dict of key-word + arguments for the ``pyarrow.dataset.partitioning`` API + (e.g. ``dataset={"partitioning": {"flavor": "hive", "schema": ...}}``). + Note that partitioned columns will not be converted to categorical + dtypes when a custom partitioning schema is specified in this way. + read: dict, default None + Dictionary of options to pass through to ``CudfEngine.read_partitions`` + using the ``read`` key-word argument. 
""" - Read parquet files into a :class:`.DataFrame`. - Calls :func:`dask.dataframe.read_parquet` with ``engine=CudfEngine`` - to coordinate the execution of :func:`cudf.read_parquet`, and to - ultimately create a :class:`.DataFrame` collection. + import dask_expr as dx + from fsspec.utils import stringify_path + from pyarrow import fs as pa_fs - See the :func:`dask.dataframe.read_parquet` documentation for - all available options. + from dask.core import flatten + from dask.dataframe.utils import pyarrow_strings_enabled - Examples - -------- - >>> from dask_cudf import read_parquet - >>> df = read_parquet("/path/to/dataset/") # doctest: +SKIP + from dask_cudf.backends import PYARROW_GE_15 - When dealing with one or more large parquet files having an - in-memory footprint >15% device memory, the ``split_row_groups`` - argument should be used to map Parquet **row-groups** to DataFrame - partitions (instead of **files** to partitions). For example, the - following code will map each row-group to a distinct partition: + if args: + raise ValueError(f"Unexpected positional arguments: {args}") - >>> df = read_parquet(..., split_row_groups=True) # doctest: +SKIP + if open_file_options is not None: + raise ValueError( + "The open_file_options argument is no longer supported " + "by the 'cudf' backend." + ) + if dtype_backend is not None: + raise NotImplementedError( + "dtype_backend is not supported by the 'cudf' backend." + ) + if arrow_to_pandas is not None: + raise NotImplementedError( + "arrow_to_pandas is not supported by the 'cudf' backend." + ) + if engine not in (None, "cudf", CudfEngine): + raise NotImplementedError( + "engine={engine} is not supported by the 'cudf' backend." + ) - To map **multiple** row-groups to each partition, an integer can be - passed to ``split_row_groups`` to specify the **maximum** number of - row-groups allowed in each output partition: + if not isinstance(path, str): + path = stringify_path(path) - >>> df = read_parquet(..., split_row_groups=10) # doctest: +SKIP + kwargs["dtype_backend"] = None + if arrow_to_pandas: + kwargs["arrow_to_pandas"] = None - See Also - -------- - cudf.read_parquet - dask.dataframe.read_parquet - """ - if isinstance(columns, str): - columns = [columns] + if filters is not None: + for filter in flatten(filters, container=list): + _, op, val = filter + if op == "in" and not isinstance(val, (set, list, tuple)): + raise TypeError( + "Value of 'in' filter must be a list, set or tuple." + ) + + # Normalize blocksize input + if blocksize == "default": + blocksize = _normalize_blocksize() + elif isinstance(blocksize, float) and blocksize < 1: + blocksize = _normalize_blocksize(blocksize) - # Set "check_file_size" option to determine whether we - # should check the parquet-file size. This check is meant - # to "protect" users from `split_row_groups` default changes - check_file_size = kwargs.pop("check_file_size", 500_000_000) if ( - check_file_size - and ("split_row_groups" not in kwargs) - and ("chunksize" not in kwargs) + isinstance(filesystem, pa_fs.FileSystem) + or isinstance(filesystem, str) + and filesystem.lower() in ("arrow", "pyarrow") ): - # User is not specifying `split_row_groups` or `chunksize`, - # so we should warn them if/when a file is ~>0.5GB on disk. - # They can set `split_row_groups` explicitly to silence/skip - # this check - if "read" not in kwargs: - kwargs["read"] = {} - kwargs["read"]["check_file_size"] = check_file_size + # EXPERIMENTAL filesystem="arrow" support. + # This code path may use PyArrow for remote IO. 
- return dd.read_parquet(path, columns=columns, engine=CudfEngine, **kwargs) + # CudfReadParquetPyarrowFS requires import of distributed beforehand + # (See: https://github.com/dask/dask/issues/11352) + import distributed # noqa: F401 + if not PYARROW_GE_15: + raise ValueError( + "pyarrow>=15.0.0 is required to use the pyarrow filesystem." + ) + if metadata_task_size is not None: + warnings.warn( + "metadata_task_size is not supported when using the pyarrow filesystem." + " This argument will be ignored!" + ) + if aggregate_files is not None: + warnings.warn( + "aggregate_files is not supported when using the pyarrow filesystem." + " This argument will be ignored!" + ) + if split_row_groups != "infer": + warnings.warn( + "split_row_groups is not supported when using the pyarrow filesystem." + " This argument will be ignored!" + ) + if parquet_file_extension != (".parq", ".parquet", ".pq"): + raise NotImplementedError( + "parquet_file_extension is not supported when using the pyarrow filesystem." + ) -to_parquet = partial(dd.to_parquet, engine=CudfEngine) + return dx.new_collection( + NoOp( + CudfReadParquetPyarrowFS( + path, + columns=_convert_to_list(columns), + filters=filters, + categories=categories, + index=index, + calculate_divisions=calculate_divisions, + storage_options=storage_options, + filesystem=filesystem, + blocksize=blocksize, + ignore_metadata_file=ignore_metadata_file, + arrow_to_pandas=None, + pyarrow_strings_enabled=pyarrow_strings_enabled(), + kwargs=kwargs, + _series=isinstance(columns, str), + ), + ) + ) -if create_metadata_file_dd is None: - create_metadata_file = create_metadata_file_dd + return dx.new_collection( + NoOp( + CudfReadParquetFSSpec( + path, + columns=_convert_to_list(columns), + filters=filters, + categories=categories, + index=index, + blocksize=blocksize, + storage_options=storage_options, + calculate_divisions=calculate_divisions, + ignore_metadata_file=ignore_metadata_file, + metadata_task_size=metadata_task_size, + split_row_groups=split_row_groups, + aggregate_files=aggregate_files, + parquet_file_extension=parquet_file_extension, + filesystem=filesystem, + engine=CudfEngine, + kwargs=kwargs, + _series=isinstance(columns, str), + ), + ) + ) + + +if QUERY_PLANNING_ON: + read_parquet = read_parquet_expr + read_parquet.__doc__ = read_parquet_expr.__doc__ else: - create_metadata_file = partial(create_metadata_file_dd, engine=CudfEngine) + read_parquet = _deprecated_api( + "The legacy dask_cudf.io.parquet.read_parquet API", + new_api="dask_cudf.read_parquet", + rec="", + ) +to_parquet = _deprecated_api( + "dask_cudf.io.parquet.to_parquet", + new_api="dask_cudf._legacy.io.parquet.to_parquet", + rec="Please use the DataFrame.to_parquet method instead.", +) +create_metadata_file = _deprecated_api( + "dask_cudf.io.parquet.create_metadata_file", + new_api="dask_cudf._legacy.io.parquet.create_metadata_file", + rec="Please raise an issue if this feature is needed.", +) diff --git a/python/dask_cudf/dask_cudf/io/tests/test_csv.py b/python/dask_cudf/dask_cudf/io/tests/test_csv.py index a35a9f1be48..a0acb86f5a9 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_csv.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_csv.py @@ -264,3 +264,18 @@ def test_read_csv_nrows_error(csv_end_bad_lines): dask_cudf.read_csv( csv_end_bad_lines, nrows=2, blocksize="100 MiB" ).compute() + + +def test_deprecated_api_paths(tmp_path): + csv_path = str(tmp_path / "data-*.csv") + df = dask_cudf.DataFrame.from_dict({"a": range(100)}, npartitions=1) + df.to_csv(csv_path, 
index=False) + + # Encourage top-level read_csv import only + with pytest.warns(match="dask_cudf.io.read_csv is now deprecated"): + df2 = dask_cudf.io.read_csv(csv_path) + dd.assert_eq(df, df2, check_divisions=False) + + with pytest.warns(match="dask_cudf.io.csv.read_csv is now deprecated"): + df2 = dask_cudf.io.csv.read_csv(csv_path) + dd.assert_eq(df, df2, check_divisions=False) diff --git a/python/dask_cudf/dask_cudf/io/tests/test_json.py b/python/dask_cudf/dask_cudf/io/tests/test_json.py index abafbffd197..f5509cf91c3 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_json.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_json.py @@ -126,3 +126,18 @@ def test_read_json_aggregate_files(tmp_path): assert name in df2.columns assert len(df2[name].compute().unique()) == df1.npartitions dd.assert_eq(df1, df2.drop(columns=[name]), check_index=False) + + +def test_deprecated_api_paths(tmp_path): + path = str(tmp_path / "data-*.json") + df = dd.from_dict({"a": range(100)}, npartitions=1) + df.to_json(path) + + # Encourage top-level read_json import only + with pytest.warns(match="dask_cudf.io.read_json is now deprecated"): + df2 = dask_cudf.io.read_json(path) + dd.assert_eq(df, df2, check_divisions=False) + + with pytest.warns(match="dask_cudf.io.json.read_json is now deprecated"): + df2 = dask_cudf.io.json.read_json(path) + dd.assert_eq(df, df2, check_divisions=False) diff --git a/python/dask_cudf/dask_cudf/io/tests/test_orc.py b/python/dask_cudf/dask_cudf/io/tests/test_orc.py index 457e5546bd9..b6064d851ca 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_orc.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_orc.py @@ -145,3 +145,21 @@ def test_to_orc(tmpdir, dtypes, compression, compute): # the cudf dataframes (df and df_read) dd.assert_eq(df, ddf_read) dd.assert_eq(df_read, ddf_read) + + +def test_deprecated_api_paths(tmpdir): + df = dask_cudf.DataFrame.from_dict({"a": range(100)}, npartitions=1) + path = tmpdir.join("test.orc") + # Top-level to_orc function is deprecated + with pytest.warns(match="dask_cudf.to_orc is now deprecated"): + dask_cudf.to_orc(df, path, write_index=False) + + # Encourage top-level read_orc import only + paths = glob.glob(str(path) + "/*.orc") + with pytest.warns(match="dask_cudf.io.read_orc is now deprecated"): + df2 = dask_cudf.io.read_orc(paths) + dd.assert_eq(df, df2, check_divisions=False) + + with pytest.warns(match="dask_cudf.io.orc.read_orc is now deprecated"): + df2 = dask_cudf.io.orc.read_orc(paths) + dd.assert_eq(df, df2, check_divisions=False) diff --git a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py index 620a917109e..6efe6c4f388 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py @@ -15,12 +15,17 @@ import cudf import dask_cudf -from dask_cudf.tests.utils import skip_dask_expr, xfail_dask_expr +from dask_cudf._legacy.io.parquet import create_metadata_file +from dask_cudf.tests.utils import ( + require_dask_expr, + skip_dask_expr, + xfail_dask_expr, +) # Check if create_metadata_file is supported by # the current dask.dataframe version need_create_meta = pytest.mark.skipif( - dask_cudf.io.parquet.create_metadata_file is None, + create_metadata_file is None, reason="Need create_metadata_file support in dask.dataframe.", ) @@ -41,7 +46,7 @@ def test_roundtrip_backend_dispatch(tmpdir): tmpdir = str(tmpdir) ddf.to_parquet(tmpdir, engine="pyarrow") with dask.config.set({"dataframe.backend": "cudf"}): - ddf2 = 
dd.read_parquet(tmpdir, index=False) + ddf2 = dd.read_parquet(tmpdir, index=False, blocksize=None) assert isinstance(ddf2, dask_cudf.DataFrame) dd.assert_eq(ddf.reset_index(drop=False), ddf2) @@ -95,7 +100,7 @@ def test_roundtrip_from_dask_index_false(tmpdir): tmpdir = str(tmpdir) ddf.to_parquet(tmpdir, engine="pyarrow") - ddf2 = dask_cudf.read_parquet(tmpdir, index=False) + ddf2 = dask_cudf.read_parquet(tmpdir, index=False, blocksize=None) dd.assert_eq(ddf.reset_index(drop=False), ddf2) @@ -371,12 +376,12 @@ def test_split_row_groups(tmpdir, row_groups, index): row_group_size = 5 file_row_groups = 10 # Known apriori npartitions_expected = math.ceil(file_row_groups / row_groups) * 2 - + rng = np.random.default_rng(seed=0) df = pd.DataFrame( { - "a": np.random.choice(["apple", "banana", "carrot"], size=df_size), - "b": np.random.random(size=df_size), - "c": np.random.randint(1, 5, size=df_size), + "a": rng.choice(["apple", "banana", "carrot"], size=df_size), + "b": rng.random(size=df_size), + "c": rng.integers(1, 5, size=df_size), "index": np.arange(0, df_size), } ) @@ -421,10 +426,14 @@ def test_create_metadata_file(tmpdir, partition_on): fns = glob.glob(os.path.join(tmpdir, partition_on + "=*/*.parquet")) else: fns = glob.glob(os.path.join(tmpdir, "*.parquet")) - dask_cudf.io.parquet.create_metadata_file( - fns, - split_every=3, # Force tree reduction - ) + + with pytest.warns( + match="dask_cudf.io.parquet.create_metadata_file is now deprecated" + ): + dask_cudf.io.parquet.create_metadata_file( + fns, + split_every=3, # Force tree reduction + ) # Check that we can now read the ddf # with the _metadata file present @@ -468,7 +477,7 @@ def test_create_metadata_file_inconsistent_schema(tmpdir): # Add global metadata file. # Dask-CuDF can do this without requiring schema # consistency. 
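
The test hunks from here on repeat one mechanical migration: NumPy's legacy global-state RNG is swapped for the `Generator` API. A short sketch of the mapping, with illustrative shapes and bounds only:

```python
import numpy as np

# Legacy pattern being removed from the tests:
np.random.seed(0)
a = np.random.randint(0, 5, size=10)
b = np.random.random(size=10)
c = np.random.randn(10)

# Replacement pattern used throughout this diff:
rng = np.random.default_rng(seed=0)
a = rng.integers(0, 5, size=10)      # randint          -> integers
b = rng.random(size=10)              # random           -> random
c = rng.standard_normal(size=10)     # randn            -> standard_normal
d = rng.choice(["x", "y"], size=10)  # np.random.choice -> rng.choice
```

Note that the two APIs draw from different bit generators (MT19937 versus PCG64), so seeded values differ; the tests simply re-seed per case rather than trying to reproduce the old streams.
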
- dask_cudf.io.parquet.create_metadata_file([p0, p1]) + create_metadata_file([p0, p1]) # Check that we can still read the ddf # with the _metadata file present @@ -529,9 +538,9 @@ def test_check_file_size(tmpdir): fn = str(tmpdir.join("test.parquet")) cudf.DataFrame({"a": np.arange(1000)}).to_parquet(fn) with pytest.warns(match="large parquet file"): - # Need to use `dask_cudf.io` path + # Need to use `dask_cudf._legacy.io` path # TODO: Remove outdated `check_file_size` functionality - dask_cudf.io.read_parquet(fn, check_file_size=1).compute() + dask_cudf._legacy.io.read_parquet(fn, check_file_size=1).compute() @xfail_dask_expr("HivePartitioning cannot be hashed", lt_version="2024.3.0") @@ -615,3 +624,70 @@ def test_timezone_column(tmpdir): got = dask_cudf.read_parquet(path) expect = cudf.read_parquet(path) dd.assert_eq(got, expect) + + +@require_dask_expr() +@pytest.mark.skipif( + not dask_cudf.backends.PYARROW_GE_15, + reason="Requires pyarrow 15", +) +@pytest.mark.parametrize("min_part_size", ["1B", "1GB"]) +def test_read_parquet_arrow_filesystem(tmpdir, min_part_size): + tmp_path = str(tmpdir) + with dask.config.set( + { + "dataframe.backend": "cudf", + "dataframe.parquet.minimum-partition-size": min_part_size, + } + ): + dd.from_dict( + {"x": range(1000), "y": ["a", "b", "c", "d"] * 250}, + npartitions=10, + ).to_parquet(tmp_path, write_index=False) + df = cudf.read_parquet(tmp_path) + ddf = dask_cudf.read_parquet(tmp_path, filesystem="arrow") + dd.assert_eq(df, ddf, check_index=False) + assert isinstance(ddf._meta, cudf.DataFrame) + assert isinstance(ddf.compute(), cudf.DataFrame) + + +@pytest.mark.parametrize("write_metadata_file", [True, False]) +def test_to_parquet_append(tmpdir, write_metadata_file): + df = cudf.DataFrame({"a": [1, 2, 3]}) + ddf = dask_cudf.from_cudf(df, npartitions=1) + ddf.to_parquet( + tmpdir, + append=True, + write_metadata_file=write_metadata_file, + write_index=False, + ) + ddf.to_parquet( + tmpdir, + append=True, + write_metadata_file=write_metadata_file, + write_index=False, + ) + ddf2 = dask_cudf.read_parquet(tmpdir, blocksize=None) + dd.assert_eq(cudf.concat([df, df]), ddf2) + + +def test_deprecated_api_paths(tmpdir): + df = dask_cudf.DataFrame.from_dict({"a": range(100)}, npartitions=1) + # io.to_parquet function is deprecated + with pytest.warns(match="dask_cudf.io.to_parquet is now deprecated"): + dask_cudf.io.to_parquet(df, tmpdir) + + if dask_cudf.QUERY_PLANNING_ON: + df2 = dask_cudf.io.read_parquet(tmpdir) + dd.assert_eq(df, df2, check_divisions=False) + + df2 = dask_cudf.io.parquet.read_parquet(tmpdir) + dd.assert_eq(df, df2, check_divisions=False) + else: + with pytest.warns(match="legacy dask_cudf.io.read_parquet"): + df2 = dask_cudf.io.read_parquet(tmpdir) + dd.assert_eq(df, df2, check_divisions=False) + + with pytest.warns(match="legacy dask_cudf.io.parquet.read_parquet"): + df2 = dask_cudf.io.parquet.read_parquet(tmpdir) + dd.assert_eq(df, df2, check_divisions=False) diff --git a/python/dask_cudf/dask_cudf/io/tests/test_s3.py b/python/dask_cudf/dask_cudf/io/tests/test_s3.py index cf8af82e112..90907f6fb99 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_s3.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_s3.py @@ -11,6 +11,8 @@ from dask.dataframe import assert_eq +import cudf + import dask_cudf from dask_cudf.tests.utils import QUERY_PLANNING_ON @@ -168,6 +170,8 @@ def test_read_parquet_filesystem(s3_base, s3so, pdf, filesystem): filesystem=filesystem, ) assert df.b.sum().compute() == 9 + assert isinstance(df._meta, cudf.DataFrame) + 
assert isinstance(df.compute(), cudf.DataFrame) def test_read_parquet_filesystem_explicit(s3_base, s3so, pdf): diff --git a/python/dask_cudf/dask_cudf/io/tests/test_text.py b/python/dask_cudf/dask_cudf/io/tests/test_text.py index 8912b7d5da6..e35b6411a9d 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_text.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_text.py @@ -34,3 +34,15 @@ def test_read_text_byte_range(offset, size): text_file, chunksize=None, delimiter=".", byte_range=(offset, size) ) dd.assert_eq(df1, df2, check_index=False) + + +def test_deprecated_api_paths(): + # Encourage top-level read_text import only + df = cudf.read_text(text_file, delimiter=".") + with pytest.warns(match="dask_cudf.io.read_text is now deprecated"): + df2 = dask_cudf.io.read_text(text_file, delimiter=".") + dd.assert_eq(df, df2, check_divisions=False) + + with pytest.warns(match="dask_cudf.io.text.read_text is now deprecated"): + df2 = dask_cudf.io.text.read_text(text_file, delimiter=".") + dd.assert_eq(df, df2, check_divisions=False) diff --git a/python/dask_cudf/dask_cudf/io/text.py b/python/dask_cudf/dask_cudf/io/text.py index 9cdb7c5220b..1caf4e81d8e 100644 --- a/python/dask_cudf/dask_cudf/io/text.py +++ b/python/dask_cudf/dask_cudf/io/text.py @@ -1,54 +1,8 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Copyright (c) 2024, NVIDIA CORPORATION. -import os -from glob import glob +from dask_cudf import _deprecated_api -import dask.dataframe as dd -from dask.base import tokenize -from dask.utils import apply, parse_bytes - -import cudf - - -def read_text(path, chunksize="256 MiB", **kwargs): - if isinstance(chunksize, str): - chunksize = parse_bytes(chunksize) - - if isinstance(path, list): - filenames = path - elif isinstance(path, str): - filenames = sorted(glob(path)) - elif hasattr(path, "__fspath__"): - filenames = sorted(glob(path.__fspath__())) - else: - raise TypeError(f"Path type not understood:{type(path)}") - - if not filenames: - msg = f"A file in: {filenames} does not exist." - raise FileNotFoundError(msg) - - name = "read-text-" + tokenize(path, tokenize, **kwargs) - - if chunksize: - dsk = {} - i = 0 - for fn in filenames: - size = os.path.getsize(fn) - for start in range(0, size, chunksize): - kwargs1 = kwargs.copy() - kwargs1["byte_range"] = ( - start, - chunksize, - ) # specify which chunk of the file we care about - - dsk[(name, i)] = (apply, cudf.read_text, [fn], kwargs1) - i += 1 - else: - dsk = { - (name, i): (apply, cudf.read_text, [fn], kwargs) - for i, fn in enumerate(filenames) - } - - meta = cudf.Series([], dtype="O") - divisions = [None] * (len(dsk) + 1) - return dd.core.new_dd_object(dsk, name, meta, divisions) +read_text = _deprecated_api( + "dask_cudf.io.text.read_text", + new_api="dask_cudf.read_text", +) diff --git a/python/dask_cudf/dask_cudf/tests/pytest.ini b/python/dask_cudf/dask_cudf/tests/pytest.ini deleted file mode 100644 index 7b0a9f29fb1..00000000000 --- a/python/dask_cudf/dask_cudf/tests/pytest.ini +++ /dev/null @@ -1,4 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. 
- -[pytest] -addopts = --tb=native diff --git a/python/dask_cudf/dask_cudf/tests/test_accessor.py b/python/dask_cudf/dask_cudf/tests/test_accessor.py index 6f04b5737da..3fbb2aacd2c 100644 --- a/python/dask_cudf/dask_cudf/tests/test_accessor.py +++ b/python/dask_cudf/dask_cudf/tests/test_accessor.py @@ -25,7 +25,7 @@ def data_dt_1(): def data_dt_2(): - return np.random.randn(100) + return np.random.default_rng(seed=0).standard_normal(size=100) dt_fields = ["year", "month", "day", "hour", "minute", "second"] diff --git a/python/dask_cudf/dask_cudf/tests/test_binops.py b/python/dask_cudf/dask_cudf/tests/test_binops.py index 87bd401accd..8c51f950765 100644 --- a/python/dask_cudf/dask_cudf/tests/test_binops.py +++ b/python/dask_cudf/dask_cudf/tests/test_binops.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. import operator @@ -21,10 +21,11 @@ def _make_empty_frame(npartitions=2): def _make_random_frame_float(nelem, npartitions=2): + rng = np.random.default_rng(seed=0) df = pd.DataFrame( { - "x": np.random.randint(0, 5, size=nelem), - "y": np.random.normal(size=nelem) + 1, + "x": rng.integers(0, 5, size=nelem), + "y": rng.normal(size=nelem) + 1, } ) gdf = cudf.from_pandas(df) @@ -51,7 +52,6 @@ def _make_random_frame_float(nelem, npartitions=2): @pytest.mark.parametrize("binop", _binops) def test_series_binops_integer(binop): - np.random.seed(0) size = 1000 lhs_df, lhs_gdf = _make_random_frame(size) rhs_df, rhs_gdf = _make_random_frame(size) @@ -62,7 +62,6 @@ def test_series_binops_integer(binop): @pytest.mark.parametrize("binop", _binops) def test_series_binops_float(binop): - np.random.seed(0) size = 1000 lhs_df, lhs_gdf = _make_random_frame_float(size) rhs_df, rhs_gdf = _make_random_frame_float(size) @@ -73,10 +72,10 @@ def test_series_binops_float(binop): @pytest.mark.parametrize("operator", _binops) def test_df_series_bind_ops(operator): - np.random.seed(0) + rng = np.random.default_rng(seed=0) size = 1000 lhs_df, lhs_gdf = _make_random_frame_float(size) - rhs = np.random.rand() + rhs = rng.random() for col in lhs_gdf.columns: got = getattr(lhs_gdf[col], operator.__name__)(rhs) diff --git a/python/dask_cudf/dask_cudf/tests/test_core.py b/python/dask_cudf/dask_cudf/tests/test_core.py index 5f0fae86691..5130b804179 100644 --- a/python/dask_cudf/dask_cudf/tests/test_core.py +++ b/python/dask_cudf/dask_cudf/tests/test_core.py @@ -22,13 +22,15 @@ xfail_dask_expr, ) +rng = np.random.default_rng(seed=0) + def test_from_dict_backend_dispatch(): # Test ddf.from_dict cudf-backend dispatch - np.random.seed(0) + rng = np.random.default_rng(seed=0) data = { - "x": np.random.randint(0, 5, size=10000), - "y": np.random.normal(size=10000), + "x": rng.integers(0, 5, size=10000), + "y": rng.normal(size=10000), } expect = cudf.DataFrame(data) with dask.config.set({"dataframe.backend": "cudf"}): @@ -37,35 +39,11 @@ def test_from_dict_backend_dispatch(): dd.assert_eq(expect, ddf) -def test_to_dask_dataframe_deprecated(): - gdf = cudf.DataFrame({"a": range(100)}) - ddf = dd.from_pandas(gdf, npartitions=2) - assert isinstance(ddf._meta, cudf.DataFrame) - - with pytest.warns(FutureWarning, match="API is now deprecated"): - assert isinstance( - ddf.to_dask_dataframe()._meta, - pd.DataFrame, - ) - - -def test_from_dask_dataframe_deprecated(): - gdf = pd.DataFrame({"a": range(100)}) - ddf = dd.from_pandas(gdf, npartitions=2) - assert isinstance(ddf._meta, pd.DataFrame) - - with pytest.warns(FutureWarning, match="API is now deprecated"): - assert isinstance( 
- dask_cudf.from_dask_dataframe(ddf)._meta, - cudf.DataFrame, - ) - - def test_to_backend(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) data = { - "x": np.random.randint(0, 5, size=10000), - "y": np.random.normal(size=10000), + "x": rng.integers(0, 5, size=10000), + "y": rng.normal(size=10000), } with dask.config.set({"dataframe.backend": "pandas"}): ddf = dd.from_dict(data, npartitions=2) @@ -114,12 +92,12 @@ def test_to_backend_kwargs(): def test_from_pandas(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = pd.DataFrame( { - "x": np.random.randint(0, 5, size=10000), - "y": np.random.normal(size=10000), + "x": rng.integers(0, 5, size=10000), + "y": rng.normal(size=10000), } ) @@ -169,10 +147,10 @@ def _fragmented_gdf(df, nsplit): def test_query(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = pd.DataFrame( - {"x": np.random.randint(0, 5, size=10), "y": np.random.normal(size=10)} + {"x": rng.integers(0, 5, size=10), "y": rng.normal(size=10)} ) gdf = cudf.DataFrame.from_pandas(df) expr = "x > 2" @@ -188,9 +166,9 @@ def test_query(): def test_query_local_dict(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = pd.DataFrame( - {"x": np.random.randint(0, 5, size=10), "y": np.random.normal(size=10)} + {"x": rng.integers(0, 5, size=10), "y": rng.normal(size=10)} ) gdf = cudf.DataFrame.from_pandas(df) ddf = dask_cudf.from_cudf(gdf, npartitions=2) @@ -204,11 +182,11 @@ def test_query_local_dict(): def test_head(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = pd.DataFrame( { - "x": np.random.randint(0, 5, size=100), - "y": np.random.normal(size=100), + "x": rng.integers(0, 5, size=100), + "y": rng.normal(size=100), } ) gdf = cudf.DataFrame.from_pandas(df) @@ -220,13 +198,11 @@ def test_head(): @pytest.mark.parametrize("nelem", [10, 200, 1333]) def test_set_index(nelem): with dask.config.set(scheduler="single-threaded"): - np.random.seed(0) + rng = np.random.default_rng(seed=0) # Use unique index range as the sort may not be stable-ordering x = np.arange(nelem) - np.random.shuffle(x) - df = pd.DataFrame( - {"x": x, "y": np.random.randint(0, nelem, size=nelem)} - ) + rng.shuffle(x) + df = pd.DataFrame({"x": x, "y": rng.integers(0, nelem, size=nelem)}) ddf = dd.from_pandas(df, npartitions=2) ddf2 = ddf.to_backend("cudf") @@ -242,7 +218,7 @@ def test_set_index(nelem): def test_set_index_quantile(nelem, nparts, by): df = cudf.DataFrame() df["a"] = np.ascontiguousarray(np.arange(nelem)[::-1]) - df["b"] = np.random.choice(cudf.datasets.names, size=nelem) + df["b"] = rng.choice(cudf.datasets.names, size=nelem) ddf = dd.from_pandas(df, npartitions=nparts) with pytest.warns(FutureWarning, match="deprecated"): @@ -270,11 +246,11 @@ def assert_frame_equal_by_index_group(expect, got): @pytest.mark.parametrize("nelem", [10, 200, 1333]) def test_set_index_2(nelem): with dask.config.set(scheduler="single-threaded"): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = pd.DataFrame( { - "x": 100 + np.random.randint(0, nelem // 2, size=nelem), - "y": np.random.normal(size=nelem), + "x": 100 + rng.integers(0, nelem // 2, size=nelem), + "y": rng.normal(size=nelem), } ) expect = df.set_index("x").sort_index() @@ -289,11 +265,11 @@ def test_set_index_2(nelem): def test_set_index_w_series(): with dask.config.set(scheduler="single-threaded"): nelem = 20 - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = pd.DataFrame( { - "x": 100 + np.random.randint(0, nelem // 2, size=nelem), - "y": np.random.normal(size=nelem), + "x": 
100 + rng.integers(0, nelem // 2, size=nelem), + "y": rng.normal(size=nelem), } ) expect = df.set_index(df.x).sort_index() @@ -327,12 +303,12 @@ def test_set_index_sorted(): @pytest.mark.parametrize("index", [None, "myindex"]) def test_rearrange_by_divisions(nelem, index): with dask.config.set(scheduler="single-threaded"): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = pd.DataFrame( { - "x": np.random.randint(0, 20, size=nelem), - "y": np.random.normal(size=nelem), - "z": np.random.choice(["dog", "cat", "bird"], nelem), + "x": rng.integers(0, 20, size=nelem), + "y": rng.normal(size=nelem), + "z": rng.choice(["dog", "cat", "bird"], nelem), } ) df["z"] = df["z"].astype("category") @@ -355,9 +331,9 @@ def test_rearrange_by_divisions(nelem, index): def test_assign(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = pd.DataFrame( - {"x": np.random.randint(0, 5, size=20), "y": np.random.normal(size=20)} + {"x": rng.integers(0, 5, size=20), "y": rng.normal(size=20)} ) dgf = dd.from_pandas(cudf.DataFrame.from_pandas(df), npartitions=2) @@ -372,10 +348,10 @@ def test_assign(): @pytest.mark.parametrize("data_type", ["int8", "int16", "int32", "int64"]) def test_setitem_scalar_integer(data_type): - np.random.seed(0) - scalar = np.random.randint(0, 100, dtype=data_type) + rng = np.random.default_rng(seed=0) + scalar = rng.integers(0, 100, dtype=data_type) df = pd.DataFrame( - {"x": np.random.randint(0, 5, size=20), "y": np.random.normal(size=20)} + {"x": rng.integers(0, 5, size=20), "y": rng.normal(size=20)} ) dgf = dd.from_pandas(cudf.DataFrame.from_pandas(df), npartitions=2) @@ -388,10 +364,10 @@ def test_setitem_scalar_integer(data_type): @pytest.mark.parametrize("data_type", ["float32", "float64"]) def test_setitem_scalar_float(data_type): - np.random.seed(0) - scalar = np.random.randn(1).astype(data_type)[0] + rng = np.random.default_rng(seed=0) + scalar = rng.standard_normal(size=1).astype(data_type)[0] df = pd.DataFrame( - {"x": np.random.randint(0, 5, size=20), "y": np.random.normal(size=20)} + {"x": rng.integers(0, 5, size=20), "y": rng.normal(size=20)} ) dgf = dd.from_pandas(cudf.DataFrame.from_pandas(df), npartitions=2) @@ -403,10 +379,10 @@ def test_setitem_scalar_float(data_type): def test_setitem_scalar_datetime(): - np.random.seed(0) - scalar = np.int64(np.random.randint(0, 100)).astype("datetime64[ms]") + rng = np.random.default_rng(seed=0) + scalar = np.int64(rng.integers(0, 100)).astype("datetime64[ms]") df = pd.DataFrame( - {"x": np.random.randint(0, 5, size=20), "y": np.random.normal(size=20)} + {"x": rng.integers(0, 5, size=20), "y": rng.normal(size=20)} ) dgf = dd.from_pandas(cudf.DataFrame.from_pandas(df), npartitions=2) @@ -422,12 +398,12 @@ def test_setitem_scalar_datetime(): "func", [ lambda: pd.DataFrame( - {"A": np.random.rand(10), "B": np.random.rand(10)}, + {"A": rng.random(10), "B": rng.random(10)}, index=list("abcdefghij"), ), lambda: pd.DataFrame( { - "A": np.random.rand(10), + "A": rng.random(10), "B": list("a" * 10), "C": pd.Series( [str(20090101 + i) for i in range(10)], @@ -438,7 +414,7 @@ def test_setitem_scalar_datetime(): ), lambda: pd.Series(list("abcdefghijklmnop")), lambda: pd.Series( - np.random.rand(10), + rng.random(10), index=pd.Index( [str(20090101 + i) for i in range(10)], dtype="datetime64[ns]" ), @@ -497,10 +473,11 @@ def test_repartition_hash_staged(npartitions): by = ["b"] datarange = 35 size = 100 + rng = np.random.default_rng(seed=0) gdf = cudf.DataFrame( { "a": np.arange(size, dtype="int64"), - "b": 
np.random.randint(datarange, size=size), + "b": rng.integers(datarange, size=size), } ) # WARNING: Specific npartitions-max_branch combination @@ -537,12 +514,13 @@ def test_repartition_hash(by, npartitions, max_branch): npartitions_i = 4 datarange = 26 size = 100 + rng = np.random.default_rng(seed=0) gdf = cudf.DataFrame( { "a": np.arange(0, stop=size, dtype="int64"), - "b": np.random.randint(datarange, size=size), - "c": np.random.choice(list("abcdefgh"), size=size), - "d": np.random.choice(np.arange(26), size=size), + "b": rng.integers(datarange, size=size), + "c": rng.choice(list("abcdefgh"), size=size), + "d": rng.choice(np.arange(26), size=size), } ) gdf.d = gdf.d.astype("datetime64[ms]") @@ -768,6 +746,7 @@ def test_dataframe_series_replace(data): def test_dataframe_assign_col(): + rng = np.random.default_rng(seed=0) df = cudf.DataFrame(list(range(100))) pdf = pd.DataFrame(list(range(100))) @@ -780,7 +759,7 @@ def test_dataframe_assign_col(): pddf = dd.from_pandas(pdf, npartitions=4) pddf["fold"] = 0 pddf["fold"] = pddf["fold"].map_partitions( - lambda p_df: pd.Series(np.random.randint(0, 4, len(p_df))) + lambda p_df: pd.Series(rng.integers(0, 4, len(p_df))) ) dd.assert_eq(ddf[0], pddf[0]) @@ -1015,10 +994,11 @@ def test_to_backend_simplify(): @pytest.mark.parametrize("numeric_only", [True, False]) @pytest.mark.parametrize("op", ["corr", "cov"]) def test_cov_corr(op, numeric_only): + rng = np.random.default_rng(seed=0) df = cudf.DataFrame.from_dict( { - "x": np.random.randint(0, 5, size=10), - "y": np.random.normal(size=10), + "x": rng.integers(0, 5, size=10), + "y": rng.normal(size=10), } ) ddf = dd.from_pandas(df, npartitions=2) diff --git a/python/dask_cudf/dask_cudf/tests/test_delayed_io.py b/python/dask_cudf/dask_cudf/tests/test_delayed_io.py index e6fb58ad6df..84ed3b46598 100644 --- a/python/dask_cudf/dask_cudf/tests/test_delayed_io.py +++ b/python/dask_cudf/dask_cudf/tests/test_delayed_io.py @@ -51,9 +51,13 @@ def test_series_from_delayed(): def test_dataframe_to_delayed(): nelem = 100 - df = cudf.DataFrame() - df["x"] = np.arange(nelem) - df["y"] = np.random.randint(nelem, size=nelem) + rng = np.random.default_rng(seed=0) + df = cudf.DataFrame( + { + "x": np.arange(nelem), + "y": rng.integers(nelem, size=nelem), + } + ) ddf = dask_cudf.from_cudf(df, npartitions=5) @@ -80,8 +84,8 @@ def test_dataframe_to_delayed(): def test_series_to_delayed(): nelem = 100 - - sr = cudf.Series(np.random.randint(nelem, size=nelem)) + rng = np.random.default_rng(seed=0) + sr = cudf.Series(rng.integers(nelem, size=nelem)) dsr = dask_cudf.from_cudf(sr, npartitions=5) @@ -109,11 +113,13 @@ def test_series_to_delayed(): def test_mixing_series_frame_error(): nelem = 20 - - df = cudf.DataFrame() - df["x"] = np.arange(nelem) - df["y"] = np.random.randint(nelem, size=nelem) - + rng = np.random.default_rng(seed=0) + df = cudf.DataFrame( + { + "x": np.arange(nelem), + "y": rng.integers(nelem, size=nelem), + } + ) ddf = dask_cudf.from_cudf(df, npartitions=5) delay_frame = ddf.to_delayed() @@ -128,10 +134,13 @@ def test_mixing_series_frame_error(): def test_frame_extra_columns_error(): nelem = 20 - - df = cudf.DataFrame() - df["x"] = np.arange(nelem) - df["y"] = np.random.randint(nelem, size=nelem) + rng = np.random.default_rng(seed=0) + df = cudf.DataFrame( + { + "x": np.arange(nelem), + "y": rng.integers(nelem, size=nelem), + } + ) ddf1 = dask_cudf.from_cudf(df, npartitions=5) df["z"] = np.arange(nelem) diff --git a/python/dask_cudf/dask_cudf/tests/test_dispatch.py 
b/python/dask_cudf/dask_cudf/tests/test_dispatch.py index a12481a7bb4..fe57d4a4f00 100644 --- a/python/dask_cudf/dask_cudf/tests/test_dispatch.py +++ b/python/dask_cudf/dask_cudf/tests/test_dispatch.py @@ -32,8 +32,9 @@ def test_pyarrow_conversion_dispatch(preserve_index, index): to_pyarrow_table_dispatch, ) + rng = np.random.default_rng(seed=0) df1 = cudf.DataFrame( - np.random.randn(10, 3), columns=list("abc"), index=index + rng.standard_normal(size=(10, 3)), columns=list("abc"), index=index ) df2 = from_pyarrow_table_dispatch( df1, to_pyarrow_table_dispatch(df1, preserve_index=preserve_index) @@ -108,7 +109,8 @@ def test_pyarrow_schema_dispatch(preserve_index): to_pyarrow_table_dispatch, ) - df = cudf.DataFrame(np.random.randn(10, 3), columns=list("abc")) + rng = np.random.default_rng(seed=0) + df = cudf.DataFrame(rng.standard_normal(size=(10, 3)), columns=list("abc")) df["d"] = cudf.Series(["cat", "dog"] * 5) table = to_pyarrow_table_dispatch(df, preserve_index=preserve_index) schema = pyarrow_schema_dispatch(df, preserve_index=preserve_index) diff --git a/python/dask_cudf/dask_cudf/tests/test_distributed.py b/python/dask_cudf/dask_cudf/tests/test_distributed.py index d03180852eb..c28b7e49207 100644 --- a/python/dask_cudf/dask_cudf/tests/test_distributed.py +++ b/python/dask_cudf/dask_cudf/tests/test_distributed.py @@ -4,7 +4,7 @@ import pytest import dask -from dask import dataframe as dd +from dask import array as da, dataframe as dd from dask.distributed import Client from distributed.utils_test import cleanup, loop, loop_in_thread # noqa: F401 @@ -121,3 +121,17 @@ def test_unique(): ddf.x.unique().compute(), check_index=False, ) + + +def test_serialization_of_numpy_types(): + # Dask uses numpy integers as column names, which can break cudf serialization + with dask_cuda.LocalCUDACluster(n_workers=1) as cluster: + with Client(cluster): + with dask.config.set( + {"dataframe.backend": "cudf", "array.backend": "cupy"} + ): + rng = da.random.default_rng() + X_arr = rng.random((100, 10), chunks=(50, 10)) + X = dd.from_dask_array(X_arr) + X = X[X.columns[0]] + X.compute() diff --git a/python/dask_cudf/dask_cudf/tests/test_groupby.py b/python/dask_cudf/dask_cudf/tests/test_groupby.py index 7b9f0ca328a..918290aa6fa 100644 --- a/python/dask_cudf/dask_cudf/tests/test_groupby.py +++ b/python/dask_cudf/dask_cudf/tests/test_groupby.py @@ -13,8 +13,12 @@ from cudf.testing._utils import expect_warning_if import dask_cudf -from dask_cudf.groupby import OPTIMIZED_AGGS, _aggs_optimized -from dask_cudf.tests.utils import QUERY_PLANNING_ON, xfail_dask_expr +from dask_cudf._legacy.groupby import OPTIMIZED_AGGS, _aggs_optimized +from dask_cudf.tests.utils import ( + QUERY_PLANNING_ON, + require_dask_expr, + xfail_dask_expr, +) def assert_cudf_groupby_layers(ddf): @@ -30,21 +34,21 @@ def assert_cudf_groupby_layers(ddf): @pytest.fixture(params=["non_null", "null"]) def pdf(request): - np.random.seed(0) + rng = np.random.default_rng(seed=0) # note that column name "x" is a substring of the groupby key; # this gives us coverage for cudf#10829 pdf = pd.DataFrame( { - "xx": np.random.randint(0, 5, size=10000), - "x": np.random.normal(size=10000), - "y": np.random.normal(size=10000), + "xx": rng.integers(0, 5, size=10000), + "x": rng.normal(size=10000), + "y": rng.normal(size=10000), } ) # insert nulls into dataframe at random if request.param == "null": - pdf = pdf.mask(np.random.choice([True, False], size=pdf.shape)) + pdf = pdf.mask(rng.choice([True, False], size=pdf.shape)) return pdf @@ -173,11 +177,12 @@ 
def test_groupby_agg_empty_partition(tmpdir, split_out): ], ) def test_groupby_multi_column(func): + rng = np.random.default_rng(seed=0) pdf = pd.DataFrame( { - "a": np.random.randint(0, 20, size=1000), - "b": np.random.randint(0, 5, size=1000), - "x": np.random.normal(size=1000), + "a": rng.integers(0, 20, size=1000), + "b": rng.integers(0, 5, size=1000), + "x": rng.normal(size=1000), } ) @@ -371,11 +376,12 @@ def test_groupby_string_index_name(myindex): ], ) def test_groupby_split_out_multiindex(agg_func): + rng = np.random.default_rng(seed=0) df = cudf.DataFrame( { - "a": np.random.randint(0, 10, 100), - "b": np.random.randint(0, 5, 100), - "c": np.random.random(100), + "a": rng.integers(0, 10, 100), + "b": rng.integers(0, 5, 100), + "c": rng.random(100), } ) ddf = dask_cudf.from_cudf(df, 5) @@ -419,12 +425,13 @@ def test_groupby_multiindex_reset_index(npartitions): ], ) def test_groupby_reset_index_multiindex(groupby_keys, agg_func): + rng = np.random.default_rng(seed=0) df = cudf.DataFrame( { - "a": np.random.randint(0, 10, 10), - "b": np.random.randint(0, 5, 10), - "c": np.random.randint(0, 5, 10), - "dd": np.random.randint(0, 5, 10), + "a": rng.integers(0, 10, 10), + "b": rng.integers(0, 5, 10), + "c": rng.integers(0, 5, 10), + "dd": rng.integers(0, 5, 10), } ) ddf = dask_cudf.from_cudf(df, 5) @@ -437,8 +444,9 @@ def test_groupby_reset_index_multiindex(groupby_keys, agg_func): def test_groupby_reset_index_drop_True(): + rng = np.random.default_rng(seed=0) df = cudf.DataFrame( - {"a": np.random.randint(0, 10, 10), "b": np.random.randint(0, 5, 10)} + {"a": rng.integers(0, 10, 10), "b": rng.integers(0, 5, 10)} ) ddf = dask_cudf.from_cudf(df, 5) pddf = dd.from_pandas(df.to_pandas(), 5) @@ -552,10 +560,22 @@ def test_groupby_categorical_key(): ), ], ) +@pytest.mark.parametrize( + "fused", + [ + True, + pytest.param( + False, + marks=require_dask_expr("Not supported by legacy API"), + ), + ], +) @pytest.mark.parametrize("split_out", ["use_dask_default", 1, 2]) @pytest.mark.parametrize("split_every", [False, 4]) @pytest.mark.parametrize("npartitions", [1, 10]) -def test_groupby_agg_params(npartitions, split_every, split_out, as_index): +def test_groupby_agg_params( + npartitions, split_every, split_out, fused, as_index +): df = cudf.datasets.randomdata( nrows=150, dtypes={"name": str, "a": int, "b": int, "c": float}, @@ -570,6 +590,7 @@ def test_groupby_agg_params(npartitions, split_every, split_out, as_index): "c": ["mean", "std", "var"], } + fused_kwarg = {"fused": fused} if QUERY_PLANNING_ON else {} split_kwargs = {"split_every": split_every, "split_out": split_out} if split_out == "use_dask_default": split_kwargs.pop("split_out") @@ -589,6 +610,7 @@ def test_groupby_agg_params(npartitions, split_every, split_out, as_index): ddf.groupby(["name", "a"], sort=True, **maybe_as_index) .aggregate( agg_dict, + **fused_kwarg, **split_kwargs, ) .compute() @@ -610,6 +632,7 @@ def test_groupby_agg_params(npartitions, split_every, split_out, as_index): # Full check (`sort=False`) gr = ddf.groupby(["name", "a"], sort=False, **maybe_as_index).aggregate( agg_dict, + **fused_kwarg, **split_kwargs, ) pr = pddf.groupby(["name", "a"], sort=False).agg( @@ -653,10 +676,11 @@ def test_groupby_agg_params(npartitions, split_every, split_out, as_index): "aggregations", [(sum, "sum"), (max, "max"), (min, "min")] ) def test_groupby_agg_redirect(aggregations): + rng = np.random.default_rng(seed=0) pdf = pd.DataFrame( { - "x": np.random.randint(0, 5, size=10000), - "y": np.random.normal(size=10000), + "x": 
rng.integers(0, 5, size=10000), + "y": rng.normal(size=10000), } ) @@ -758,10 +782,11 @@ def test_groupby_with_list_of_series(): ], ) def test_groupby_nested_dict(func): + rng = np.random.default_rng(seed=0) pdf = pd.DataFrame( { - "x": np.random.randint(0, 5, size=10000), - "y": np.random.normal(size=10000), + "x": rng.integers(0, 5, size=10000), + "y": rng.normal(size=10000), } ) @@ -794,10 +819,11 @@ def test_groupby_nested_dict(func): ], ) def test_groupby_all_columns(func): + rng = np.random.default_rng(seed=0) pdf = pd.DataFrame( { - "x": np.random.randint(0, 5, size=10000), - "y": np.random.normal(size=10000), + "x": rng.integers(0, 5, size=10000), + "y": rng.normal(size=10000), } ) diff --git a/python/dask_cudf/dask_cudf/tests/test_join.py b/python/dask_cudf/dask_cudf/tests/test_join.py index 3e078c47cdd..61d0f8d7eb9 100644 --- a/python/dask_cudf/dask_cudf/tests/test_join.py +++ b/python/dask_cudf/dask_cudf/tests/test_join.py @@ -22,18 +22,18 @@ def test_join_inner(left_nrows, right_nrows, left_nkeys, right_nkeys): chunksize = 50 - np.random.seed(0) + rng = np.random.default_rng(seed=0) # cuDF left = cudf.DataFrame( { - "x": np.random.randint(0, left_nkeys, size=left_nrows), + "x": rng.integers(0, left_nkeys, size=left_nrows), "a": np.arange(left_nrows), } ) right = cudf.DataFrame( { - "x": np.random.randint(0, right_nkeys, size=right_nrows), + "x": rng.integers(0, right_nkeys, size=right_nrows), "a": 1000 * np.arange(right_nrows), } ) @@ -84,18 +84,18 @@ def gather(df, grows): def test_join_left(left_nrows, right_nrows, left_nkeys, right_nkeys, how): chunksize = 50 - np.random.seed(0) + rng = np.random.default_rng(seed=0) # cuDF left = cudf.DataFrame( { - "x": np.random.randint(0, left_nkeys, size=left_nrows), + "x": rng.integers(0, left_nkeys, size=left_nrows), "a": np.arange(left_nrows, dtype=np.float64), } ) right = cudf.DataFrame( { - "x": np.random.randint(0, right_nkeys, size=right_nrows), + "x": rng.integers(0, right_nkeys, size=right_nrows), "a": 1000 * np.arange(right_nrows, dtype=np.float64), } ) @@ -153,20 +153,20 @@ def test_merge_left( ): chunksize = 3 - np.random.seed(0) + rng = np.random.default_rng(seed=42) # cuDF left = cudf.DataFrame( { - "x": np.random.randint(0, left_nkeys, size=left_nrows), - "y": np.random.randint(0, left_nkeys, size=left_nrows), + "x": rng.integers(0, left_nkeys, size=left_nrows), + "y": rng.integers(0, left_nkeys, size=left_nrows), "a": np.arange(left_nrows, dtype=np.float64), } ) right = cudf.DataFrame( { - "x": np.random.randint(0, right_nkeys, size=right_nrows), - "y": np.random.randint(0, right_nkeys, size=right_nrows), + "x": rng.integers(0, right_nkeys, size=right_nrows), + "y": rng.integers(0, right_nkeys, size=right_nrows), "a": 1000 * np.arange(right_nrows, dtype=np.float64), } ) @@ -200,18 +200,18 @@ def test_merge_1col_left( ): chunksize = 3 - np.random.seed(0) + rng = np.random.default_rng(seed=0) # cuDF left = cudf.DataFrame( { - "x": np.random.randint(0, left_nkeys, size=left_nrows), + "x": rng.integers(0, left_nkeys, size=left_nrows), "a": np.arange(left_nrows, dtype=np.float64), } ) right = cudf.DataFrame( { - "x": np.random.randint(0, right_nkeys, size=right_nrows), + "x": rng.integers(0, right_nkeys, size=right_nrows), "a": 1000 * np.arange(right_nrows, dtype=np.float64), } ) @@ -238,13 +238,19 @@ def test_merge_1col_left( def test_merge_should_fail(): # Expected failure cases described in #2694 - df1 = cudf.DataFrame() - df1["a"] = [1, 2, 3, 4, 5, 6] * 2 - df1["b"] = np.random.randint(0, 12, 12) - - df2 = cudf.DataFrame() 
- df2["a"] = [7, 2, 3, 8, 5, 9] * 2 - df2["c"] = np.random.randint(0, 12, 12) + rng = np.random.default_rng(seed=0) + df1 = pd.DataFrame( + { + "a": [1, 2, 3, 4, 5, 6] * 2, + "b": rng.integers(0, 12, 12), + } + ) + df2 = pd.DataFrame( + { + "a": [7, 2, 3, 8, 5, 9] * 2, + "c": rng.integers(0, 12, 12), + } + ) left = dask_cudf.from_cudf(df1, 1).groupby("a").b.min().to_frame() right = dask_cudf.from_cudf(df2, 1).groupby("a").c.min().to_frame() @@ -257,7 +263,7 @@ def test_merge_should_fail(): left.merge(right, how="left", on=["c"]) # Same column names - df2["b"] = np.random.randint(0, 12, 12) + df2["b"] = np.random.default_rng(seed=0).integers(0, 12, 12) right = dask_cudf.from_cudf(df2, 1).groupby("a").b.min().to_frame() with pytest.raises(KeyError): diff --git a/python/dask_cudf/dask_cudf/tests/test_reductions.py b/python/dask_cudf/dask_cudf/tests/test_reductions.py index d03e92319be..f11a5252080 100644 --- a/python/dask_cudf/dask_cudf/tests/test_reductions.py +++ b/python/dask_cudf/dask_cudf/tests/test_reductions.py @@ -1,7 +1,5 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION. -import numpy as np -import pandas as pd import pytest import dask @@ -10,20 +8,7 @@ import cudf import dask_cudf - - -def _make_random_frame(nelem, npartitions=2): - np.random.seed(0) - df = pd.DataFrame( - { - "x": np.random.randint(0, 5, size=nelem), - "y": np.random.normal(size=nelem) + 1, - } - ) - gdf = cudf.DataFrame.from_pandas(df) - dgf = dask_cudf.from_cudf(gdf, npartitions=npartitions) - return df, dgf - +from dask_cudf.tests.utils import _make_random_frame _reducers = ["sum", "count", "mean", "var", "std", "min", "max"] diff --git a/python/dask_cudf/dask_cudf/tests/test_sort.py b/python/dask_cudf/dask_cudf/tests/test_sort.py index 9bbbbc79561..02c815427f3 100644 --- a/python/dask_cudf/dask_cudf/tests/test_sort.py +++ b/python/dask_cudf/dask_cudf/tests/test_sort.py @@ -28,7 +28,7 @@ @pytest.mark.parametrize("nelem", [10, 500]) @pytest.mark.parametrize("nparts", [1, 10]) def test_sort_values(nelem, nparts, by, ascending): - np.random.seed(0) + _ = np.random.default_rng(seed=0) df = cudf.DataFrame() df["a"] = np.ascontiguousarray(np.arange(nelem)[::-1]) df["b"] = np.arange(100, nelem + 100) @@ -82,7 +82,7 @@ def test_sort_repartition(): ], ) def test_sort_values_with_nulls(data, by, ascending, na_position): - np.random.seed(0) + _ = np.random.default_rng(seed=0) cp.random.seed(0) df = cudf.DataFrame(data) ddf = dd.from_pandas(df, npartitions=5) diff --git a/python/dask_cudf/dask_cudf/tests/utils.py b/python/dask_cudf/dask_cudf/tests/utils.py index cc0c6899804..b44b3f939e7 100644 --- a/python/dask_cudf/dask_cudf/tests/utils.py +++ b/python/dask_cudf/dask_cudf/tests/utils.py @@ -10,7 +10,7 @@ import cudf -from dask_cudf.expr import QUERY_PLANNING_ON +from dask_cudf import QUERY_PLANNING_ON if QUERY_PLANNING_ON: DASK_VERSION = Version(dask.__version__) @@ -19,8 +19,9 @@ def _make_random_frame(nelem, npartitions=2, include_na=False): + rng = np.random.default_rng(seed=0) df = pd.DataFrame( - {"x": np.random.random(size=nelem), "y": np.random.random(size=nelem)} + {"x": rng.random(size=nelem), "y": rng.random(size=nelem)} ) if include_na: diff --git a/python/dask_cudf/pyproject.toml b/python/dask_cudf/pyproject.toml index a0b224c89a3..5dac70cc295 100644 --- a/python/dask_cudf/pyproject.toml +++ b/python/dask_cudf/pyproject.toml @@ -19,12 +19,13 @@ authors = [ license = { text = "Apache 2.0" } requires-python = ">=3.10" dependencies = [ - "cudf==24.10.*", + "cudf==24.12.*,>=0.0.0a0", "cupy-cuda11x>=12.0.0", 
"fsspec>=0.6.0", "numpy>=1.23,<3.0a0", - "pandas>=2.0,<2.2.3dev0", - "rapids-dask-dependency==24.10.*", + "pandas>=2.0,<2.2.4dev0", + "pynvml>=11.4.1,<12.0.0a0", + "rapids-dask-dependency==24.12.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ "Intended Audience :: Developers", @@ -45,8 +46,8 @@ cudf = "dask_cudf.backends:CudfDXBackendEntrypoint" [project.optional-dependencies] test = [ - "dask-cuda==24.10.*", - "numba>=0.57", + "dask-cuda==24.12.*,>=0.0.0a0", + "numba-cuda>=0.0.13,<0.0.18", "pytest-cov", "pytest-xdist", "pytest<8", @@ -69,59 +70,42 @@ version = {file = "dask_cudf/VERSION"} [tool.setuptools.packages.find] exclude = ["*tests*"] -[tool.isort] -line_length = 79 -multi_line_output = 3 -include_trailing_comma = true -force_grid_wrap = 0 -combine_as_imports = true -order_by_type = true - -known_dask = [ - "dask", - "distributed", - "dask_cuda", -] -known_rapids = [ - "rmm", - "cudf", -] -known_first_party = [ - "dask_cudf", -] +[tool.ruff] +extend = "../../pyproject.toml" -default_section = "THIRDPARTY" -sections = [ - "FUTURE", - "STDLIB", - "THIRDPARTY", - "DASK", - "RAPIDS", - "FIRSTPARTY", - "LOCALFOLDER", -] -skip = [ - "thirdparty", - ".eggs", - ".git", - ".hg", - ".mypy_cache", - ".tox", - ".venv", - "_build", - "buck-out", - "build", - "dist", +[tool.ruff.lint.isort] +combine-as-imports = true +known-first-party = ["dask_cudf"] +section-order = ["future", "standard-library", "third-party", "dask", "rapids", "first-party", "local-folder"] + +[tool.ruff.lint.isort.sections] +dask = ["dask", "distributed", "dask_cuda"] +rapids = ["rmm", "cudf"] + +[tool.pydistcheck] +select = [ + "distro-too-large-compressed", ] +# PyPI limit is 100 MiB, fail CI before we get too close to that +max_allowed_size_compressed = '75M' + [tool.pytest.ini_options] +addopts = "--tb=native --strict-config --strict-markers" +empty_parameter_set_mark = "fail_at_collect" filterwarnings = [ "error::FutureWarning", "error::DeprecationWarning", + # https://github.com/rapidsai/build-planning/issues/116 + "ignore:.*cuda..* module is deprecated.*:DeprecationWarning", # some third-party dependencies (e.g. 'boto3') still using datetime.datetime.utcnow() - "ignore:.*datetime.*utcnow.*scheduled for removal:DeprecationWarning", + "ignore:.*datetime.*utcnow.*scheduled for removal:DeprecationWarning:botocore", "ignore:create_block_manager_from_blocks is deprecated and will be removed in a future version. Use public APIs instead.:DeprecationWarning", # https://github.com/dask/partd/blob/main/partd/pandas.py#L198 "ignore:Passing a BlockManager to DataFrame is deprecated and will raise in a future version. Use public APIs instead.:DeprecationWarning", "ignore:String support for `aggregate_files` is experimental. 
Behavior may change in the future.:FutureWarning:dask", + # Dask now loudly throws warnings: https://github.com/dask/dask/pull/11437 + # When the legacy implementation is removed we can remove this warning and stop running pytests with `DASK_DATAFRAME__QUERY_PLANNING=False` + "ignore:The legacy Dask DataFrame implementation is deprecated and will be removed in a future version.*:FutureWarning", ] +xfail_strict = true diff --git a/python/libcudf/CMakeLists.txt b/python/libcudf/CMakeLists.txt index 0a8f5c4807d..5f9a04d3cee 100644 --- a/python/libcudf/CMakeLists.txt +++ b/python/libcudf/CMakeLists.txt @@ -22,6 +22,8 @@ project( LANGUAGES CXX ) +option(USE_NVCOMP_RUNTIME_WHEEL "Use the nvcomp wheel at runtime instead of the system library" OFF) + # Check if cudf is already available. If so, it is the user's responsibility to ensure that the # CMake package is also available at build time of the Python cudf package. find_package(cudf "${RAPIDS_VERSION}") @@ -39,14 +41,20 @@ set(BUILD_TESTS OFF) set(BUILD_BENCHMARKS OFF) set(CUDF_BUILD_TESTUTIL OFF) set(CUDF_BUILD_STREAMS_TEST_UTIL OFF) +if(USE_NVCOMP_RUNTIME_WHEEL) + set(CUDF_EXPORT_NVCOMP OFF) +endif() set(CUDA_STATIC_RUNTIME ON) set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib) add_subdirectory(../../cpp cudf-cpp) -# Ensure other libraries needed by libcudf.so get installed alongside it. -include(cmake/Modules/WheelHelpers.cmake) -install_aliased_imported_targets( - TARGETS cudf nvcomp::nvcomp DESTINATION ${CMAKE_LIBRARY_OUTPUT_DIRECTORY} -) +if(USE_NVCOMP_RUNTIME_WHEEL) + set(rpaths "$ORIGIN/../../nvidia/nvcomp") + set_property( + TARGET cudf + PROPERTY INSTALL_RPATH ${rpaths} + APPEND + ) +endif() diff --git a/python/libcudf/cmake/Modules/WheelHelpers.cmake b/python/libcudf/cmake/Modules/WheelHelpers.cmake deleted file mode 100644 index 278d6751c15..00000000000 --- a/python/libcudf/cmake/Modules/WheelHelpers.cmake +++ /dev/null @@ -1,59 +0,0 @@ -# ============================================================================= -# Copyright (c) 2022-2023, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. See the License for the specific language governing permissions and limitations under -# the License. -# ============================================================================= -include_guard(GLOBAL) - -# Making libraries available inside wheels by installing the associated targets. 
-function(install_aliased_imported_targets)
-  list(APPEND CMAKE_MESSAGE_CONTEXT "install_aliased_imported_targets")
-
-  set(options "")
-  set(one_value "DESTINATION")
-  set(multi_value "TARGETS")
-  cmake_parse_arguments(_ "${options}" "${one_value}" "${multi_value}" ${ARGN})
-
-  message(VERBOSE "Installing targets '${__TARGETS}' into lib_dir '${__DESTINATION}'")
-
-  foreach(target IN LISTS __TARGETS)
-
-    if(NOT TARGET ${target})
-      message(VERBOSE "No target named ${target}")
-      continue()
-    endif()
-
-    get_target_property(alias_target ${target} ALIASED_TARGET)
-    if(alias_target)
-      set(target ${alias_target})
-    endif()
-
-    get_target_property(is_imported ${target} IMPORTED)
-    if(NOT is_imported)
-      # If the target isn't imported, install it into the wheel
-      install(TARGETS ${target} DESTINATION ${__DESTINATION})
-      message(VERBOSE "install(TARGETS ${target} DESTINATION ${__DESTINATION})")
-    else()
-      # If the target is imported, make sure it's global
-      get_target_property(type ${target} TYPE)
-      if(${type} STREQUAL "UNKNOWN_LIBRARY")
-        install(FILES $<TARGET_FILE:${target}> DESTINATION ${__DESTINATION})
-        message(VERBOSE "install(FILES $<TARGET_FILE:${target}> DESTINATION ${__DESTINATION})")
-      else()
-        install(IMPORTED_RUNTIME_ARTIFACTS ${target} DESTINATION ${__DESTINATION})
-        message(
-          VERBOSE
-          "install(IMPORTED_RUNTIME_ARTIFACTS $<TARGET_FILE:${target}> DESTINATION ${__DESTINATION})"
-        )
-      endif()
-    endif()
-  endforeach()
-endfunction()
diff --git a/python/libcudf/libcudf/load.py b/python/libcudf/libcudf/load.py
index ba134710868..c3ff5534e87 100644
--- a/python/libcudf/libcudf/load.py
+++ b/python/libcudf/libcudf/load.py
@@ -16,30 +16,75 @@
 import ctypes
 import os
 
+# Loading with RTLD_LOCAL adds the library itself to the loader's
+# loaded library cache without loading any symbols into the global
+# namespace. This allows libraries that express a dependency on
+# this library to be loaded later and successfully satisfy this dependency
+# without polluting the global symbol table with symbols from
+# libcudf that could conflict with symbols from other DSOs.
+PREFERRED_LOAD_FLAG = ctypes.RTLD_LOCAL
+
+
+def _load_system_installation(soname: str):
+    """Try to dlopen() the library indicated by ``soname``
+    Raises ``OSError`` if library cannot be loaded.
+    """
+    return ctypes.CDLL(soname, PREFERRED_LOAD_FLAG)
+
+
+def _load_wheel_installation(soname: str):
+    """Try to dlopen() the library indicated by ``soname``
+
+    Returns ``None`` if the library cannot be loaded.
+    """
+    if os.path.isfile(
+        lib := os.path.join(os.path.dirname(__file__), "lib64", soname)
+    ):
+        return ctypes.CDLL(lib, PREFERRED_LOAD_FLAG)
+    return None
+
 
 def load_library():
-    # Dynamically load libcudf.so. Prefer a system library if one is present to
-    # avoid clobbering symbols that other packages might expect, but if no
-    # other library is present use the one in the wheel.
-    libcudf_lib = None
+    """Dynamically load libcudf.so and its dependencies"""
     try:
-        libcudf_lib = ctypes.CDLL("libcudf.so", ctypes.RTLD_GLOBAL)
-    except OSError:
-        # If neither of these directories contain the library, we assume we are in an
-        # environment where the C++ library is already installed somewhere else and the
-        # CMake build of the libcudf Python package was a no-op.
-        #
-        # Note that this approach won't work for real editable installs of the libcudf package.
-        # scikit-build-core has limited support for importlib.resources so there isn't a clean
-        # way to support that case yet.
- for lib_dir in ("lib", "lib64"): - if os.path.isfile( - lib := os.path.join( - os.path.dirname(__file__), lib_dir, "libcudf.so" - ) - ): - libcudf_lib = ctypes.CDLL(lib, ctypes.RTLD_GLOBAL) - break + # libkvikio must be loaded before libcudf because libcudf references its symbols + import libkvikio + + libkvikio.load_library() + except ModuleNotFoundError: + # libcudf's runtime dependency on libkvikio may be satisfied by a natively + # installed library or a conda package, in which case the import will fail and + # we assume the library is discoverable on system paths. + pass + + prefer_system_installation = ( + os.getenv("RAPIDS_LIBCUDF_PREFER_SYSTEM_LIBRARY", "false").lower() + != "false" + ) + + soname = "libcudf.so" + libcudf_lib = None + if prefer_system_installation: + # Prefer a system library if one is present to + # avoid clobbering symbols that other packages might expect, but if no + # other library is present use the one in the wheel. + try: + libcudf_lib = _load_system_installation(soname) + except OSError: + libcudf_lib = _load_wheel_installation(soname) + else: + # Prefer the libraries bundled in this package. If they aren't found + # (which might be the case in builds where the library was prebuilt before + # packaging the wheel), look for a system installation. + try: + libcudf_lib = _load_wheel_installation(soname) + if libcudf_lib is None: + libcudf_lib = _load_system_installation(soname) + except OSError: + # If none of the searches above succeed, just silently return None + # and rely on other mechanisms (like RPATHs on other DSOs) to + # help the loader find the library. + pass # The caller almost never needs to do anything with this library, but no # harm in offering the option since this object at least provides a handle diff --git a/python/libcudf/pyproject.toml b/python/libcudf/pyproject.toml index 22952e5b5bf..8c650eb2144 100644 --- a/python/libcudf/pyproject.toml +++ b/python/libcudf/pyproject.toml @@ -37,6 +37,10 @@ classifiers = [ "Programming Language :: C++", "Environment :: GPU :: NVIDIA CUDA", ] +dependencies = [ + "libkvikio==24.12.*,>=0.0.0a0", + "nvidia-nvcomp==4.1.0.6", +] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. [project.urls] Homepage = "https://github.com/rapidsai/cudf" @@ -44,6 +48,14 @@ Homepage = "https://github.com/rapidsai/cudf" [project.entry-points."cmake.prefix"] libcudf = "libcudf" +[tool.pydistcheck] +select = [ + "distro-too-large-compressed", +] + +# PyPI limit is 600 MiB, fail CI before we get too close to that +max_allowed_size_compressed = '525M' + [tool.scikit-build] build-dir = "build/{wheel_tag}" cmake.build-type = "Release" @@ -66,7 +78,7 @@ dependencies-file = "../../dependencies.yaml" matrix-entry = "cuda_suffixed=true" requires = [ "cmake>=3.26.4,!=3.30.0", - "libkvikio==24.10.*", - "librmm==24.10.*", + "libkvikio==24.12.*,>=0.0.0a0", + "librmm==24.12.*,>=0.0.0a0", "ninja", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
diff --git a/python/pylibcudf/pylibcudf/CMakeLists.txt b/python/pylibcudf/pylibcudf/CMakeLists.txt index a7cb66d7b16..b1d9656afc2 100644 --- a/python/pylibcudf/pylibcudf/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/CMakeLists.txt @@ -17,6 +17,7 @@ set(cython_sources binaryop.pyx column.pyx column_factories.pyx + contiguous_split.pyx concatenate.pyx copying.pyx datetime.pyx @@ -25,8 +26,10 @@ set(cython_sources filling.pyx gpumemoryview.pyx groupby.pyx + hashing.pyx interop.pyx join.pyx + json.pyx labeling.pyx lists.pyx merge.pyx @@ -66,3 +69,4 @@ target_link_libraries(pylibcudf_interop PUBLIC nanoarrow) add_subdirectory(libcudf) add_subdirectory(strings) add_subdirectory(io) +add_subdirectory(nvtext) diff --git a/python/pylibcudf/pylibcudf/__init__.pxd b/python/pylibcudf/pylibcudf/__init__.pxd index a384edd456d..aa2ce957173 100644 --- a/python/pylibcudf/pylibcudf/__init__.pxd +++ b/python/pylibcudf/pylibcudf/__init__.pxd @@ -6,17 +6,22 @@ from . cimport ( binaryop, column_factories, concatenate, + contiguous_split, copying, datetime, experimental, expressions, filling, groupby, + hashing, + interop, join, + json, labeling, lists, merge, null_mask, + nvtext, partitioning, quantiles, reduce, @@ -50,6 +55,7 @@ __all__ = [ "aggregation", "binaryop", "column_factories", + "contiguous_split", "concatenate", "copying", "datetime", @@ -58,7 +64,10 @@ __all__ = [ "filling", "gpumemoryview", "groupby", + "hashing", + "interop", "join", + "json", "lists", "merge", "null_mask", @@ -78,4 +87,5 @@ __all__ = [ "transpose", "types", "unary", + "nvtext", ] diff --git a/python/pylibcudf/pylibcudf/__init__.py b/python/pylibcudf/pylibcudf/__init__.py index 2a5365e8fad..62a2170f83e 100644 --- a/python/pylibcudf/pylibcudf/__init__.py +++ b/python/pylibcudf/pylibcudf/__init__.py @@ -15,19 +15,23 @@ binaryop, column_factories, concatenate, + contiguous_split, copying, datetime, experimental, expressions, filling, groupby, + hashing, interop, io, join, + json, labeling, lists, merge, null_mask, + nvtext, partitioning, quantiles, reduce, @@ -61,6 +65,7 @@ "aggregation", "binaryop", "column_factories", + "contiguous_split", "concatenate", "copying", "datetime", @@ -69,9 +74,11 @@ "filling", "gpumemoryview", "groupby", + "hashing", "interop", "io", "join", + "json", "labeling", "lists", "merge", @@ -92,4 +99,5 @@ "transpose", "types", "unary", + "nvtext", ] diff --git a/python/pylibcudf/pylibcudf/aggregation.pyi b/python/pylibcudf/pylibcudf/aggregation.pyi new file mode 100644 index 00000000000..a59e2a9dc93 --- /dev/null +++ b/python/pylibcudf/pylibcudf/aggregation.pyi @@ -0,0 +1,110 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from enum import IntEnum + +from pylibcudf.types import ( + DataType, + Interpolation, + NanEquality, + NullEquality, + NullOrder, + NullPolicy, + Order, +) + +class Kind(IntEnum): + SUM = ... + PRODUCT = ... + MIN = ... + MAX = ... + COUNT_VALID = ... + COUNT_ALL = ... + ANY = ... + ALL = ... + SUM_OF_SQUARES = ... + MEAN = ... + VARIANCE = ... + STD = ... + MEDIAN = ... + QUANTILE = ... + ARGMAX = ... + ARGMIN = ... + NUNIQUE = ... + NTH_ELEMENT = ... + RANK = ... + COLLECT_LIST = ... + COLLECT_SET = ... + PTX = ... + CUDA = ... + CORRELATION = ... + COVARIANCE = ... + +class CorrelationType(IntEnum): + PEARSON = ... + KENDALL = ... + SPEARMAN = ... + +class EWMHistory(IntEnum): + INFINITE = ... + FINITE = ... + +class RankMethod(IntEnum): + FIRST = ... + AVERAGE = ... + MIN = ... + MAX = ... + DENSE = ... + +class RankPercentage(IntEnum): + NONE = ... + ZERO_NORMALIZED = ... 
+ ONE_NORMALIZED = ... + +class UdfType(IntEnum): + CUDA = ... + PTX = ... + +class Aggregation: + def __init__(self): ... + def kind(self) -> Kind: ... + +def sum() -> Aggregation: ... +def product() -> Aggregation: ... +def min() -> Aggregation: ... +def max() -> Aggregation: ... +def count(null_handling: NullPolicy = NullPolicy.INCLUDE) -> Aggregation: ... +def any() -> Aggregation: ... +def all() -> Aggregation: ... +def sum_of_squares() -> Aggregation: ... +def mean() -> Aggregation: ... +def variance(ddof: int = 1) -> Aggregation: ... +def std(ddof: int = 1) -> Aggregation: ... +def median() -> Aggregation: ... +def quantile( + quantiles: list[float], interp: Interpolation = Interpolation.LINEAR +) -> Aggregation: ... +def argmax() -> Aggregation: ... +def argmin() -> Aggregation: ... +def ewma(center_of_mass: float, history: EWMHistory) -> Aggregation: ... +def nunique(null_handling: NullPolicy = NullPolicy.EXCLUDE) -> Aggregation: ... +def nth_element( + n: int, null_handling: NullPolicy = NullPolicy.INCLUDE +) -> Aggregation: ... +def collect_list( + null_handling: NullPolicy = NullPolicy.INCLUDE, +) -> Aggregation: ... +def collect_set( + null_handling: NullPolicy = NullPolicy.INCLUDE, + nulls_equal: NullEquality = NullEquality.EQUAL, + nans_equal: NanEquality = NanEquality.ALL_EQUAL, +) -> Aggregation: ... +def udf(operation: str, output_type: DataType) -> Aggregation: ... +def correlation(type: CorrelationType, min_periods: int) -> Aggregation: ... +def covariance(min_periods: int, ddof: int) -> Aggregation: ... +def rank( + method: RankMethod, + column_order: Order = Order.ASCENDING, + null_handling: NullPolicy = NullPolicy.EXCLUDE, + null_precedence: NullOrder = NullOrder.AFTER, + percentage: RankPercentage = RankPercentage.NONE, +) -> Aggregation: ... diff --git a/python/pylibcudf/pylibcudf/aggregation.pyx b/python/pylibcudf/pylibcudf/aggregation.pyx index e510b738f70..662f76d5c8e 100644 --- a/python/pylibcudf/pylibcudf/aggregation.pyx +++ b/python/pylibcudf/pylibcudf/aggregation.pyx @@ -64,6 +64,40 @@ from pylibcudf.libcudf.aggregation import udf_type as UdfType # no-cython-lint from .types cimport DataType +__all__ = [ + "Aggregation", + "CorrelationType", + "EWMHistory", + "Kind", + "RankMethod", + "RankPercentage", + "UdfType", + "all", + "any", + "argmax", + "argmin", + "collect_list", + "collect_set", + "correlation", + "count", + "covariance", + "ewma", + "max", + "mean", + "median", + "min", + "nth_element", + "nunique", + "product", + "quantile", + "rank", + "std", + "sum", + "sum_of_squares", + "udf", + "variance", +] + cdef class Aggregation: """A type of aggregation to perform. diff --git a/python/pylibcudf/pylibcudf/binaryop.pyi b/python/pylibcudf/pylibcudf/binaryop.pyi new file mode 100644 index 00000000000..f745e6c6854 --- /dev/null +++ b/python/pylibcudf/pylibcudf/binaryop.pyi @@ -0,0 +1,54 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from enum import IntEnum + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar +from pylibcudf.types import DataType + +class BinaryOperator(IntEnum): + ADD = ... + SUB = ... + MUL = ... + DIV = ... + TRUE_DIV = ... + FLOOR_DIV = ... + MOD = ... + PMOD = ... + PYMOD = ... + POW = ... + INT_POW = ... + LOG_BASE = ... + ATAN2 = ... + SHIFT_LEFT = ... + SHIFT_RIGHT = ... + SHIFT_RIGHT_UNSIGNED = ... + BITWISE_AND = ... + BITWISE_OR = ... + BITWISE_XOR = ... + LOGICAL_AND = ... + LOGICAL_OR = ... + EQUAL = ... + NOT_EQUAL = ... + LESS = ... + GREATER = ... + LESS_EQUAL = ... + GREATER_EQUAL = ... 
+ NULL_EQUALS = ... + NULL_MAX = ... + NULL_MIN = ... + NULL_NOT_EQUALS = ... + GENERIC_BINARY = ... + NULL_LOGICAL_AND = ... + NULL_LOGICAL_OR = ... + INVALID_BINARY = ... + +def binary_operation( + lhs: Column | Scalar, + rhs: Column | Scalar, + op: BinaryOperator, + output_type: DataType, +) -> Column: ... +def is_supported_operation( + out: DataType, lhs: DataType, rhs: DataType, op: BinaryOperator +) -> bool: ... diff --git a/python/pylibcudf/pylibcudf/binaryop.pyx b/python/pylibcudf/pylibcudf/binaryop.pyx index 5f9d145139a..b7b4ecc6e83 100644 --- a/python/pylibcudf/pylibcudf/binaryop.pyx +++ b/python/pylibcudf/pylibcudf/binaryop.pyx @@ -16,6 +16,7 @@ from .column cimport Column from .scalar cimport Scalar from .types cimport DataType +__all__ = ["BinaryOperator", "binary_operation", "is_supported_operation"] cpdef Column binary_operation( LeftBinaryOperand lhs, @@ -52,33 +53,27 @@ cpdef Column binary_operation( if LeftBinaryOperand is Column and RightBinaryOperand is Column: with nogil: - result = move( - cpp_binaryop.binary_operation( - lhs.view(), - rhs.view(), - op, - output_type.c_obj - ) + result = cpp_binaryop.binary_operation( + lhs.view(), + rhs.view(), + op, + output_type.c_obj ) elif LeftBinaryOperand is Column and RightBinaryOperand is Scalar: with nogil: - result = move( - cpp_binaryop.binary_operation( - lhs.view(), - dereference(rhs.c_obj), - op, - output_type.c_obj - ) + result = cpp_binaryop.binary_operation( + lhs.view(), + dereference(rhs.c_obj), + op, + output_type.c_obj ) elif LeftBinaryOperand is Scalar and RightBinaryOperand is Column: with nogil: - result = move( - cpp_binaryop.binary_operation( - dereference(lhs.c_obj), - rhs.view(), - op, - output_type.c_obj - ) + result = cpp_binaryop.binary_operation( + dereference(lhs.c_obj), + rhs.view(), + op, + output_type.c_obj ) else: raise ValueError(f"Invalid arguments {lhs} and {rhs}") @@ -106,6 +101,7 @@ cpdef bool is_supported_operation( The right hand side data type. op : BinaryOperator The operation to check. + Returns ------- bool diff --git a/python/pylibcudf/pylibcudf/column.pyi b/python/pylibcudf/pylibcudf/column.pyi new file mode 100644 index 00000000000..c9f70de3dbf --- /dev/null +++ b/python/pylibcudf/pylibcudf/column.pyi @@ -0,0 +1,48 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from collections.abc import Sequence +from typing import Any + +from pylibcudf.gpumemoryview import gpumemoryview +from pylibcudf.scalar import Scalar +from pylibcudf.types import DataType + +class Column: + def __init__( + self, + data_type: DataType, + size: int, + data: gpumemoryview | None, + mask: gpumemoryview | None, + null_count: int, + offset: int, + children: list[Column], + ) -> None: ... + def type(self) -> DataType: ... + def child(self, index: int) -> Column: ... + def size(self) -> int: ... + def null_count(self) -> int: ... + def offset(self) -> int: ... + def data(self) -> gpumemoryview | None: ... + def null_mask(self) -> gpumemoryview | None: ... + def children(self) -> list[Column]: ... + def copy(self) -> Column: ... + def with_mask( + self, mask: gpumemoryview | None, null_count: int + ) -> Column: ... + def list_view(self) -> ListColumnView: ... + @staticmethod + def from_scalar(scalar: Scalar, size: int) -> Column: ... + @staticmethod + def all_null_like(like: Column, size: int) -> Column: ... + @staticmethod + def from_cuda_array_interface_obj(obj: Any) -> Column: ... + +class ListColumnView: + def __init__(self, column: Column): ... + def child(self) -> Column: ... + def offsets(self) -> Column: ... 
+ +def is_c_contiguous( + shape: Sequence[int], strides: Sequence[int], itemsize: int +) -> bool: ... diff --git a/python/pylibcudf/pylibcudf/column.pyx b/python/pylibcudf/pylibcudf/column.pyx index a37a12fc7e1..9bb5574608e 100644 --- a/python/pylibcudf/pylibcudf/column.pyx +++ b/python/pylibcudf/pylibcudf/column.pyx @@ -8,7 +8,7 @@ from pylibcudf.libcudf.column.column_factories cimport make_column_from_scalar from pylibcudf.libcudf.scalar.scalar cimport scalar from pylibcudf.libcudf.types cimport size_type -from rmm._lib.device_buffer cimport DeviceBuffer +from rmm.pylibrmm.device_buffer cimport DeviceBuffer from .gpumemoryview cimport gpumemoryview from .scalar cimport Scalar @@ -17,6 +17,7 @@ from .utils cimport int_to_bitmask_ptr, int_to_void_ptr import functools +__all__ = ["Column", "ListColumnView", "is_c_contiguous"] cdef class Column: """A container of nullable device data as a column of elements. @@ -61,6 +62,8 @@ cdef class Column: self._children = children self._num_children = len(children) + __hash__ = None + cdef column_view view(self) nogil: """Generate a libcudf column_view to pass to libcudf algorithms. @@ -138,7 +141,7 @@ cdef class Column: cdef size_type null_count = libcudf_col.get().null_count() - cdef column_contents contents = move(libcudf_col.get().release()) + cdef column_contents contents = libcudf_col.get().release() # Note that when converting to cudf Column objects we'll need to pull # out the base object. @@ -247,7 +250,7 @@ cdef class Column: cdef const scalar* c_scalar = slr.get() cdef unique_ptr[column] c_result with nogil: - c_result = move(make_column_from_scalar(dereference(c_scalar), size)) + c_result = make_column_from_scalar(dereference(c_scalar), size) return Column.from_libcudf(move(c_result)) @staticmethod @@ -269,7 +272,7 @@ cdef class Column: cdef Scalar slr = Scalar.empty_like(like) cdef unique_ptr[column] c_result with nogil: - c_result = move(make_column_from_scalar(dereference(slr.get()), size)) + c_result = make_column_from_scalar(dereference(slr.get()), size) return Column.from_libcudf(move(c_result)) @staticmethod @@ -373,7 +376,7 @@ cdef class Column: """Create a copy of the column.""" cdef unique_ptr[column] c_result with nogil: - c_result = move(make_unique[column](self.view())) + c_result = make_unique[column](self.view()) return Column.from_libcudf(move(c_result)) @@ -384,6 +387,8 @@ cdef class ListColumnView: raise TypeError("Column is not a list type") self._column = col + __hash__ = None + cpdef child(self): """The data column of the underlying list column.""" return self._column.child(1) diff --git a/python/pylibcudf/pylibcudf/column_factories.pxd b/python/pylibcudf/pylibcudf/column_factories.pxd index fef02359240..d556085ab64 100644 --- a/python/pylibcudf/pylibcudf/column_factories.pxd +++ b/python/pylibcudf/pylibcudf/column_factories.pxd @@ -1,7 +1,5 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move -from pylibcudf.libcudf.types cimport mask_state, size_type +from pylibcudf.libcudf.types cimport mask_state from .column cimport Column from .types cimport DataType, size_type, type_id diff --git a/python/pylibcudf/pylibcudf/column_factories.pyi b/python/pylibcudf/pylibcudf/column_factories.pyi new file mode 100644 index 00000000000..c87fe423acb --- /dev/null +++ b/python/pylibcudf/pylibcudf/column_factories.pyi @@ -0,0 +1,20 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+from pylibcudf.column import Column +from pylibcudf.types import DataType, MaskState, TypeId + +def make_empty_column(type_or_id: DataType | TypeId) -> Column: ... +def make_numeric_column( + type_: DataType, size: int, mstate: MaskState +) -> Column: ... +def make_fixed_point_column( + type_: DataType, size: int, mstate: MaskState +) -> Column: ... +def make_timestamp_column( + type_: DataType, size: int, mstate: MaskState +) -> Column: ... +def make_duration_column( + type_: DataType, size: int, mstate: MaskState +) -> Column: ... +def make_fixed_width_column( + type_: DataType, size: int, mstate: MaskState +) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/column_factories.pyx b/python/pylibcudf/pylibcudf/column_factories.pyx index e9085e3ea02..c4969a7f502 100644 --- a/python/pylibcudf/pylibcudf/column_factories.pyx +++ b/python/pylibcudf/pylibcudf/column_factories.pyx @@ -17,6 +17,15 @@ from .types cimport DataType, type_id from .types import MaskState, TypeId +__all__ = [ + "make_duration_column", + "make_empty_column", + "make_fixed_point_column", + "make_fixed_width_column", + "make_numeric_column", + "make_timestamp_column", +] + cpdef Column make_empty_column(MakeEmptyColumnOperand type_or_id): """Creates an empty column of the specified type. @@ -39,29 +48,17 @@ cpdef Column make_empty_column(MakeEmptyColumnOperand type_or_id): if isinstance(type_or_id, TypeId): id = type_or_id with nogil: - result = move( - cpp_make_empty_column( - id - ) - ) + result = cpp_make_empty_column(id) else: raise TypeError( "Must pass a TypeId or DataType" ) elif MakeEmptyColumnOperand is DataType: with nogil: - result = move( - cpp_make_empty_column( - type_or_id.c_obj - ) - ) + result = cpp_make_empty_column(type_or_id.c_obj) elif MakeEmptyColumnOperand is type_id: with nogil: - result = move( - cpp_make_empty_column( - type_or_id - ) - ) + result = cpp_make_empty_column(type_or_id) else: raise TypeError( "Must pass a TypeId or DataType" @@ -92,12 +89,10 @@ cpdef Column make_numeric_column( else: raise TypeError("Invalid mask argument") with nogil: - result = move( - cpp_make_numeric_column( - type_.c_obj, - size, - state - ) + result = cpp_make_numeric_column( + type_.c_obj, + size, + state ) return Column.from_libcudf(move(result)) @@ -121,12 +116,10 @@ cpdef Column make_fixed_point_column( else: raise TypeError("Invalid mask argument") with nogil: - result = move( - cpp_make_fixed_point_column( - type_.c_obj, - size, - state - ) + result = cpp_make_fixed_point_column( + type_.c_obj, + size, + state ) return Column.from_libcudf(move(result)) @@ -151,12 +144,10 @@ cpdef Column make_timestamp_column( else: raise TypeError("Invalid mask argument") with nogil: - result = move( - cpp_make_timestamp_column( - type_.c_obj, - size, - state - ) + result = cpp_make_timestamp_column( + type_.c_obj, + size, + state ) return Column.from_libcudf(move(result)) @@ -181,12 +172,10 @@ cpdef Column make_duration_column( else: raise TypeError("Invalid mask argument") with nogil: - result = move( - cpp_make_duration_column( - type_.c_obj, - size, - state - ) + result = cpp_make_duration_column( + type_.c_obj, + size, + state ) return Column.from_libcudf(move(result)) @@ -211,12 +200,10 @@ cpdef Column make_fixed_width_column( else: raise TypeError("Invalid mask argument") with nogil: - result = move( - cpp_make_fixed_width_column( - type_.c_obj, - size, - state - ) + result = cpp_make_fixed_width_column( + type_.c_obj, + size, + state ) return Column.from_libcudf(move(result)) diff --git 
a/python/pylibcudf/pylibcudf/concatenate.pyi b/python/pylibcudf/pylibcudf/concatenate.pyi new file mode 100644 index 00000000000..79076f509e0 --- /dev/null +++ b/python/pylibcudf/pylibcudf/concatenate.pyi @@ -0,0 +1,8 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.table import Table + +def concatenate[ColumnOrTable: (Column, Table)]( + objects: list[ColumnOrTable], +) -> ColumnOrTable: ... diff --git a/python/pylibcudf/pylibcudf/concatenate.pyx b/python/pylibcudf/pylibcudf/concatenate.pyx index 8bdcc086e0f..42c5f34cf3e 100644 --- a/python/pylibcudf/pylibcudf/concatenate.pyx +++ b/python/pylibcudf/pylibcudf/concatenate.pyx @@ -12,6 +12,7 @@ from pylibcudf.libcudf.table.table_view cimport table_view from .column cimport Column from .table cimport Table +__all__ = ["concatenate"] cpdef concatenate(list objects): """Concatenate columns or tables. @@ -40,14 +41,14 @@ cpdef concatenate(list objects): c_tables.push_back((tbl).view()) with nogil: - c_tbl_result = move(cpp_concatenate.concatenate(c_tables)) + c_tbl_result = cpp_concatenate.concatenate(c_tables) return Table.from_libcudf(move(c_tbl_result)) elif isinstance(objects[0], Column): for column in objects: c_columns.push_back((column).view()) with nogil: - c_col_result = move(cpp_concatenate.concatenate(c_columns)) + c_col_result = cpp_concatenate.concatenate(c_columns) return Column.from_libcudf(move(c_col_result)) else: raise ValueError("input must be a list of Columns or Tables") diff --git a/python/pylibcudf/pylibcudf/contiguous_split.pxd b/python/pylibcudf/pylibcudf/contiguous_split.pxd new file mode 100644 index 00000000000..3745e893c3e --- /dev/null +++ b/python/pylibcudf/pylibcudf/contiguous_split.pxd @@ -0,0 +1,21 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from pylibcudf.libcudf.contiguous_split cimport packed_columns + +from .gpumemoryview cimport gpumemoryview +from .table cimport Table + + +cdef class PackedColumns: + cdef unique_ptr[packed_columns] c_obj + + @staticmethod + cdef PackedColumns from_libcudf(unique_ptr[packed_columns] data) + cpdef tuple release(self) + +cpdef PackedColumns pack(Table input) + +cpdef Table unpack(PackedColumns input) + +cpdef Table unpack_from_memoryviews(memoryview metadata, gpumemoryview gpu_data) diff --git a/python/pylibcudf/pylibcudf/contiguous_split.pyi b/python/pylibcudf/pylibcudf/contiguous_split.pyi new file mode 100644 index 00000000000..dd6328fbf23 --- /dev/null +++ b/python/pylibcudf/pylibcudf/contiguous_split.pyi @@ -0,0 +1,14 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.gpumemoryview import gpumemoryview +from pylibcudf.table import Table + +class PackedColumns: + def __init__(self): ... + def release(self) -> tuple[memoryview, gpumemoryview]: ... + +def pack(input: Table) -> PackedColumns: ... +def unpack(input: PackedColumns) -> Table: ... +def unpack_from_memoryviews( + metadata: memoryview, gpu_data: gpumemoryview +) -> Table: ... diff --git a/python/pylibcudf/pylibcudf/contiguous_split.pyx b/python/pylibcudf/pylibcudf/contiguous_split.pyx new file mode 100644 index 00000000000..2a40d42e6e9 --- /dev/null +++ b/python/pylibcudf/pylibcudf/contiguous_split.pyx @@ -0,0 +1,210 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +from cython.operator cimport dereference +from libc.stdint cimport uint8_t +from libcpp.memory cimport make_unique, unique_ptr +from libcpp.utility cimport move +from libcpp.vector cimport vector +from pylibcudf.libcudf.contiguous_split cimport ( + pack as cpp_pack, + packed_columns, + unpack as cpp_unpack, +) +from pylibcudf.libcudf.table.table cimport table +from pylibcudf.libcudf.table.table_view cimport table_view + +from rmm.pylibrmm.device_buffer cimport DeviceBuffer + +from .gpumemoryview cimport gpumemoryview +from .table cimport Table +from .utils cimport int_to_void_ptr + + +__all__ = [ + "PackedColumns", + "pack", + "unpack", + "unpack_from_memoryviews", +] + +cdef class HostBuffer: + """Owning host buffer that implements the buffer protocol""" + cdef unique_ptr[vector[uint8_t]] c_obj + cdef size_t nbytes + cdef Py_ssize_t[1] shape + cdef Py_ssize_t[1] strides + + @staticmethod + cdef HostBuffer from_unique_ptr( + unique_ptr[vector[uint8_t]] vec + ): + cdef HostBuffer out = HostBuffer() + out.c_obj = move(vec) + out.nbytes = dereference(out.c_obj).size() + out.shape[0] = out.nbytes + out.strides[0] = 1 + return out + + __hash__ = None + + def __getbuffer__(self, Py_buffer *buffer, int flags): + buffer.buf = dereference(self.c_obj).data() + buffer.format = NULL # byte + buffer.internal = NULL + buffer.itemsize = 1 + buffer.len = self.nbytes + buffer.ndim = 1 + buffer.obj = self + buffer.readonly = 0 + buffer.shape = self.shape + buffer.strides = self.strides + buffer.suboffsets = NULL + + def __releasebuffer__(self, Py_buffer *buffer): + pass + + +cdef class PackedColumns: + """Column data in a serialized format. + + Contains data from an array of columns in two contiguous buffers: + one on host, which contains table metadata and one on device which + contains the table data. + + For details, see :cpp:class:`cudf::packed_columns`. + """ + def __init__(self): + raise ValueError( + "PackedColumns should not be constructed directly. " + "Use one of the factories." + ) + + __hash__ = None + + @staticmethod + cdef PackedColumns from_libcudf(unique_ptr[packed_columns] data): + """Create a Python PackedColumns from a libcudf packed_columns.""" + cdef PackedColumns out = PackedColumns.__new__(PackedColumns) + out.c_obj = move(data) + return out + + cpdef tuple release(self): + """Releases and returns the underlying serialized metadata and gpu data. + + The ownership of the memory are transferred to the returned buffers. After + this call, `self` is empty. + + Returns + ------- + memoryview (of a HostBuffer) + The serialized metadata as contiguous host memory. + gpumemoryview (of a rmm.DeviceBuffer) + The serialized gpu data as contiguous device memory. + """ + if not (dereference(self.c_obj).metadata and dereference(self.c_obj).gpu_data): + raise ValueError("Cannot release empty PackedColumns") + + return ( + memoryview( + HostBuffer.from_unique_ptr(move(dereference(self.c_obj).metadata)) + ), + gpumemoryview( + DeviceBuffer.c_from_unique_ptr(move(dereference(self.c_obj).gpu_data)) + ) + ) + + +cpdef PackedColumns pack(Table input): + """Deep-copy a table into a serialized contiguous memory format. + + Later use `unpack` or `unpack_from_memoryviews` to unpack the serialized + data back into the table. + + Examples + -------- + >>> packed = pylibcudf.contiguous_split.pack(...) + >>> # Either unpack the whole `PackedColumns` at once. + >>> pylibcudf.contiguous_split.unpack(packed) + >>> # Or unpack the two serialized buffers in `PackedColumns`. 
+ >>> metadata, gpu_data = packed.release() + >>> pylibcudf.contiguous_split.unpack_from_memoryviews(metadata, gpu_data) + + For details, see :cpp:func:`cudf::pack`. + + Parameters + ---------- + input : Table + Table to pack. + + Returns + ------- + PackedColumns + The packed columns. + """ + return PackedColumns.from_libcudf( + make_unique[packed_columns](cpp_pack(input.view())) + ) + + +cpdef Table unpack(PackedColumns input): + """Deserialize the result of `pack`. + + Copies the result of a serialized table into a table. + Contrary to the libcudf C++ function, the returned table is a copy + of the serialized data. + + For details, see :cpp:func:`cudf::unpack`. + + Parameters + ---------- + input : PackedColumns + The packed columns to unpack. + + Returns + ------- + Table + Copy of the packed columns. + """ + cdef table_view v = cpp_unpack(dereference(input.c_obj)) + # Since `Table.from_table_view` doesn't support an arbitrary owning object, + # we copy the table, see . + cdef unique_ptr[table] t = make_unique[table](v) + return Table.from_libcudf(move(t)) + + +cpdef Table unpack_from_memoryviews(memoryview metadata, gpumemoryview gpu_data): + """Deserialize the result of `pack`. + + Copies the result of a serialized table into a table. + Contrary to the libcudf C++ function, the returned table is a copy + of the serialized data. + + For details, see :cpp:func:`cudf::unpack`. + + Parameters + ---------- + metadata : memoryview + The packed metadata to unpack. + gpu_data : gpumemoryview + The packed gpu_data to unpack. + + Returns + ------- + Table + Copy of the packed columns. + """ + if metadata.nbytes == 0: + if gpu_data.__cuda_array_interface__["data"][0] != 0: + raise ValueError("Expected an empty gpu_data from unpacking an empty table") + return Table.from_libcudf(make_unique[table](table_view())) + + # Extract the raw data pointers + cdef const uint8_t[::1] _metadata = metadata + cdef const uint8_t* metadata_ptr = &_metadata[0] + cdef const uint8_t* gpu_data_ptr = int_to_void_ptr(gpu_data.ptr) + + cdef table_view v = cpp_unpack(metadata_ptr, gpu_data_ptr) + # Since `Table.from_table_view` doesn't support an arbitrary owning object, + # we copy the table, see . + cdef unique_ptr[table] t = make_unique[table](v) + return Table.from_libcudf(move(t)) diff --git a/python/pylibcudf/pylibcudf/copying.pyi b/python/pylibcudf/pylibcudf/copying.pyi new file mode 100644 index 00000000000..6cf4ed48724 --- /dev/null +++ b/python/pylibcudf/pylibcudf/copying.pyi @@ -0,0 +1,54 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from enum import IntEnum +from typing import TypeVar + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar +from pylibcudf.table import Table + +class MaskAllocationPolicy(IntEnum): + NEVER = ... + RETAIN = ... + ALWAYS = ... + +class OutOfBoundsPolicy(IntEnum): + NULLIFY = ... + DONT_CHECK = ... + +ColumnOrTable = TypeVar("ColumnOrTable", Column, Table) + +def gather( + source_table: Table, gather_map: Column, bounds_policy: OutOfBoundsPolicy +) -> Table: ... +def scatter( + source: Table | list[Scalar], scatter_map: Column, target_table: Table +) -> Table: ... +def empty_like(input: ColumnOrTable) -> ColumnOrTable: ... +def allocate_like( + input_column: Column, policy: MaskAllocationPolicy, size: int | None = None +) -> Column: ... +def copy_range_in_place( + input_column: Column, + target_column: Column, + input_begin: int, + input_end: int, + target_begin: int, +) -> Column: ... 
+def copy_range( + input_column: Column, + target_column: Column, + input_begin: int, + input_end: int, + target_begin: int, +) -> Column: ... +def shift(input: Column, offset: int, fill_value: Scalar) -> Column: ... +def slice(input: ColumnOrTable, indices: list[int]) -> list[ColumnOrTable]: ... +def split(input: ColumnOrTable, splits: list[int]) -> list[ColumnOrTable]: ... +def copy_if_else( + lhs: Column | Scalar, rhs: Column | Scalar, boolean_mask: Column +) -> Column: ... +def boolean_mask_scatter( + input: Table | list[Scalar], target: Table, boolean_mask: Column +) -> Table: ... +def get_element(input_column: Column, index: int) -> Scalar: ... diff --git a/python/pylibcudf/pylibcudf/copying.pyx b/python/pylibcudf/pylibcudf/copying.pyx index 9743119d92a..fb8b6f9890e 100644 --- a/python/pylibcudf/pylibcudf/copying.pyx +++ b/python/pylibcudf/pylibcudf/copying.pyx @@ -36,6 +36,23 @@ from .table cimport Table from .utils cimport _as_vector +__all__ = [ + "MaskAllocationPolicy", + "OutOfBoundsPolicy", + "allocate_like", + "boolean_mask_scatter", + "copy_if_else", + "copy_range", + "copy_range_in_place", + "empty_like", + "gather", + "get_element", + "scatter", + "shift", + "slice", + "split", +] + cpdef Table gather( Table source_table, Column gather_map, @@ -67,13 +84,12 @@ cpdef Table gather( """ cdef unique_ptr[table] c_result with nogil: - c_result = move( - cpp_copying.gather( - source_table.view(), - gather_map.view(), - bounds_policy - ) + c_result = cpp_copying.gather( + source_table.view(), + gather_map.view(), + bounds_policy ) + return Table.from_libcudf(move(c_result)) @@ -121,22 +137,18 @@ cpdef Table scatter( cdef vector[reference_wrapper[const scalar]] source_scalars if TableOrListOfScalars is Table: with nogil: - c_result = move( - cpp_copying.scatter( - source.view(), - scatter_map.view(), - target_table.view(), - ) + c_result = cpp_copying.scatter( + source.view(), + scatter_map.view(), + target_table.view(), ) else: source_scalars = _as_vector(source) with nogil: - c_result = move( - cpp_copying.scatter( - source_scalars, - scatter_map.view(), - target_table.view(), - ) + c_result = cpp_copying.scatter( + source_scalars, + scatter_map.view(), + target_table.view(), ) return Table.from_libcudf(move(c_result)) @@ -160,11 +172,11 @@ cpdef ColumnOrTable empty_like(ColumnOrTable input): cdef unique_ptr[column] c_col_result if ColumnOrTable is Column: with nogil: - c_col_result = move(cpp_copying.empty_like(input.view())) + c_col_result = cpp_copying.empty_like(input.view()) return Column.from_libcudf(move(c_col_result)) else: with nogil: - c_tbl_result = move(cpp_copying.empty_like(input.view())) + c_tbl_result = cpp_copying.empty_like(input.view()) return Table.from_libcudf(move(c_tbl_result)) @@ -195,13 +207,11 @@ cpdef Column allocate_like( cdef size_type c_size = size if size is not None else input_column.size() with nogil: - c_result = move( - cpp_copying.allocate_like( + c_result = cpp_copying.allocate_like( input_column.view(), c_size, policy, ) - ) return Column.from_libcudf(move(c_result)) @@ -298,12 +308,12 @@ cpdef Column copy_range( cdef unique_ptr[column] c_result with nogil: - c_result = move(cpp_copying.copy_range( + c_result = cpp_copying.copy_range( input_column.view(), target_column.view(), input_begin, input_end, - target_begin) + target_begin ) return Column.from_libcudf(move(c_result)) @@ -337,13 +347,11 @@ cpdef Column shift(Column input, size_type offset, Scalar fill_value): """ cdef unique_ptr[column] c_result with nogil: - c_result = move( - 
cpp_copying.shift( + c_result = cpp_copying.shift( input.view(), offset, dereference(fill_value.c_obj) ) - ) return Column.from_libcudf(move(c_result)) @@ -378,7 +386,7 @@ cpdef list slice(ColumnOrTable input, list indices): cdef int i if ColumnOrTable is Column: with nogil: - c_col_result = move(cpp_copying.slice(input.view(), c_indices)) + c_col_result = cpp_copying.slice(input.view(), c_indices) return [ Column.from_column_view(c_col_result[i], input) @@ -386,7 +394,7 @@ cpdef list slice(ColumnOrTable input, list indices): ] else: with nogil: - c_tbl_result = move(cpp_copying.slice(input.view(), c_indices)) + c_tbl_result = cpp_copying.slice(input.view(), c_indices) return [ Table.from_table_view(c_tbl_result[i], input) @@ -418,7 +426,7 @@ cpdef list split(ColumnOrTable input, list splits): if ColumnOrTable is Column: with nogil: - c_col_result = move(cpp_copying.split(input.view(), c_splits)) + c_col_result = cpp_copying.split(input.view(), c_splits) return [ Column.from_column_view(c_col_result[i], input) @@ -426,7 +434,7 @@ cpdef list split(ColumnOrTable input, list splits): ] else: with nogil: - c_tbl_result = move(cpp_copying.split(input.view(), c_splits)) + c_tbl_result = cpp_copying.split(input.view(), c_splits) return [ Table.from_table_view(c_tbl_result[i], input) @@ -472,29 +480,25 @@ cpdef Column copy_if_else( if LeftCopyIfElseOperand is Column and RightCopyIfElseOperand is Column: with nogil: - result = move( - cpp_copying.copy_if_else(lhs.view(), rhs.view(), boolean_mask.view()) + result = cpp_copying.copy_if_else( + lhs.view(), + rhs.view(), + boolean_mask.view() ) elif LeftCopyIfElseOperand is Column and RightCopyIfElseOperand is Scalar: with nogil: - result = move( - cpp_copying.copy_if_else( - lhs.view(), dereference(rhs.c_obj), boolean_mask.view() - ) + result = cpp_copying.copy_if_else( + lhs.view(), dereference(rhs.c_obj), boolean_mask.view() ) elif LeftCopyIfElseOperand is Scalar and RightCopyIfElseOperand is Column: with nogil: - result = move( - cpp_copying.copy_if_else( - dereference(lhs.c_obj), rhs.view(), boolean_mask.view() - ) + result = cpp_copying.copy_if_else( + dereference(lhs.c_obj), rhs.view(), boolean_mask.view() ) else: with nogil: - result = move( - cpp_copying.copy_if_else( - dereference(lhs.c_obj), dereference(rhs.c_obj), boolean_mask.view() - ) + result = cpp_copying.copy_if_else( + dereference(lhs.c_obj), dereference(rhs.c_obj), boolean_mask.view() ) return Column.from_libcudf(move(result)) @@ -541,22 +545,18 @@ cpdef Table boolean_mask_scatter( if TableOrListOfScalars is Table: with nogil: - result = move( - cpp_copying.boolean_mask_scatter( - input.view(), - target.view(), - boolean_mask.view() - ) + result = cpp_copying.boolean_mask_scatter( + input.view(), + target.view(), + boolean_mask.view() ) else: source_scalars = _as_vector(input) with nogil: - result = move( - cpp_copying.boolean_mask_scatter( - source_scalars, - target.view(), - boolean_mask.view(), - ) + result = cpp_copying.boolean_mask_scatter( + source_scalars, + target.view(), + boolean_mask.view(), ) return Table.from_libcudf(move(result)) @@ -586,8 +586,6 @@ cpdef Scalar get_element(Column input_column, size_type index): """ cdef unique_ptr[scalar] c_output with nogil: - c_output = move( - cpp_copying.get_element(input_column.view(), index) - ) + c_output = cpp_copying.get_element(input_column.view(), index) return Scalar.from_libcudf(move(c_output)) diff --git a/python/pylibcudf/pylibcudf/datetime.pxd b/python/pylibcudf/pylibcudf/datetime.pxd index 2fce48cf1b4..335ef435f9b 
100644 --- a/python/pylibcudf/pylibcudf/datetime.pxd +++ b/python/pylibcudf/pylibcudf/datetime.pxd @@ -1,8 +1,56 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from .column cimport Column +from pylibcudf.column cimport Column +from pylibcudf.libcudf.datetime cimport datetime_component, rounding_frequency +from pylibcudf.scalar cimport Scalar +ctypedef fused ColumnOrScalar: + Column + Scalar -cpdef Column extract_year( - Column col +cpdef Column extract_millisecond_fraction( + Column input ) + +cpdef Column extract_microsecond_fraction( + Column input +) + +cpdef Column extract_nanosecond_fraction( + Column input +) + +cpdef Column extract_datetime_component( + Column input, + datetime_component component +) + +cpdef Column ceil_datetimes( + Column input, + rounding_frequency freq +) + +cpdef Column floor_datetimes( + Column input, + rounding_frequency freq +) + +cpdef Column round_datetimes( + Column input, + rounding_frequency freq +) + +cpdef Column add_calendrical_months( + Column timestamps, + ColumnOrScalar months, +) + +cpdef Column day_of_year(Column input) + +cpdef Column is_leap_year(Column input) + +cpdef Column last_day_of_month(Column input) + +cpdef Column extract_quarter(Column input) + +cpdef Column days_in_month(Column input) diff --git a/python/pylibcudf/pylibcudf/datetime.pyi b/python/pylibcudf/pylibcudf/datetime.pyi new file mode 100644 index 00000000000..6a3ae7953d9 --- /dev/null +++ b/python/pylibcudf/pylibcudf/datetime.pyi @@ -0,0 +1,45 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from enum import IntEnum + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar + +class DatetimeComponent(IntEnum): + YEAR = ... + MONTH = ... + DAY = ... + WEEKDAY = ... + HOUR = ... + MINUTE = ... + SECOND = ... + MILLISECOND = ... + MICROSECOND = ... + NANOSECOND = ... + +class RoundingFrequency(IntEnum): + DAY = ... + HOUR = ... + MINUTE = ... + SECOND = ... + MILLISECOND = ... + MICROSECOND = ... + NANOSECOND = ... + +def extract_millisecond_fraction(input: Column) -> Column: ... +def extract_microsecond_fraction(input: Column) -> Column: ... +def extract_nanosecond_fraction(input: Column) -> Column: ... +def extract_datetime_component( + input: Column, component: DatetimeComponent +) -> Column: ... +def ceil_datetimes(input: Column, freq: RoundingFrequency) -> Column: ... +def floor_datetimes(input: Column, freq: RoundingFrequency) -> Column: ... +def round_datetimes(input: Column, freq: RoundingFrequency) -> Column: ... +def add_calendrical_months( + input: Column, months: Column | Scalar +) -> Column: ... +def day_of_year(input: Column) -> Column: ... +def is_leap_year(input: Column) -> Column: ... +def last_day_of_month(input: Column) -> Column: ... +def extract_quarter(input: Column) -> Column: ... +def days_in_month(input: Column) -> Column: ... 
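[Editor's note: the datetime.pyi stub above shows the reshaped API surface. The old per-field helpers (extract_year, extract_month, ...) collapse into a single extract_datetime_component driven by the DatetimeComponent enum, and ceil/floor/round_datetimes share the RoundingFrequency enum. A short usage sketch follows; it is an illustration, not part of the patch, the sample values are made up, and it assumes pylibcudf.interop.from_arrow converts a pyarrow timestamp array into a Column.]

    import pyarrow as pa
    import pylibcudf as plc

    # Two sample timestamps, one day apart, at nanosecond resolution.
    ts = plc.interop.from_arrow(
        pa.array([0, 86_400_000_000_000], type=pa.timestamp("ns"))
    )

    # One entry point plus an enum replaces the removed extract_* helpers.
    years = plc.datetime.extract_datetime_component(
        ts, plc.datetime.DatetimeComponent.YEAR
    )

    # Rounding follows the same pattern via RoundingFrequency.
    floored = plc.datetime.floor_datetimes(
        ts, plc.datetime.RoundingFrequency.DAY
    )
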
diff --git a/python/pylibcudf/pylibcudf/datetime.pyx b/python/pylibcudf/pylibcudf/datetime.pyx index e8e0caaf42d..b100e3e22d0 100644 --- a/python/pylibcudf/pylibcudf/datetime.pyx +++ b/python/pylibcudf/pylibcudf/datetime.pyx @@ -3,79 +3,377 @@ from libcpp.memory cimport unique_ptr from libcpp.utility cimport move from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.datetime cimport ( + add_calendrical_months as cpp_add_calendrical_months, + ceil_datetimes as cpp_ceil_datetimes, + datetime_component, day_of_year as cpp_day_of_year, - extract_day as cpp_extract_day, - extract_hour as cpp_extract_hour, + days_in_month as cpp_days_in_month, + extract_datetime_component as cpp_extract_datetime_component, extract_microsecond_fraction as cpp_extract_microsecond_fraction, extract_millisecond_fraction as cpp_extract_millisecond_fraction, - extract_minute as cpp_extract_minute, - extract_month as cpp_extract_month, extract_nanosecond_fraction as cpp_extract_nanosecond_fraction, - extract_second as cpp_extract_second, - extract_weekday as cpp_extract_weekday, - extract_year as cpp_extract_year, + extract_quarter as cpp_extract_quarter, + floor_datetimes as cpp_floor_datetimes, + is_leap_year as cpp_is_leap_year, + last_day_of_month as cpp_last_day_of_month, + round_datetimes as cpp_round_datetimes, + rounding_frequency, ) +from pylibcudf.libcudf.datetime import \ + datetime_component as DatetimeComponent # no-cython-lint +from pylibcudf.libcudf.datetime import \ + rounding_frequency as RoundingFrequency # no-cython-lint + +from cython.operator cimport dereference + from .column cimport Column +__all__ = [ + "DatetimeComponent", + "RoundingFrequency", + "add_calendrical_months", + "ceil_datetimes", + "day_of_year", + "days_in_month", + "extract_datetime_component", + "extract_microsecond_fraction", + "extract_millisecond_fraction", + "extract_nanosecond_fraction", + "extract_quarter", + "floor_datetimes", + "is_leap_year", + "last_day_of_month", + "round_datetimes", +] + +cpdef Column extract_millisecond_fraction( + Column input +): + """ + Extract the millisecond from a datetime column. + + For details, see :cpp:func:`extract_millisecond_fraction`. + + Parameters + ---------- + input : Column + The column to extract the millisecond from. + + Returns + ------- + Column + Column with the extracted milliseconds. + """ + cdef unique_ptr[column] result + + with nogil: + result = cpp_extract_millisecond_fraction(input.view()) + return Column.from_libcudf(move(result)) + +cpdef Column extract_microsecond_fraction( + Column input +): + """ + Extract the microsecond fraction from a datetime column. + + For details, see :cpp:func:`extract_microsecond_fraction`. + + Parameters + ---------- + input : Column + The column to extract the microsecond fraction from. + + Returns + ------- + Column + Column with the extracted microsecond fractions. + """ + cdef unique_ptr[column] result + + with nogil: + result = cpp_extract_microsecond_fraction(input.view()) + return Column.from_libcudf(move(result)) + +cpdef Column extract_nanosecond_fraction( + Column input +): + """ + Extract the nanosecond fraction from a datetime column. + + For details, see :cpp:func:`extract_nanosecond_fraction`. + + Parameters + ---------- + input : Column + The column to extract the nanosecond fraction from. + + Returns + ------- + Column + Column with the extracted nanosecond fractions. 
+ """ + cdef unique_ptr[column] result + + with nogil: + result = cpp_extract_nanosecond_fraction(input.view()) + return Column.from_libcudf(move(result)) + +cpdef Column extract_datetime_component( + Column input, + datetime_component component +): + """ + Extract a datetime component from a datetime column. + + For details, see :cpp:func:`cudf::extract_datetime_component`. + + Parameters + ---------- + input : Column + The column to extract the component from. + component : DatetimeComponent + The datetime component to extract. + + Returns + ------- + Column + Column with the extracted component. + """ + cdef unique_ptr[column] result + + with nogil: + result = cpp_extract_datetime_component(input.view(), component) + return Column.from_libcudf(move(result)) + +cpdef Column ceil_datetimes( + Column input, + rounding_frequency freq +): + """ + Round datetimes up to the nearest multiple of the given frequency. + + For details, see :cpp:func:`ceil_datetimes`. + + Parameters + ---------- + input : Column + The column of input datetime values. + freq : rounding_frequency + The frequency to round up to. + + Returns + ------- + Column + Column of the same datetime resolution as the input column. + """ + cdef unique_ptr[column] result + + with nogil: + result = cpp_ceil_datetimes(input.view(), freq) + return Column.from_libcudf(move(result)) + +cpdef Column floor_datetimes( + Column input, + rounding_frequency freq +): + """ + Round datetimes down to the nearest multiple of the given frequency. + + For details, see :cpp:func:`floor_datetimes`. + + Parameters + ---------- + input : Column + The column of input datetime values. + freq : rounding_frequency + The frequency to round down to. + + Returns + ------- + Column + Column of the same datetime resolution as the input column. + """ + cdef unique_ptr[column] result + + with nogil: + result = cpp_floor_datetimes(input.view(), freq) + return Column.from_libcudf(move(result)) + +cpdef Column round_datetimes( + Column input, + rounding_frequency freq +): + """ + Round datetimes to the nearest multiple of the given frequency. + + For details, see :cpp:func:`round_datetimes`. + + Parameters + ---------- + input : Column + The column of input datetime values. + freq : rounding_frequency + The frequency to round to. + + Returns + ------- + Column + Column of the same datetime resolution as the input column. + """ + cdef unique_ptr[column] result + + with nogil: + result = cpp_round_datetimes(input.view(), freq) + return Column.from_libcudf(move(result)) -cpdef Column extract_year( - Column values +cpdef Column add_calendrical_months( + Column input, + ColumnOrScalar months, ): """ - Extract the year from a datetime column. + Adds or subtracts a number of months from the datetime + type and returns a timestamp column that is of the same + type as the input timestamps column. + + For details, see :cpp:func:`add_calendrical_months`. Parameters ---------- - values : Column - The column to extract the year from. + input : Column + The column of input timestamp values. + months : ColumnOrScalar + The number of months to add. Returns ------- Column - Column with the extracted years. + Column of computed timestamps. 
""" + if not isinstance(months, (Column, Scalar)): + raise TypeError("Must pass a Column or Scalar") + cdef unique_ptr[column] result with nogil: - result = move(cpp_extract_year(values.view())) + result = cpp_add_calendrical_months( + input.view(), + months.view() if ColumnOrScalar is Column else + dereference(months.get()) + ) return Column.from_libcudf(move(result)) +cpdef Column day_of_year(Column input): + """ + Computes the day number since the start of + the year from the datetime. The value is between + [1, {365-366}]. -def extract_datetime_component(Column col, str field): + For details, see :cpp:func:`day_of_year`. - cdef unique_ptr[column] c_result + Parameters + ---------- + input : Column + The column of input datetime values. + + Returns + ------- + Column + Column of day numbers. + """ + cdef unique_ptr[column] result + + with nogil: + result = cpp_day_of_year(input.view()) + return Column.from_libcudf(move(result)) + +cpdef Column is_leap_year(Column input): + """ + Check if the year of the given date is a leap year. + + For details, see :cpp:func:`is_leap_year`. + + Parameters + ---------- + input : Column + The column of input datetime values. + + Returns + ------- + Column + Column of bools indicating whether the given year + is a leap year. + """ + cdef unique_ptr[column] result with nogil: - if field == "year": - c_result = move(cpp_extract_year(col.view())) - elif field == "month": - c_result = move(cpp_extract_month(col.view())) - elif field == "day": - c_result = move(cpp_extract_day(col.view())) - elif field == "weekday": - c_result = move(cpp_extract_weekday(col.view())) - elif field == "hour": - c_result = move(cpp_extract_hour(col.view())) - elif field == "minute": - c_result = move(cpp_extract_minute(col.view())) - elif field == "second": - c_result = move(cpp_extract_second(col.view())) - elif field == "millisecond": - c_result = move( - cpp_extract_millisecond_fraction(col.view()) - ) - elif field == "microsecond": - c_result = move( - cpp_extract_microsecond_fraction(col.view()) - ) - elif field == "nanosecond": - c_result = move( - cpp_extract_nanosecond_fraction(col.view()) - ) - elif field == "day_of_year": - c_result = move(cpp_day_of_year(col.view())) - else: - raise ValueError(f"Invalid datetime field: '{field}'") - - return Column.from_libcudf(move(c_result)) + result = cpp_is_leap_year(input.view()) + return Column.from_libcudf(move(result)) + +cpdef Column last_day_of_month(Column input): + """ + Computes the last day of the month. + + For details, see :cpp:func:`last_day_of_month`. + + Parameters + ---------- + input : Column + The column of input datetime values. + + Returns + ------- + Column + Column of ``TIMESTAMP_DAYS`` representing the last day + of the month. + """ + cdef unique_ptr[column] result + + with nogil: + result = cpp_last_day_of_month(input.view()) + return Column.from_libcudf(move(result)) + +cpdef Column extract_quarter(Column input): + """ + Returns the quarter (ie. a value from {1, 2, 3, 4}) + that the date is in. + + For details, see :cpp:func:`extract_quarter`. + + Parameters + ---------- + input : Column + The column of input datetime values. + + Returns + ------- + Column + Column indicating which quarter the date is in. + """ + cdef unique_ptr[column] result + + with nogil: + result = cpp_extract_quarter(input.view()) + return Column.from_libcudf(move(result)) + +cpdef Column days_in_month(Column input): + """ + Extract the number of days in the month. + + For details, see :cpp:func:`days_in_month`. 
+ + Parameters + ---------- + input : Column + The column of input datetime values. + + Returns + ------- + Column + Column of the number of days in the given month. + """ + cdef unique_ptr[column] result + + with nogil: + result = cpp_days_in_month(input.view()) + return Column.from_libcudf(move(result)) diff --git a/python/pylibcudf/pylibcudf/experimental.pyi b/python/pylibcudf/pylibcudf/experimental.pyi new file mode 100644 index 00000000000..bbfb86b0ff6 --- /dev/null +++ b/python/pylibcudf/pylibcudf/experimental.pyi @@ -0,0 +1,5 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +def enable_prefetching(key: str) -> None: ... +def disable_prefetching(key: str) -> None: ... +def prefetch_debugging(enable: bool) -> None: ... diff --git a/python/pylibcudf/pylibcudf/experimental.pyx b/python/pylibcudf/pylibcudf/experimental.pyx index b25a53e13b2..d94d6d087ac 100644 --- a/python/pylibcudf/pylibcudf/experimental.pyx +++ b/python/pylibcudf/pylibcudf/experimental.pyx @@ -5,6 +5,8 @@ from libcpp.string cimport string from pylibcudf.libcudf cimport experimental as cpp_experimental +__all__ = ["disable_prefetching", "enable_prefetching", "prefetch_debugging"] + cpdef enable_prefetching(str key): """Turn on prefetch instructions for the given key. diff --git a/python/pylibcudf/pylibcudf/expressions.pyi b/python/pylibcudf/pylibcudf/expressions.pyi new file mode 100644 index 00000000000..4dcccaaa1fc --- /dev/null +++ b/python/pylibcudf/pylibcudf/expressions.pyi @@ -0,0 +1,81 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from enum import IntEnum + +from pylibcudf.scalar import Scalar + +class TableReference(IntEnum): + LEFT = ... + RIGHT = ... + +class ASTOperator(IntEnum): + ADD = ... + SUB = ... + MUL = ... + DIV = ... + TRUE_DIV = ... + FLOOR_DIV = ... + MOD = ... + PYMOD = ... + POW = ... + EQUAL = ... + NULL_EQUAL = ... + NOT_EQUAL = ... + LESS = ... + GREATER = ... + LESS_EQUAL = ... + GREATER_EQUAL = ... + BITWISE_AND = ... + BITWISE_OR = ... + BITWISE_XOR = ... + NULL_LOGICAL_AND = ... + LOGICAL_AND = ... + NULL_LOGICAL_OR = ... + LOGICAL_OR = ... + IDENTITY = ... + IS_NULL = ... + SIN = ... + COS = ... + TAN = ... + ARCSIN = ... + ARCCOS = ... + ARCTAN = ... + SINH = ... + COSH = ... + TANH = ... + ARCSINH = ... + ARCCOSH = ... + ARCTANH = ... + EXP = ... + LOG = ... + SQRT = ... + CBRT = ... + CEIL = ... + FLOOR = ... + ABS = ... + RINT = ... + BIT_INVERT = ... + NOT = ... + +class Expression: + def __init__(self): ... + +class Literal(Expression): + def __init__(self, value: Scalar): ... + +class ColumnReference(Expression): + def __init__( + self, index: int, table_source: TableReference = TableReference.LEFT + ): ... + +class ColumnNameReference(Expression): + def __init__(self, name: str): ... + +class Operation(Expression): + def __init__( + self, + op: ASTOperator, + left: Expression, + right: Expression | None = None, + ): ... + +def to_expression(expr: str, column_names: tuple[str, ...]) -> Expression: ... diff --git a/python/pylibcudf/pylibcudf/expressions.pyx b/python/pylibcudf/pylibcudf/expressions.pyx index a44c9e25987..31121785e27 100644 --- a/python/pylibcudf/pylibcudf/expressions.pyx +++ b/python/pylibcudf/pylibcudf/expressions.pyx @@ -1,11 +1,26 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
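An editorial aside before the expressions changes below: the datetime bindings above map one-to-one onto the libcudf functions. A minimal usage sketch, not authoritative, in which the Python-side enum spellings (`DatetimeComponent.YEAR`, `RoundingFrequency.DAY`) are assumptions to verify against the bindings:

```python
import pyarrow as pa
import pylibcudf as plc

# Two timestamps one day apart: 1970-01-01 and 1970-01-02 (milliseconds).
ts = plc.interop.from_arrow(pa.array([0, 86_400_000], type=pa.timestamp("ms")))

years = plc.datetime.extract_datetime_component(
    ts, plc.datetime.DatetimeComponent.YEAR
)
rounded = plc.datetime.round_datetimes(ts, plc.datetime.RoundingFrequency.DAY)
quarters = plc.datetime.extract_quarter(ts)

# add_calendrical_months accepts a Column or Scalar of months; libcudf
# expects a 16-bit integer month count.
one_month = plc.interop.from_arrow(pa.scalar(1, type=pa.int16()))
shifted = plc.datetime.add_calendrical_months(ts, one_month)
```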
+import ast +import functools + +import pyarrow as pa + from pylibcudf.libcudf.expressions import \ ast_operator as ASTOperator # no-cython-lint from pylibcudf.libcudf.expressions import \ table_reference as TableReference # no-cython-lint from cython.operator cimport dereference -from libc.stdint cimport int32_t, int64_t +from libc.stdint cimport ( + int8_t, + int16_t, + int32_t, + int64_t, + uint8_t, + uint16_t, + uint32_t, + uint64_t, +) +from libcpp cimport bool from libcpp.memory cimport make_unique, unique_ptr from libcpp.string cimport string from libcpp.utility cimport move @@ -18,12 +33,14 @@ from pylibcudf.libcudf.scalar.scalar cimport ( ) from pylibcudf.libcudf.types cimport size_type, type_id from pylibcudf.libcudf.wrappers.durations cimport ( + duration_D, duration_ms, duration_ns, duration_s, duration_us, ) from pylibcudf.libcudf.wrappers.timestamps cimport ( + timestamp_D, timestamp_ms, timestamp_ns, timestamp_s, @@ -34,9 +51,22 @@ from .scalar cimport Scalar from .traits cimport is_chrono, is_numeric from .types cimport DataType +from .interop import from_arrow + # Aliases for simplicity ctypedef unique_ptr[libcudf_exp.expression] expression_ptr +__all__ = [ + "ASTOperator", + "ColumnNameReference", + "ColumnReference", + "Expression", + "Literal", + "Operation", + "TableReference", + "to_expression" +] + # Define this class just to have a docstring for it cdef class Expression: """ @@ -46,7 +76,7 @@ cdef class Expression: For details, see :cpp:class:`cudf::ast::expression`. """ - pass + __hash__ = None cdef class Literal(Expression): """ @@ -78,6 +108,34 @@ cdef class Literal(Expression): self.c_obj = move(make_unique[libcudf_exp.literal]( dereference(self.scalar.c_obj) )) + elif tid == type_id.INT16: + self.c_obj = move(make_unique[libcudf_exp.literal]( + dereference(self.scalar.c_obj) + )) + elif tid == type_id.INT8: + self.c_obj = move(make_unique[libcudf_exp.literal]( + dereference(self.scalar.c_obj) + )) + elif tid == type_id.UINT64: + self.c_obj = move(make_unique[libcudf_exp.literal]( + dereference(self.scalar.c_obj) + )) + elif tid == type_id.UINT32: + self.c_obj = move(make_unique[libcudf_exp.literal]( + dereference(self.scalar.c_obj) + )) + elif tid == type_id.UINT16: + self.c_obj = move(make_unique[libcudf_exp.literal]( + dereference(self.scalar.c_obj) + )) + elif tid == type_id.UINT8: + self.c_obj = move(make_unique[libcudf_exp.literal]( + dereference(self.scalar.c_obj) + )) + elif tid == type_id.BOOL8: + self.c_obj = move(make_unique[libcudf_exp.literal]( + dereference(self.scalar.c_obj) + )) elif tid == type_id.FLOAT64: self.c_obj = move(make_unique[libcudf_exp.literal]( dereference(self.scalar.c_obj) @@ -110,6 +168,10 @@ cdef class Literal(Expression): self.c_obj = move(make_unique[libcudf_exp.literal]( dereference(self.scalar.c_obj) )) + elif tid == type_id.TIMESTAMP_DAYS: + self.c_obj = move(make_unique[libcudf_exp.literal]( + dereference(self.scalar.c_obj) + )) elif tid == type_id.DURATION_NANOSECONDS: self.c_obj = move(make_unique[libcudf_exp.literal]( dereference(self.scalar.c_obj) @@ -130,6 +192,10 @@ cdef class Literal(Expression): self.c_obj = move(make_unique[libcudf_exp.literal]( dereference(self.scalar.c_obj) )) + elif tid == type_id.DURATION_DAYS: + self.c_obj = move(make_unique[libcudf_exp.literal]( + dereference(self.scalar.c_obj) + )) else: raise NotImplementedError( f"Don't know how to make literal with type id {tid}" @@ -203,3 +269,217 @@ cdef class ColumnNameReference(Expression): move(make_unique[libcudf_exp.column_name_reference]( 
                (name.encode("utf-8"))
            ))
+
+
+# This dictionary encodes the mapping from Python AST operators to their cudf
+# counterparts.
+_python_cudf_operator_map = {
+    # Binary operators
+    ast.Add: ASTOperator.ADD,
+    ast.Sub: ASTOperator.SUB,
+    ast.Mult: ASTOperator.MUL,
+    ast.Div: ASTOperator.DIV,
+    ast.FloorDiv: ASTOperator.FLOOR_DIV,
+    ast.Mod: ASTOperator.PYMOD,
+    ast.Pow: ASTOperator.POW,
+    ast.Eq: ASTOperator.EQUAL,
+    ast.NotEq: ASTOperator.NOT_EQUAL,
+    ast.Lt: ASTOperator.LESS,
+    ast.Gt: ASTOperator.GREATER,
+    ast.LtE: ASTOperator.LESS_EQUAL,
+    ast.GtE: ASTOperator.GREATER_EQUAL,
+    ast.BitXor: ASTOperator.BITWISE_XOR,
+    # TODO: The mapping of logical/bitwise operators here is inconsistent with
+    # pandas. In pandas, both `BitAnd` and `And` map to
+    # `ASTOperator.LOGICAL_AND` for booleans, while they map to
+    # `ASTOperator.BITWISE_AND` for integers. However, there is no good way to
+    # encode this at present because expressions can be arbitrarily nested so
+    # we won't know the dtype of the input without inserting a much more
+    # complex traversal of the expression tree to determine the output types at
+    # each node. For now, we'll rely on users to use the appropriate operator.
+    ast.BitAnd: ASTOperator.BITWISE_AND,
+    ast.BitOr: ASTOperator.BITWISE_OR,
+    ast.And: ASTOperator.LOGICAL_AND,
+    ast.Or: ASTOperator.LOGICAL_OR,
+    # Unary operators
+    ast.Invert: ASTOperator.BIT_INVERT,
+    ast.Not: ASTOperator.NOT,
+    # TODO: Missing USub, possibly other unary ops?
+}
+
+
+# Mapping between Python function names encoded in an ast.Call node and the
+# corresponding libcudf C++ AST operators.
+_python_cudf_function_map = {
+    # TODO: Operators listed on
+    # https://pandas.pydata.org/pandas-docs/stable/user_guide/enhancingperf.html#expression-evaluation-via-eval # noqa: E501
+    # that we don't support yet:
+    # expm1, log1p, arctan2 and log10.
+    "isnull": ASTOperator.IS_NULL,
+    "isna": ASTOperator.IS_NULL,
+    "sin": ASTOperator.SIN,
+    "cos": ASTOperator.COS,
+    "tan": ASTOperator.TAN,
+    "arcsin": ASTOperator.ARCSIN,
+    "arccos": ASTOperator.ARCCOS,
+    "arctan": ASTOperator.ARCTAN,
+    "sinh": ASTOperator.SINH,
+    "cosh": ASTOperator.COSH,
+    "tanh": ASTOperator.TANH,
+    "arcsinh": ASTOperator.ARCSINH,
+    "arccosh": ASTOperator.ARCCOSH,
+    "arctanh": ASTOperator.ARCTANH,
+    "exp": ASTOperator.EXP,
+    "log": ASTOperator.LOG,
+    "sqrt": ASTOperator.SQRT,
+    "abs": ASTOperator.ABS,
+    "ceil": ASTOperator.CEIL,
+    "floor": ASTOperator.FLOOR,
+    # TODO: Operators supported by libcudf with no Python function analog.
+    # ast.rint: ASTOperator.RINT,
+    # ast.cbrt: ASTOperator.CBRT,
+}
+
+
+class ExpressionTransformer(ast.NodeVisitor):
+    """A NodeVisitor specialized for constructing a libcudf expression tree.
+
+    This visitor is designed to handle AST nodes that have libcudf equivalents.
+    It constructs column references from names and literals from constants,
+    then builds up operations. The resulting expression is returned by the
+    `visit` method.
+
+    Parameters
+    ----------
+    column_mapping : dict[str, ColumnNameReference | ColumnReference]
+        Mapping from names to column references or column name references.
+        The former can be used for `compute_column`, the latter in IO filters.
+ """ + + def __init__(self, dict column_mapping): + self.column_mapping = column_mapping + + def generic_visit(self, node): + raise ValueError( + f"Not expecting expression to have node of type {node.__class__.__name__}" + ) + + def visit_Module(self, node): + try: + expr, = node.body + except ValueError: + raise ValueError( + f"Expecting exactly one expression, not {len(node.body)}" + ) + return self.visit(expr) + + def visit_Expr(self, node): + return self.visit(node.value) + + def visit_Name(self, node): + try: + return self.column_mapping[node.id] + except KeyError: + raise ValueError(f"Unknown column name {node.id}") + + def visit_Constant(self, node): + if not isinstance(node.value, (float, int, str, complex)): + raise ValueError( + f"Unsupported literal {repr(node.value)} of type " + "{type(node.value).__name__}" + ) + return Literal(from_arrow(pa.scalar(node.value))) + + def visit_UnaryOp(self, node): + operand = self.visit(node.operand) + if isinstance(node.op, ast.USub): + # TODO: Except for leaf nodes, we won't know the type of the + # operand, so there's no way to know whether this should be a float + # or an int. We should maybe see what Spark does, and this will + # probably require casting. + minus_one = Literal(from_arrow(pa.scalar(-1))) + return Operation(ASTOperator.MUL, minus_one, operand) + elif isinstance(node.op, ast.UAdd): + return operand + else: + op = _python_cudf_operator_map[type(node.op)] + return Operation(op, operand) + + def visit_BinOp(self, node): + left = self.visit(node.left) + right = self.visit(node.right) + op = _python_cudf_operator_map[type(node.op)] + return Operation(op, left, right) + + def visit_BoolOp(self, node): + return functools.reduce( + functools.partial(Operation, ASTOperator.LOGICAL_AND), + ( + Operation( + _python_cudf_operator_map[type(node.op)], + self.visit(left), + self.visit(right), + ) + for left, right in zip( + node.values[:-1], node.values[1:], strict=True + ) + ) + ) + + def visit_Compare(self, node): + operands = [node.left, *node.comparators] + return functools.reduce( + functools.partial(Operation, ASTOperator.LOGICAL_AND), + ( + Operation( + _python_cudf_operator_map[type(op)], + self.visit(left), + self.visit(right), + ) + for op, left, right in zip( + node.ops, operands[:-1], operands[1:], strict=True + ) + ) + ) + + def visit_Call(self, node): + try: + op = _python_cudf_function_map[node.func.id] + except KeyError: + raise ValueError(f"Unsupported function {node.func}.") + # Assuming only unary functions are supported, which is checked above. + if len(node.args) != 1 or node.keywords: + raise ValueError( + f"Function {node.func} only accepts one positional " + "argument." + ) + return Operation(op, self.visit(node.args[0])) + + +@functools.lru_cache(256) +def to_expression(str expr, tuple column_names): + """ + Create an expression for `pylibcudf.transform.compute_column`. + + Parameters + ---------- + expr : str + The expression to evaluate. In (restricted) Python syntax. + column_names : tuple[str] + Ordered tuple of names. When calling `compute_column` on the resulting + expression, the provided table must have columns in the same order + as given here. + + Notes + ----- + This function keeps a small cache of recently used expressions. 
+ + Returns + ------- + Expression + Expression for the given expr and col_names + """ + visitor = ExpressionTransformer( + {name: ColumnReference(i) for i, name in enumerate(column_names)} + ) + return visitor.visit(ast.parse(expr)) diff --git a/python/pylibcudf/pylibcudf/filling.pxd b/python/pylibcudf/pylibcudf/filling.pxd index b9345f8cd42..56aef086e1b 100644 --- a/python/pylibcudf/pylibcudf/filling.pxd +++ b/python/pylibcudf/pylibcudf/filling.pxd @@ -33,3 +33,9 @@ cpdef Table repeat( Table input_table, ColumnOrSize count ) + +cpdef Column calendrical_month_sequence( + size_type n, + Scalar init, + size_type months, +) diff --git a/python/pylibcudf/pylibcudf/filling.pyi b/python/pylibcudf/pylibcudf/filling.pyi new file mode 100644 index 00000000000..0b5e29bdc32 --- /dev/null +++ b/python/pylibcudf/pylibcudf/filling.pyi @@ -0,0 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar +from pylibcudf.table import Table + +def fill( + destination: Column, begin: int, end: int, value: Scalar +) -> Column: ... +def fill_in_place( + destination: Column, begin: int, end: int, value: Scalar +) -> None: ... +def sequence(size: int, init: Scalar, step: Scalar) -> Column: ... +def repeat(input_table: Table, count: Column | int) -> Table: ... +def calendrical_month_sequence( + n: int, init: Scalar, months: int +) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/filling.pyx b/python/pylibcudf/pylibcudf/filling.pyx index 61b430e64aa..ea5b45ff7c2 100644 --- a/python/pylibcudf/pylibcudf/filling.pyx +++ b/python/pylibcudf/pylibcudf/filling.pyx @@ -9,6 +9,7 @@ from pylibcudf.libcudf.filling cimport ( fill_in_place as cpp_fill_in_place, repeat as cpp_repeat, sequence as cpp_sequence, + calendrical_month_sequence as cpp_calendrical_month_sequence ) from pylibcudf.libcudf.table.table cimport table from pylibcudf.libcudf.types cimport size_type @@ -18,6 +19,14 @@ from .scalar cimport Scalar from .table cimport Table +__all__ = [ + "fill", + "fill_in_place", + "repeat", + "sequence", + "calendrical_month_sequence", +] + cpdef Column fill( Column destination, size_type begin, @@ -48,13 +57,11 @@ cpdef Column fill( cdef unique_ptr[column] result with nogil: - result = move( - cpp_fill( - destination.view(), - begin, - end, - dereference(( value).c_obj) - ) + result = cpp_fill( + destination.view(), + begin, + end, + dereference(( value).c_obj) ) return Column.from_libcudf(move(result)) @@ -79,6 +86,10 @@ cpdef void fill_in_place( The index at which to stop filling. value : Scalar The value to fill with. 
+
+    Returns
+    -------
+    None
     """
 
     with nogil:
@@ -103,6 +114,7 @@ cpdef Column sequence(size_type size, Scalar init, Scalar step):
         The initial value of the sequence
     step : Scalar
         The step of the sequence
+
     Returns
     -------
     pylibcudf.Column
@@ -112,12 +124,10 @@ cpdef Column sequence(size_type size, Scalar init, Scalar step):
     cdef unique_ptr[column] result
     cdef size_type c_size = size
     with nogil:
-        result = move(
-            cpp_sequence(
-                c_size,
-                dereference(init.c_obj),
-                dereference(step.c_obj),
-            )
+        result = cpp_sequence(
+            c_size,
+            dereference(init.c_obj),
+            dereference(step.c_obj),
         )
     return Column.from_libcudf(move(result))
 
@@ -152,18 +162,50 @@ cpdef Table repeat(
     if ColumnOrSize is Column:
         with nogil:
-            result = move(
-                cpp_repeat(
-                    input_table.view(),
-                    count.view()
-                )
+            result = cpp_repeat(
+                input_table.view(),
+                count.view()
             )
     if ColumnOrSize is size_type:
        with nogil:
-            result = move(
-                cpp_repeat(
-                    input_table.view(),
-                    count
-                )
+            result = cpp_repeat(
+                input_table.view(),
+                count
            )
    return Table.from_libcudf(move(result))
+
+
+cpdef Column calendrical_month_sequence(
+    size_type n,
+    Scalar init,
+    size_type months,
+):
+
+    """Generate a sequence of timestamps separated by a fixed number of months.
+
+    For details, see :cpp:func:`calendrical_month_sequence`.
+
+    Parameters
+    ----------
+    n : size_type
+        Number of timestamps to generate
+    init : Scalar
+        The initial timestamp
+    months : size_type
+        Number of months to increment by
+
+    Returns
+    -------
+    pylibcudf.Column
+        Column of ``n`` timestamps, each ``months`` months apart
+    """
+
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = cpp_calendrical_month_sequence(
+            n,
+            dereference(init.c_obj),
+            months
+        )
+    return Column.from_libcudf(move(c_result))
diff --git a/python/pylibcudf/pylibcudf/gpumemoryview.pyi b/python/pylibcudf/pylibcudf/gpumemoryview.pyi
new file mode 100644
index 00000000000..50f1f39a515
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/gpumemoryview.pyi
@@ -0,0 +1,9 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from collections.abc import Mapping
+from typing import Any
+
+class gpumemoryview:
+    def __init__(self, data: Any): ...
+    @property
+    def __cuda_array_interface__(self) -> Mapping[str, Any]: ...
diff --git a/python/pylibcudf/pylibcudf/gpumemoryview.pyx b/python/pylibcudf/pylibcudf/gpumemoryview.pyx
index 0904022a944..41316eddb60 100644
--- a/python/pylibcudf/pylibcudf/gpumemoryview.pyx
+++ b/python/pylibcudf/pylibcudf/gpumemoryview.pyx
@@ -1,5 +1,6 @@
 # Copyright (c) 2023-2024, NVIDIA CORPORATION.
 
+__all__ = ["gpumemoryview"]
 
 cdef class gpumemoryview:
     """Minimal representation of a memory buffer.
@@ -25,3 +26,5 @@ cdef class gpumemoryview:
     @property
     def __cuda_array_interface__(self):
         return self.obj.__cuda_array_interface__
+
+    __hash__ = None
diff --git a/python/pylibcudf/pylibcudf/groupby.pyi b/python/pylibcudf/pylibcudf/groupby.pyi
new file mode 100644
index 00000000000..883ad6e34cf
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/groupby.pyi
@@ -0,0 +1,38 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.aggregation import Aggregation
+from pylibcudf.column import Column
+from pylibcudf.replace import ReplacePolicy
+from pylibcudf.scalar import Scalar
+from pylibcudf.table import Table
+from pylibcudf.types import NullOrder, NullPolicy, Order, Sorted
+
+class GroupByRequest:
+    def __init__(
+        self, values: Column, aggregations: list[Aggregation]
+    ) -> None: ...
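Tying together the expression machinery above: `to_expression` parses restricted Python syntax into a libcudf AST that `pylibcudf.transform.compute_column` evaluates. A hedged sketch; the sample data and pyarrow round-trip are illustrative only:

```python
import pyarrow as pa
import pylibcudf as plc

table = plc.interop.from_arrow(pa.table({"a": [1, 2, 3], "b": [10, 20, 30]}))

# Column order in the table must match the tuple passed to to_expression.
expr = plc.expressions.to_expression("a + b * 2", ("a", "b"))
result = plc.transform.compute_column(table, expr)
print(plc.interop.to_arrow(result))  # -> [21, 42, 63]
```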
+ +class GroupBy: + def __init__( + self, + keys: Table, + null_handling: NullPolicy = NullPolicy.EXCLUDE, + keys_are_sorted: Sorted = Sorted.NO, + column_order: list[Order] | None = None, + null_precedence: list[NullOrder] | None = None, + ) -> None: ... + def aggregate( + self, requests: list[GroupByRequest] + ) -> tuple[Table, list[Table]]: ... + def scan( + self, requests: list[GroupByRequest] + ) -> tuple[Table, list[Table]]: ... + def shift( + self, values: Table, offset: list[int], fill_values: list[Scalar] + ) -> tuple[Table, Table]: ... + def replace_nulls( + self, value: Table, replace_policies: list[ReplacePolicy] + ) -> tuple[Table, Table]: ... + def get_groups( + self, values: Table | None = None + ) -> tuple[list[int], Table, Table]: ... diff --git a/python/pylibcudf/pylibcudf/groupby.pyx b/python/pylibcudf/pylibcudf/groupby.pyx index afb95dba5b3..e6cb3ac81a7 100644 --- a/python/pylibcudf/pylibcudf/groupby.pyx +++ b/python/pylibcudf/pylibcudf/groupby.pyx @@ -25,6 +25,8 @@ from .types cimport null_order, null_policy, order, sorted from .utils cimport _as_vector +__all__ = ["GroupBy", "GroupByRequest"] + cdef class GroupByRequest: """A request for a groupby aggregation or scan. @@ -45,6 +47,8 @@ cdef class GroupByRequest: self._values = values self._aggregations = aggregations + __hash__ = None + cdef aggregation_request _to_libcudf_agg_request(self) except *: """Convert to a libcudf aggregation_request object. @@ -127,6 +131,8 @@ cdef class GroupBy: # deallocated from under us: self._keys = keys + __hash__ = None + @staticmethod cdef tuple _parse_outputs( pair[unique_ptr[table], vector[aggregation_result]] c_res @@ -176,7 +182,7 @@ cdef class GroupBy: # We rely on libcudf to tell us this rather than checking the types beforehand # ourselves. with nogil: - c_res = move(dereference(self.c_obj).aggregate(c_requests)) + c_res = dereference(self.c_obj).aggregate(c_requests) return GroupBy._parse_outputs(move(c_res)) cpdef tuple scan(self, list requests): @@ -205,7 +211,7 @@ cdef class GroupBy: cdef pair[unique_ptr[table], vector[aggregation_result]] c_res with nogil: - c_res = move(dereference(self.c_obj).scan(c_requests)) + c_res = dereference(self.c_obj).scan(c_requests) return GroupBy._parse_outputs(move(c_res)) cpdef tuple shift(self, Table values, list offset, list fill_values): @@ -234,10 +240,11 @@ cdef class GroupBy: cdef vector[size_type] c_offset = offset cdef pair[unique_ptr[table], unique_ptr[table]] c_res with nogil: - c_res = move( - dereference(self.c_obj).shift(values.view(), c_offset, c_fill_values) + c_res = dereference(self.c_obj).shift( + values.view(), + c_offset, + c_fill_values ) - return ( Table.from_libcudf(move(c_res.first)), Table.from_libcudf(move(c_res.second)), @@ -264,10 +271,10 @@ cdef class GroupBy: cdef pair[unique_ptr[table], unique_ptr[table]] c_res cdef vector[replace_policy] c_replace_policies = replace_policies with nogil: - c_res = move( - dereference(self.c_obj).replace_nulls(value.view(), c_replace_policies) + c_res = dereference(self.c_obj).replace_nulls( + value.view(), + c_replace_policies ) - return ( Table.from_libcudf(move(c_res.first)), Table.from_libcudf(move(c_res.second)), diff --git a/python/pylibcudf/pylibcudf/hashing.pxd b/python/pylibcudf/pylibcudf/hashing.pxd new file mode 100644 index 00000000000..2d070ddda69 --- /dev/null +++ b/python/pylibcudf/pylibcudf/hashing.pxd @@ -0,0 +1,30 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
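The `GroupByRequest`/`GroupBy` pairing in the stubs above is a two-step protocol: build one request per value column, then hand the batch to `aggregate`. A sketch, assuming `plc.aggregation.sum()` is the appropriate factory for a sum aggregation:

```python
import pyarrow as pa
import pylibcudf as plc

tbl = plc.interop.from_arrow(pa.table({"key": [1, 1, 2], "val": [10, 20, 30]}))
keys, vals = tbl.columns()

gb = plc.groupby.GroupBy(plc.Table([keys]))
request = plc.groupby.GroupByRequest(vals, [plc.aggregation.sum()])
unique_keys, results = gb.aggregate([request])
# unique_keys is a Table of group keys; results holds one result Table
# per request, per the tuple[Table, list[Table]] signature above.
```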
+ +from libc.stdint cimport uint32_t, uint64_t + +from .column cimport Column +from .table cimport Table + + +cpdef Column murmurhash3_x86_32( + Table input, + uint32_t seed=* +) + +cpdef Table murmurhash3_x64_128( + Table input, + uint64_t seed=* +) + + +cpdef Column xxhash_64( + Table input, + uint64_t seed=* +) + +cpdef Column md5(Table input) +cpdef Column sha1(Table input) +cpdef Column sha224(Table input) +cpdef Column sha256(Table input) +cpdef Column sha384(Table input) +cpdef Column sha512(Table input) diff --git a/python/pylibcudf/pylibcudf/hashing.pyi b/python/pylibcudf/pylibcudf/hashing.pyi new file mode 100644 index 00000000000..a849f5d0729 --- /dev/null +++ b/python/pylibcudf/pylibcudf/hashing.pyi @@ -0,0 +1,18 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from typing import Final + +from pylibcudf.column import Column +from pylibcudf.table import Table + +LIBCUDF_DEFAULT_HASH_SEED: Final[int] + +def murmurhash3_x86_32(input: Table, seed: int = ...) -> Column: ... +def murmurhash3_x64_128(input: Table, seed: int = ...) -> Table: ... +def xxhash_64(input: Table, seed: int = ...) -> Column: ... +def md5(input: Table) -> Column: ... +def sha1(input: Table) -> Column: ... +def sha224(input: Table) -> Column: ... +def sha256(input: Table) -> Column: ... +def sha384(input: Table) -> Column: ... +def sha512(input: Table) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/hashing.pyx b/python/pylibcudf/pylibcudf/hashing.pyx new file mode 100644 index 00000000000..548cffc0ce8 --- /dev/null +++ b/python/pylibcudf/pylibcudf/hashing.pyx @@ -0,0 +1,253 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from libc.stdint cimport uint32_t, uint64_t +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.hash cimport ( + DEFAULT_HASH_SEED, + md5 as cpp_md5, + murmurhash3_x64_128 as cpp_murmurhash3_x64_128, + murmurhash3_x86_32 as cpp_murmurhash3_x86_32, + sha1 as cpp_sha1, + sha224 as cpp_sha224, + sha256 as cpp_sha256, + sha384 as cpp_sha384, + sha512 as cpp_sha512, + xxhash_64 as cpp_xxhash_64, +) +from pylibcudf.libcudf.table.table cimport table + +from .column cimport Column +from .table cimport Table + +__all__ = [ + "LIBCUDF_DEFAULT_HASH_SEED", + "md5", + "murmurhash3_x64_128", + "murmurhash3_x86_32", + "sha1", + "sha224", + "sha256", + "sha384", + "sha512", + "xxhash_64", +] + +LIBCUDF_DEFAULT_HASH_SEED = DEFAULT_HASH_SEED + +cpdef Column murmurhash3_x86_32( + Table input, + uint32_t seed=DEFAULT_HASH_SEED +): + """Computes the MurmurHash3 32-bit hash value of each row in the given table. + + For details, see :cpp:func:`murmurhash3_x86_32`. + + Parameters + ---------- + input : Table + The table of columns to hash + seed : uint32_t + Optional seed value to use for the hash function + + Returns + ------- + pylibcudf.Column + A column where each row is the hash of a row from the input + """ + cdef unique_ptr[column] c_result + with nogil: + c_result = cpp_murmurhash3_x86_32( + input.view(), + seed + ) + + return Column.from_libcudf(move(c_result)) + + +cpdef Table murmurhash3_x64_128( + Table input, + uint64_t seed=DEFAULT_HASH_SEED +): + """Computes the MurmurHash3 64-bit hash value of each row in the given table. + + For details, see :cpp:func:`murmurhash3_x64_128`. 
+ + Parameters + ---------- + input : Table + The table of columns to hash + seed : uint64_t + Optional seed value to use for the hash function + + Returns + ------- + pylibcudf.Table + A table of two UINT64 columns + """ + cdef unique_ptr[table] c_result + with nogil: + c_result = cpp_murmurhash3_x64_128( + input.view(), + seed + ) + + return Table.from_libcudf(move(c_result)) + + +cpdef Column xxhash_64( + Table input, + uint64_t seed=DEFAULT_HASH_SEED +): + """Computes the xxHash 64-bit hash value of each row in the given table. + + For details, see :cpp:func:`xxhash_64`. + + Parameters + ---------- + input : Table + The table of columns to hash + seed : uint64_t + Optional seed value to use for the hash function + + Returns + ------- + pylibcudf.Column + A column where each row is the hash of a row from the input + """ + + cdef unique_ptr[column] c_result + with nogil: + c_result = cpp_xxhash_64( + input.view(), + seed + ) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column md5(Table input): + """Computes the MD5 hash value of each row in the given table. + + For details, see :cpp:func:`md5`. + + Parameters + ---------- + input : Table + The table of columns to hash + + Returns + ------- + pylibcudf.Column + A column where each row is the md5 hash of a row from the input + + """ + + cdef unique_ptr[column] c_result + with nogil: + c_result = cpp_md5(input.view()) + return Column.from_libcudf(move(c_result)) + +cpdef Column sha1(Table input): + """Computes the SHA-1 hash value of each row in the given table. + + For details, see :cpp:func:`sha1`. + + Parameters + ---------- + input : Table + The table of columns to hash + + Returns + ------- + pylibcudf.Column + A column where each row is the hash of a row from the input + """ + cdef unique_ptr[column] c_result + with nogil: + c_result = cpp_sha1(input.view()) + return Column.from_libcudf(move(c_result)) + + +cpdef Column sha224(Table input): + """Computes the SHA-224 hash value of each row in the given table. + + For details, see :cpp:func:`sha224`. + + Parameters + ---------- + input : Table + The table of columns to hash + + Returns + ------- + pylibcudf.Column + A column where each row is the hash of a row from the input + """ + cdef unique_ptr[column] c_result + with nogil: + c_result = cpp_sha224(input.view()) + return Column.from_libcudf(move(c_result)) + + +cpdef Column sha256(Table input): + """Computes the SHA-256 hash value of each row in the given table. + + For details, see :cpp:func:`sha256`. + + Parameters + ---------- + input : Table + The table of columns to hash + + Returns + ------- + pylibcudf.Column + A column where each row is the hash of a row from the input + """ + cdef unique_ptr[column] c_result + with nogil: + c_result = cpp_sha256(input.view()) + return Column.from_libcudf(move(c_result)) + + +cpdef Column sha384(Table input): + """Computes the SHA-384 hash value of each row in the given table. + + For details, see :cpp:func:`sha384`. + + Parameters + ---------- + input : Table + The table of columns to hash + + Returns + ------- + pylibcudf.Column + A column where each row is the hash of a row from the input + """ + cdef unique_ptr[column] c_result + with nogil: + c_result = cpp_sha384(input.view()) + return Column.from_libcudf(move(c_result)) + + +cpdef Column sha512(Table input): + """Computes the SHA-512 hash value of each row in the given table. + + For details, see :cpp:func:`sha512`. 
+ + Parameters + ---------- + input : Table + The table of columns to hash + + Returns + ------- + pylibcudf.Column + A column where each row is the hash of a row from the input + """ + cdef unique_ptr[column] c_result + with nogil: + c_result = cpp_sha512(input.view()) + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/interop.pxd b/python/pylibcudf/pylibcudf/interop.pxd new file mode 100644 index 00000000000..2a0a8c15fdd --- /dev/null +++ b/python/pylibcudf/pylibcudf/interop.pxd @@ -0,0 +1,8 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.table cimport Table + + +cpdef Table from_dlpack(object managed_tensor) + +cpdef object to_dlpack(Table input) diff --git a/python/pylibcudf/pylibcudf/interop.pyi b/python/pylibcudf/pylibcudf/interop.pyi new file mode 100644 index 00000000000..63de816010b --- /dev/null +++ b/python/pylibcudf/pylibcudf/interop.pyi @@ -0,0 +1,52 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from collections.abc import Iterable, Mapping +from dataclasses import dataclass +from typing import Any, overload + +import pyarrow as pa + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar +from pylibcudf.table import Table +from pylibcudf.types import DataType + +@dataclass +class ColumnMetadata: + name: str = ... + children_meta: list[ColumnMetadata] = ... + +@overload +def from_arrow(obj: pa.DataType) -> DataType: ... +@overload +def from_arrow( + obj: pa.Scalar[Any], *, data_type: DataType | None = None +) -> Scalar: ... +@overload +def from_arrow(obj: pa.Array[Any]) -> Column: ... +@overload +def from_arrow(obj: pa.Table) -> Table: ... +@overload +def to_arrow( + obj: DataType, + *, + precision: int | None = None, + fields: Iterable[pa.Field[pa.DataType] | tuple[str, pa.DataType]] + | Mapping[str, pa.DataType] + | None = None, + value_type: pa.DataType | None = None, +) -> pa.DataType: ... +@overload +def to_arrow( + obj: Table, metadata: list[ColumnMetadata | str] | None = None +) -> pa.Table: ... +@overload +def to_arrow( + obj: Column, metadata: ColumnMetadata | str | None = None +) -> pa.Array[Any]: ... +@overload +def to_arrow( + obj: Scalar, metadata: ColumnMetadata | str | None = None +) -> pa.Scalar[Any]: ... +def from_dlpack(managed_tensor: Any) -> Table: ... +def to_dlpack(input: Table) -> Any: ... diff --git a/python/pylibcudf/pylibcudf/interop.pyx b/python/pylibcudf/pylibcudf/interop.pyx index 1a03fa5b45b..bd5397ac328 100644 --- a/python/pylibcudf/pylibcudf/interop.pyx +++ b/python/pylibcudf/pylibcudf/interop.pyx @@ -1,6 +1,11 @@ # Copyright (c) 2023-2024, NVIDIA CORPORATION. 
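The new `pylibcudf.hashing` module above is row-oriented: every function takes a `Table` and emits one hash per row. A minimal sketch, using pyarrow interop for the input:

```python
import pyarrow as pa
import pylibcudf as plc

tbl = plc.interop.from_arrow(pa.table({"s": ["foo", "bar"]}))

# The seed defaults to LIBCUDF_DEFAULT_HASH_SEED; one hash row per input row.
murmur = plc.hashing.murmurhash3_x86_32(tbl)
sha = plc.hashing.sha256(tbl)                 # hex-digest strings
wide = plc.hashing.murmurhash3_x64_128(tbl)   # Table of two UINT64 columns
```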
-from cpython.pycapsule cimport PyCapsule_GetPointer, PyCapsule_New +from cpython.pycapsule cimport ( + PyCapsule_GetPointer, + PyCapsule_IsValid, + PyCapsule_New, + PyCapsule_SetName, +) from libc.stdlib cimport free from libcpp.memory cimport unique_ptr from libcpp.utility cimport move @@ -16,11 +21,14 @@ from pylibcudf.libcudf.interop cimport ( ArrowArray, ArrowArrayStream, ArrowSchema, + DLManagedTensor, column_metadata, from_arrow_column as cpp_from_arrow_column, from_arrow_stream as cpp_from_arrow_stream, + from_dlpack as cpp_from_dlpack, to_arrow_host_raw, to_arrow_schema_raw, + to_dlpack as cpp_to_dlpack, ) from pylibcudf.libcudf.table.table cimport table @@ -30,6 +38,14 @@ from .scalar cimport Scalar from .table cimport Table from .types cimport DataType, type_id +__all__ = [ + "ColumnMetadata", + "from_arrow", + "from_dlpack", + "to_arrow", + "to_dlpack", +] + ARROW_TO_PYLIBCUDF_TYPES = { pa.int8(): type_id.INT8, pa.int16(): type_id.INT16, @@ -131,7 +147,7 @@ def _from_arrow_table(pyarrow_object, *, DataType data_type=None): cdef unique_ptr[table] c_result with nogil: # The libcudf function here will release the stream. - c_result = move(cpp_from_arrow_stream(c_stream)) + c_result = cpp_from_arrow_stream(c_stream) return Table.from_libcudf(move(c_result)) @@ -166,7 +182,7 @@ def _from_arrow_column(pyarrow_object, *, DataType data_type=None): cdef unique_ptr[column] c_result with nogil: - c_result = move(cpp_from_arrow_column(c_schema, c_array)) + c_result = cpp_from_arrow_column(c_schema, c_array) # The capsule destructors should release automatically for us, but we # choose to do it explicitly here for clarity. @@ -315,3 +331,87 @@ def _to_arrow_scalar(cudf_object, metadata=None): # Note that metadata for scalars is primarily important for preserving # information on nested types since names are otherwise irrelevant. return to_arrow(Column.from_scalar(cudf_object, 1), metadata=metadata)[0] + + +cpdef Table from_dlpack(object managed_tensor): + """ + Convert a DLPack DLTensor into a cudf table. + + For details, see :cpp:func:`cudf::from_dlpack` + + Parameters + ---------- + managed_tensor : PyCapsule + A 1D or 2D column-major (Fortran order) tensor. + + Returns + ------- + Table + Table with a copy of the tensor data. + """ + if not PyCapsule_IsValid(managed_tensor, "dltensor"): + raise ValueError("Invalid PyCapsule object") + cdef unique_ptr[table] c_result + cdef DLManagedTensor* dlpack_tensor = PyCapsule_GetPointer( + managed_tensor, "dltensor" + ) + if dlpack_tensor is NULL: + raise ValueError("PyCapsule object contained a NULL pointer") + PyCapsule_SetName(managed_tensor, "used_dltensor") + + # Note: A copy is always performed when converting the dlpack + # data to a libcudf table. We also delete the dlpack_tensor pointer + # as the pointer is not deleted by libcudf's from_dlpack function. + # TODO: https://github.com/rapidsai/cudf/issues/10874 + # TODO: https://github.com/rapidsai/cudf/issues/10849 + with nogil: + c_result = cpp_from_dlpack(dlpack_tensor) + + cdef Table result = Table.from_libcudf(move(c_result)) + dlpack_tensor.deleter(dlpack_tensor) + return result + + +cpdef object to_dlpack(Table input): + """ + Convert a cudf table into a DLPack DLTensor. + + For details, see :cpp:func:`cudf::to_dlpack` + + Parameters + ---------- + input : Table + A 1D or 2D column-major (Fortran order) tensor. + + Returns + ------- + PyCapsule + 1D or 2D DLPack tensor with a copy of the table data, or nullptr. 
+ """ + for col in input._columns: + if col.null_count(): + raise ValueError( + "Cannot create a DLPack tensor with null values. " + "Input is required to have null count as zero." + ) + cdef DLManagedTensor *dlpack_tensor + + with nogil: + dlpack_tensor = cpp_to_dlpack(input.view()) + + return PyCapsule_New( + dlpack_tensor, + "dltensor", + dlmanaged_tensor_pycapsule_deleter + ) + + +cdef void dlmanaged_tensor_pycapsule_deleter(object pycap_obj) noexcept: + if PyCapsule_IsValid(pycap_obj, "used_dltensor"): + # we do not call a used capsule's deleter + return + cdef DLManagedTensor* dlpack_tensor = PyCapsule_GetPointer( + pycap_obj, "dltensor" + ) + if dlpack_tensor is not NULL: + dlpack_tensor.deleter(dlpack_tensor) diff --git a/python/pylibcudf/pylibcudf/io/CMakeLists.txt b/python/pylibcudf/pylibcudf/io/CMakeLists.txt index 965724a47b1..664faef718f 100644 --- a/python/pylibcudf/pylibcudf/io/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/io/CMakeLists.txt @@ -12,8 +12,8 @@ # the License. # ============================================================================= -set(cython_sources avro.pyx csv.pyx datasource.pyx json.pyx orc.pyx parquet.pyx timezone.pyx - types.pyx +set(cython_sources avro.pyx csv.pyx datasource.pyx json.pyx orc.pyx parquet.pyx + parquet_metadata.pyx text.pyx timezone.pyx types.pyx ) set(linked_libraries cudf::cudf) diff --git a/python/pylibcudf/pylibcudf/io/__init__.pxd b/python/pylibcudf/pylibcudf/io/__init__.pxd index 1bcc0a3f963..663804e714d 100644 --- a/python/pylibcudf/pylibcudf/io/__init__.pxd +++ b/python/pylibcudf/pylibcudf/io/__init__.pxd @@ -1,5 +1,15 @@ # Copyright (c) 2024, NVIDIA CORPORATION. # CSV is removed since it is def not cpdef (to force kw-only arguments) -from . cimport avro, datasource, json, orc, parquet, timezone, types +from . cimport ( + avro, + datasource, + json, + orc, + parquet, + parquet_metadata, + text, + timezone, + types, +) from .types cimport SourceInfo, TableWithMetadata diff --git a/python/pylibcudf/pylibcudf/io/__init__.py b/python/pylibcudf/pylibcudf/io/__init__.py index 2e4f215b12c..f913a400684 100644 --- a/python/pylibcudf/pylibcudf/io/__init__.py +++ b/python/pylibcudf/pylibcudf/io/__init__.py @@ -1,4 +1,31 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from . import avro, csv, datasource, json, orc, parquet, timezone, types +from . import ( + avro, + csv, + datasource, + json, + orc, + parquet, + parquet_metadata, + text, + timezone, + types, +) from .types import SinkInfo, SourceInfo, TableWithMetadata + +__all__ = [ + "SinkInfo", + "SourceInfo", + "TableWithMetadata", + "avro", + "csv", + "datasource", + "json", + "orc", + "parquet", + "parquet_metadata", + "text", + "timezone", + "types", +] diff --git a/python/pylibcudf/pylibcudf/io/avro.pyi b/python/pylibcudf/pylibcudf/io/avro.pyi new file mode 100644 index 00000000000..49c2f083702 --- /dev/null +++ b/python/pylibcudf/pylibcudf/io/avro.pyi @@ -0,0 +1,11 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from pylibcudf.io.types import SourceInfo, TableWithMetadata + +__all__ = ["read_avro"] + +def read_avro( + source_info: SourceInfo, + columns: list[str] | None = None, + skip_rows: int = 0, + num_rows: int = -1, +) -> TableWithMetadata: ... 
diff --git a/python/pylibcudf/pylibcudf/io/avro.pyx b/python/pylibcudf/pylibcudf/io/avro.pyx index 438b0ff1634..4271333511a 100644 --- a/python/pylibcudf/pylibcudf/io/avro.pyx +++ b/python/pylibcudf/pylibcudf/io/avro.pyx @@ -10,6 +10,8 @@ from pylibcudf.libcudf.io.avro cimport ( ) from pylibcudf.libcudf.types cimport size_type +__all__ = ["read_avro"] + cpdef TableWithMetadata read_avro( SourceInfo source_info, @@ -45,7 +47,7 @@ cpdef TableWithMetadata read_avro( for col in columns: c_columns.push_back(str(col).encode()) - cdef avro_reader_options avro_opts = move( + cdef avro_reader_options avro_opts = ( avro_reader_options.builder(source_info.c_obj) .columns(c_columns) .skip_rows(skip_rows) diff --git a/python/pylibcudf/pylibcudf/io/csv.pxd b/python/pylibcudf/pylibcudf/io/csv.pxd new file mode 100644 index 00000000000..f04edaa316a --- /dev/null +++ b/python/pylibcudf/pylibcudf/io/csv.pxd @@ -0,0 +1,35 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.vector cimport vector +from libcpp.string cimport string +from libcpp cimport bool +from pylibcudf.libcudf.io.csv cimport ( + csv_writer_options, + csv_writer_options_builder, +) +from pylibcudf.libcudf.io.types cimport quote_style +from pylibcudf.io.types cimport SinkInfo +from pylibcudf.table cimport Table + +cdef class CsvWriterOptions: + cdef csv_writer_options c_obj + cdef Table table + cdef SinkInfo sink + + +cdef class CsvWriterOptionsBuilder: + cdef csv_writer_options_builder c_obj + cdef Table table + cdef SinkInfo sink + cpdef CsvWriterOptionsBuilder names(self, list names) + cpdef CsvWriterOptionsBuilder na_rep(self, str val) + cpdef CsvWriterOptionsBuilder include_header(self, bool val) + cpdef CsvWriterOptionsBuilder rows_per_chunk(self, int val) + cpdef CsvWriterOptionsBuilder line_terminator(self, str term) + cpdef CsvWriterOptionsBuilder inter_column_delimiter(self, str delim) + cpdef CsvWriterOptionsBuilder true_value(self, str val) + cpdef CsvWriterOptionsBuilder false_value(self, str val) + cpdef CsvWriterOptions build(self) + + +cpdef void write_csv(CsvWriterOptions options) diff --git a/python/pylibcudf/pylibcudf/io/csv.pyi b/python/pylibcudf/pylibcudf/io/csv.pyi new file mode 100644 index 00000000000..583b66bc29c --- /dev/null +++ b/python/pylibcudf/pylibcudf/io/csv.pyi @@ -0,0 +1,76 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
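A quick sketch of the `read_avro` entry point above; `example.avro` is a placeholder path, and the `tbl` accessor on `TableWithMetadata` is assumed to be unchanged:

```python
import pylibcudf as plc

result = plc.io.avro.read_avro(
    plc.io.SourceInfo(["example.avro"]),  # placeholder path
    columns=["a", "b"],
    num_rows=100,  # -1 (the default) reads all rows
)
table = result.tbl
```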
+
+from collections.abc import Mapping
+
+from pylibcudf.io.types import (
+    CompressionType,
+    QuoteStyle,
+    SinkInfo,
+    SourceInfo,
+    TableWithMetadata,
+)
+from pylibcudf.table import Table
+from pylibcudf.types import DataType
+
+def read_csv(
+    source_info: SourceInfo,
+    *,
+    compression: CompressionType = CompressionType.AUTO,
+    byte_range_offset: int = 0,
+    byte_range_size: int = 0,
+    col_names: list[str] | None = None,
+    prefix: str = "",
+    mangle_dupe_cols: bool = True,
+    usecols: list[int] | list[str] | None = None,
+    nrows: int = -1,
+    skiprows: int = 0,
+    skipfooter: int = 0,
+    header: int = 0,
+    lineterminator: str = "\n",
+    delimiter: str | None = None,
+    thousands: str | None = None,
+    decimal: str = ".",
+    comment: str | None = None,
+    delim_whitespace: bool = False,
+    skipinitialspace: bool = False,
+    skip_blank_lines: bool = True,
+    quoting: QuoteStyle = QuoteStyle.MINIMAL,
+    quotechar: str = '"',
+    doublequote: bool = True,
+    parse_dates: list[str] | list[int] | None = None,
+    parse_hex: list[str] | list[int] | None = None,
+    # Technically this should be dict/list
+    # but using a fused type prevents using None as default
+    dtypes: Mapping[str, DataType] | list[DataType] | None = None,
+    true_values: list[str] | None = None,
+    false_values: list[str] | None = None,
+    na_values: list[str] | None = None,
+    keep_default_na: bool = True,
+    na_filter: bool = True,
+    dayfirst: bool = False,
+    # Note: These options are supported by the libcudf reader
+    # but are not exposed here since there is no demand for them
+    # on the Python side yet.
+    # detect_whitespace_around_quotes: bool = False,
+    # timestamp_type: DataType = DataType(type_id.EMPTY),
+) -> TableWithMetadata: ...
+def write_csv(options: CsvWriterOptions) -> None: ...
+
+class CsvWriterOptions:
+    def __init__(self): ...
+    @staticmethod
+    def builder(sink: SinkInfo, table: Table) -> CsvWriterOptionsBuilder: ...
+
+class CsvWriterOptionsBuilder:
+    def __init__(self): ...
+    def names(self, names: list) -> CsvWriterOptionsBuilder: ...
+    def na_rep(self, val: str) -> CsvWriterOptionsBuilder: ...
+    def include_header(self, val: bool) -> CsvWriterOptionsBuilder: ...
+    def rows_per_chunk(self, val: int) -> CsvWriterOptionsBuilder: ...
+    def line_terminator(self, term: str) -> CsvWriterOptionsBuilder: ...
+    def inter_column_delimiter(
+        self, delim: str
+    ) -> CsvWriterOptionsBuilder: ...
+    def true_value(self, val: str) -> CsvWriterOptionsBuilder: ...
+    def false_value(self, val: str) -> CsvWriterOptionsBuilder: ...
+    def build(self) -> CsvWriterOptions: ...
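Most of the keyword-only knobs in the `read_csv` stub above carry pandas-style names. A hedged sketch with a placeholder path:

```python
import pylibcudf as plc

result = plc.io.csv.read_csv(
    plc.io.SourceInfo(["data.csv"]),  # placeholder path
    delimiter=",",
    header=0,
    na_values=["NA", "null"],
)
table = result.tbl
```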
diff --git a/python/pylibcudf/pylibcudf/io/csv.pyx b/python/pylibcudf/pylibcudf/io/csv.pyx
index b53d6771cd6..8be391de2c2 100644
--- a/python/pylibcudf/pylibcudf/io/csv.pyx
+++ b/python/pylibcudf/pylibcudf/io/csv.pyx
@@ -2,14 +2,18 @@
 
 from libcpp cimport bool
 from libcpp.map cimport map
+
 from libcpp.string cimport string
 from libcpp.utility cimport move
 from libcpp.vector cimport vector
-from pylibcudf.io.types cimport SourceInfo, TableWithMetadata
+from pylibcudf.io.types cimport SourceInfo, SinkInfo, TableWithMetadata
 from pylibcudf.libcudf.io.csv cimport (
     csv_reader_options,
+    csv_writer_options,
     read_csv as cpp_read_csv,
+    write_csv as cpp_write_csv,
 )
+
 from pylibcudf.libcudf.io.types cimport (
     compression_type,
     quote_style,
@@ -17,7 +21,14 @@ from pylibcudf.libcudf.io.types cimport (
 )
 from pylibcudf.libcudf.types cimport data_type, size_type
 from pylibcudf.types cimport DataType
+from pylibcudf.table cimport Table
 
+__all__ = [
+    "read_csv",
+    "write_csv",
+    "CsvWriterOptions",
+    "CsvWriterOptionsBuilder",
+]
 
 cdef tuple _process_parse_dates_hex(list cols):
     cdef vector[string] str_cols
@@ -80,6 +91,8 @@ def read_csv(
 ):
     """Reads a CSV file into a :py:class:`~.types.TableWithMetadata`.
 
+    For details, see :cpp:func:`read_csv`.
+
     Parameters
     ----------
     source_info : SourceInfo
@@ -168,7 +181,7 @@ def read_csv(
     cdef vector[data_type] c_dtypes_list
     cdef map[string, data_type] c_dtypes_map
 
-    cdef csv_reader_options options = move(
+    cdef csv_reader_options options = (
         csv_reader_options.builder(source_info.c_obj)
         .compression(compression)
         .mangle_dupe_cols(mangle_dupe_cols)
@@ -261,3 +274,202 @@ def read_csv(
         c_result = move(cpp_read_csv(options))
 
     return TableWithMetadata.from_libcudf(c_result)
+
+
+# TODO: Implement the remaining methods
+cdef class CsvWriterOptions:
+    """The settings to use for ``write_csv``
+
+    For details, see :cpp:class:`cudf::io::csv_writer_options`
+    """
+    @staticmethod
+    def builder(SinkInfo sink, Table table):
+        """Create a CsvWriterOptionsBuilder object
+
+        For details, see :cpp:func:`cudf::io::csv_writer_options::builder`
+
+        Parameters
+        ----------
+        sink : SinkInfo
+            The sink used for writer output
+        table : Table
+            Table to be written to output
+
+        Returns
+        -------
+        CsvWriterOptionsBuilder
+            Builder to build CsvWriterOptions
+        """
+        cdef CsvWriterOptionsBuilder csv_builder = CsvWriterOptionsBuilder.__new__(
+            CsvWriterOptionsBuilder
+        )
+        csv_builder.c_obj = csv_writer_options.builder(sink.c_obj, table.view())
+        csv_builder.table = table
+        csv_builder.sink = sink
+        return csv_builder
+
+
+# TODO: Implement the remaining methods
+cdef class CsvWriterOptionsBuilder:
+    """Builder to build options for ``write_csv``
+
+    For details, see :cpp:class:`cudf::io::csv_writer_options_builder`
+    """
+    cpdef CsvWriterOptionsBuilder names(self, list names):
+        """Sets optional column names.
+
+        Parameters
+        ----------
+        names : list[str]
+            Column names
+
+        Returns
+        -------
+        CsvWriterOptionsBuilder
+            Builder to build CsvWriterOptions
+        """
+        self.c_obj.names([name.encode() for name in names])
+        return self
+
+    cpdef CsvWriterOptionsBuilder na_rep(self, str val):
+        """Sets the string to be used for null entries.
+
+        Parameters
+        ----------
+        val : str
+            String to represent null value
+
+        Returns
+        -------
+        CsvWriterOptionsBuilder
+            Builder to build CsvWriterOptions
+        """
+        self.c_obj.na_rep(val.encode())
+        return self
+
+    cpdef CsvWriterOptionsBuilder include_header(self, bool val):
+        """Enables/disables headers being written to CSV.
+ + Parameters + ---------- + val : bool + Boolean value to enable/disable + + Returns + ------- + CsvWriterOptionsBuilder + Builder to build CsvWriterOptions + """ + self.c_obj.include_header(val) + return self + + cpdef CsvWriterOptionsBuilder rows_per_chunk(self, int val): + """Sets maximum number of rows to process for each file write. + + Parameters + ---------- + val : int + Number of rows per chunk + + Returns + ------- + CsvWriterOptionsBuilder + Builder to build CsvWriterOptions + """ + self.c_obj.rows_per_chunk(val) + return self + + cpdef CsvWriterOptionsBuilder line_terminator(self, str term): + """Sets character used for separating lines. + + Parameters + ---------- + term : str + Character to represent line termination + + Returns + ------- + CsvWriterOptionsBuilder + Builder to build CsvWriterOptions + """ + self.c_obj.line_terminator(term.encode()) + return self + + cpdef CsvWriterOptionsBuilder inter_column_delimiter(self, str delim): + """Sets character used for separating column values. + + Parameters + ---------- + delim : str + Character to delimit column values + + Returns + ------- + CsvWriterOptionsBuilder + Builder to build CsvWriterOptions + """ + self.c_obj.inter_column_delimiter(ord(delim)) + return self + + cpdef CsvWriterOptionsBuilder true_value(self, str val): + """Sets string used for values != 0 + + Parameters + ---------- + val : str + String to represent values != 0 + + Returns + ------- + CsvWriterOptionsBuilder + Builder to build CsvWriterOptions + """ + self.c_obj.true_value(val.encode()) + return self + + cpdef CsvWriterOptionsBuilder false_value(self, str val): + """Sets string used for values == 0 + + Parameters + ---------- + val : str + String to represent values == 0 + + Returns + ------- + CsvWriterOptionsBuilder + Builder to build CsvWriterOptions + """ + self.c_obj.false_value(val.encode()) + return self + + cpdef CsvWriterOptions build(self): + """Create a CsvWriterOptions object""" + cdef CsvWriterOptions csv_options = CsvWriterOptions.__new__( + CsvWriterOptions + ) + csv_options.c_obj = move(self.c_obj.build()) + csv_options.table = self.table + csv_options.sink = self.sink + return csv_options + + +cpdef void write_csv( + CsvWriterOptions options +): + """ + Write to CSV format. + + The table to write, output paths, and options are encapsulated + by the `options` object. + + For details, see :cpp:func:`write_csv`. + + Parameters + ---------- + options: CsvWriterOptions + Settings for controlling writing behavior + """ + + with nogil: + cpp_write_csv(move(options.c_obj)) diff --git a/python/pylibcudf/pylibcudf/io/datasource.pyi b/python/pylibcudf/pylibcudf/io/datasource.pyi new file mode 100644 index 00000000000..e52197f793b --- /dev/null +++ b/python/pylibcudf/pylibcudf/io/datasource.pyi @@ -0,0 +1,4 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +class Datasource: + def __init__(self): ... 
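End to end, the writer options builder above chains like this; the output path is a placeholder and the input table comes from pyarrow for brevity:

```python
import pyarrow as pa
import pylibcudf as plc

tbl = plc.interop.from_arrow(pa.table({"a": [1, 2], "b": ["x", "y"]}))

options = (
    plc.io.csv.CsvWriterOptions.builder(plc.io.SinkInfo(["out.csv"]), tbl)
    .names(["a", "b"])
    .na_rep("NA")
    .include_header(True)
    .build()
)
plc.io.csv.write_csv(options)
```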
diff --git a/python/pylibcudf/pylibcudf/io/datasource.pyx b/python/pylibcudf/pylibcudf/io/datasource.pyx index 02418444caa..aac1c0d1014 100644 --- a/python/pylibcudf/pylibcudf/io/datasource.pyx +++ b/python/pylibcudf/pylibcudf/io/datasource.pyx @@ -2,8 +2,10 @@ from pylibcudf.libcudf.io.datasource cimport datasource +__all__ = ["Datasource"] cdef class Datasource: + __hash__ = None cdef datasource* get_datasource(self) except * nogil: with gil: raise NotImplementedError("get_datasource() should not " diff --git a/python/pylibcudf/pylibcudf/io/json.pyi b/python/pylibcudf/pylibcudf/io/json.pyi new file mode 100644 index 00000000000..b2bc6a43700 --- /dev/null +++ b/python/pylibcudf/pylibcudf/io/json.pyi @@ -0,0 +1,50 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from collections.abc import Mapping +from typing import TypeAlias + +from pylibcudf.column import Column +from pylibcudf.io.types import ( + CompressionType, + JSONRecoveryMode, + SinkInfo, + SourceInfo, + TableWithMetadata, +) +from pylibcudf.types import DataType + +ChildNameToTypeMap: TypeAlias = Mapping[str, ChildNameToTypeMap] + +NameAndType: TypeAlias = tuple[str, DataType, list[NameAndType]] + +def read_json( + source_info: SourceInfo, + dtypes: list[NameAndType] | None = None, + compression: CompressionType = CompressionType.AUTO, + lines: bool = False, + byte_range_offset: int = 0, + byte_range_size: int = 0, + keep_quotes: bool = False, + mixed_types_as_string: bool = False, + prune_columns: bool = False, + recovery_mode: JSONRecoveryMode = JSONRecoveryMode.FAIL, +) -> TableWithMetadata: ... +def write_json( + sink_info: SinkInfo, + table_w_meta: TableWithMetadata, + na_rep: str = "", + include_nulls: bool = False, + lines: bool = False, + rows_per_chunk: int = 2**32 - 1, + true_value: str = "true", + false_value: str = "false", +) -> None: ... +def chunked_read_json( + source_info: SourceInfo, + dtypes: list[NameAndType] | None = None, + compression: CompressionType = CompressionType.AUTO, + keep_quotes: bool = False, + mixed_types_as_string: bool = False, + prune_columns: bool = False, + recovery_mode: JSONRecoveryMode = JSONRecoveryMode.FAIL, + chunk_size: int = 100_000_000, +) -> tuple[list[Column], list[str], ChildNameToTypeMap]: ... diff --git a/python/pylibcudf/pylibcudf/io/json.pyx b/python/pylibcudf/pylibcudf/io/json.pyx index 29e49083bc6..ad2989925c9 100644 --- a/python/pylibcudf/pylibcudf/io/json.pyx +++ b/python/pylibcudf/pylibcudf/io/json.pyx @@ -23,6 +23,7 @@ from pylibcudf.libcudf.io.types cimport ( from pylibcudf.libcudf.types cimport data_type, size_type from pylibcudf.types cimport DataType +__all__ = ["chunked_read_json", "read_json", "write_json"] cdef map[string, schema_element] _generate_schema_map(list dtypes): cdef map[string, schema_element] schema_map @@ -59,7 +60,7 @@ cdef json_reader_options _setup_json_reader_options( json_recovery_mode_t recovery_mode): cdef vector[data_type] types_vec - cdef json_reader_options opts = move( + cdef json_reader_options opts = ( json_reader_options.builder(source_info.c_obj) .compression(compression) .lines(lines) diff --git a/python/pylibcudf/pylibcudf/io/orc.pyi b/python/pylibcudf/pylibcudf/io/orc.pyi new file mode 100644 index 00000000000..4cf87f1a832 --- /dev/null +++ b/python/pylibcudf/pylibcudf/io/orc.pyi @@ -0,0 +1,41 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
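A sketch of the `read_json` stub above for JSON-lines input; `records.jsonl` is a placeholder path:

```python
import pylibcudf as plc

result = plc.io.json.read_json(
    plc.io.SourceInfo(["records.jsonl"]),  # placeholder path
    lines=True,  # one JSON document per line
)
table = result.tbl
```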
+ +from typing import Any + +from pylibcudf.io.types import SourceInfo, TableWithMetadata +from pylibcudf.types import DataType + +def read_orc( + source_info: SourceInfo, + columns: list[str] | None = None, + stripes: list[list[int]] | None = None, + skip_rows: int = 0, + nrows: int = -1, + use_index: bool = True, + use_np_dtypes: bool = True, + timestamp_type: DataType | None = None, + decimal128_columns: list[str] | None = None, +) -> TableWithMetadata: ... + +class OrcColumnStatistics: + def __init__(self): ... + @property + def number_of_values(self) -> int | None: ... + @property + def has_null(self) -> bool | None: ... + def __getitem__(self, item: str) -> Any: ... + def __contains__(self, item: str) -> bool: ... + def get[T](self, item: str, default: None | T = None) -> T | None: ... + +class ParsedOrcStatistics: + def __init__(self): ... + @property + def column_names(self) -> list[str]: ... + @property + def file_stats(self) -> list[OrcColumnStatistics]: ... + @property + def stripes_stats(self) -> list[OrcColumnStatistics]: ... + +def read_parsed_orc_statistics( + source_info: SourceInfo, +) -> ParsedOrcStatistics: ... diff --git a/python/pylibcudf/pylibcudf/io/orc.pyx b/python/pylibcudf/pylibcudf/io/orc.pyx index 01a5e4b04a1..4270f5b4f95 100644 --- a/python/pylibcudf/pylibcudf/io/orc.pyx +++ b/python/pylibcudf/pylibcudf/io/orc.pyx @@ -30,6 +30,12 @@ from pylibcudf.libcudf.types cimport size_type from pylibcudf.types cimport DataType from pylibcudf.variant cimport get_if, holds_alternative +__all__ = [ + "OrcColumnStatistics", + "ParsedOrcStatistics", + "read_orc", + "read_parsed_orc_statistics", +] cdef class OrcColumnStatistics: def __init__(self): @@ -39,6 +45,8 @@ cdef class OrcColumnStatistics: "use `OrcColumnStatistics.from_libcudf` instead." ) + __hash__ = None + @property def number_of_values(self): if self.number_of_values_c.has_value(): @@ -183,6 +191,8 @@ cdef class OrcColumnStatistics: cdef class ParsedOrcStatistics: + __hash__ = None + @property def column_names(self): return [name.decode() for name in self.c_obj.column_names] @@ -252,7 +262,7 @@ cpdef TableWithMetadata read_orc( """ cdef orc_reader_options opts cdef vector[vector[size_type]] c_stripes - opts = move( + opts = ( orc_reader_options.builder(source_info.c_obj) .use_index(use_index) .build() diff --git a/python/pylibcudf/pylibcudf/io/parquet.pyi b/python/pylibcudf/pylibcudf/io/parquet.pyi new file mode 100644 index 00000000000..bcf1d1cce09 --- /dev/null +++ b/python/pylibcudf/pylibcudf/io/parquet.pyi @@ -0,0 +1,36 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.expressions import Expression +from pylibcudf.io.types import SourceInfo, TableWithMetadata + +class ChunkedParquetReader: + def __init__( + self, + source_info: SourceInfo, + columns: list[str] | None = None, + row_groups: list[list[int]] | None = None, + use_pandas_metadata: bool = True, + convert_strings_to_categories: bool = False, + skip_rows: int = 0, + nrows: int = 0, + chunk_read_limit: int = 0, + pass_read_limit: int = 1024000000, + allow_mismatched_pq_schemas: bool = False, + ) -> None: ... + def has_next(self) -> bool: ... + def read_chunk(self) -> TableWithMetadata: ... 
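The `ChunkedParquetReader` above is a pull-style iterator: call `has_next`/`read_chunk` until the file is exhausted, with `chunk_read_limit` acting as a soft cap on each chunk's output size. A sketch with a placeholder path, assuming `plc.concatenate.concatenate` stitches the chunks back together:

```python
import pylibcudf as plc

reader = plc.io.parquet.ChunkedParquetReader(
    plc.io.SourceInfo(["big.parquet"]),  # placeholder path
    chunk_read_limit=240_000_000,        # soft cap, in bytes, per chunk
)
chunks = []
while reader.has_next():
    chunks.append(reader.read_chunk().tbl)
full = plc.concatenate.concatenate(chunks)
```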
+ +def read_parquet( + source_info: SourceInfo, + columns: list[str] | None = None, + row_groups: list[list[int]] | None = None, + filters: Expression | None = None, + convert_strings_to_categories: bool = False, + use_pandas_metadata: bool = True, + skip_rows: int = 0, + nrows: int = -1, + allow_mismatched_pq_schemas: bool = False, + # disabled see comment in parquet.pyx for more + # reader_column_schema: ReaderColumnSchema = *, + # timestamp_type: DataType = * +) -> TableWithMetadata: ... diff --git a/python/pylibcudf/pylibcudf/io/parquet.pyx b/python/pylibcudf/pylibcudf/io/parquet.pyx index 981ca7b8159..b76a352d633 100644 --- a/python/pylibcudf/pylibcudf/io/parquet.pyx +++ b/python/pylibcudf/pylibcudf/io/parquet.pyx @@ -16,6 +16,8 @@ from pylibcudf.libcudf.io.parquet cimport ( from pylibcudf.libcudf.io.types cimport table_with_metadata from pylibcudf.libcudf.types cimport size_type +__all__ = ["ChunkedParquetReader", "read_parquet"] + cdef parquet_reader_options _setup_parquet_reader_options( SourceInfo source_info, @@ -123,6 +125,8 @@ cdef class ChunkedParquetReader: ) ) + __hash__ = None + cpdef bool has_next(self): """ Returns True if there is another chunk in the Parquet file diff --git a/python/pylibcudf/pylibcudf/io/parquet_metadata.pxd b/python/pylibcudf/pylibcudf/io/parquet_metadata.pxd new file mode 100644 index 00000000000..e421a64adc8 --- /dev/null +++ b/python/pylibcudf/pylibcudf/io/parquet_metadata.pxd @@ -0,0 +1,51 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.io.types cimport SourceInfo +from pylibcudf.libcudf.io.parquet_metadata cimport( + parquet_metadata, + parquet_schema, + parquet_column_schema, +) + +cdef class ParquetColumnSchema: + cdef parquet_column_schema column_schema + + @staticmethod + cdef from_column_schema(parquet_column_schema column_schema) + + cpdef str name(self) + + cpdef int num_children(self) + + cpdef ParquetColumnSchema child(self, int idx) + + cpdef list children(self) + + +cdef class ParquetSchema: + cdef parquet_schema schema + + @staticmethod + cdef from_schema(parquet_schema schema) + + cpdef ParquetColumnSchema root(self) + + +cdef class ParquetMetadata: + cdef parquet_metadata meta + + @staticmethod + cdef from_metadata(parquet_metadata meta) + + cpdef ParquetSchema schema(self) + + cpdef int num_rows(self) + + cpdef int num_rowgroups(self) + + cpdef dict metadata(self) + + cpdef list rowgroup_metadata(self) + + +cpdef ParquetMetadata read_parquet_metadata(SourceInfo src_info) diff --git a/python/pylibcudf/pylibcudf/io/parquet_metadata.pyx b/python/pylibcudf/pylibcudf/io/parquet_metadata.pyx new file mode 100644 index 00000000000..0ad4dafb0cf --- /dev/null +++ b/python/pylibcudf/pylibcudf/io/parquet_metadata.pyx @@ -0,0 +1,214 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.io.types cimport SourceInfo +from pylibcudf.libcudf.io cimport parquet_metadata as cpp_parquet_metadata + + +__all__ = [ + "ParquetColumnSchema", + "ParquetMetadata", + "ParquetSchema", + "read_parquet_metadata", +] + +cdef class ParquetColumnSchema: + """ + Schema of a parquet column, including the nested columns. 
+ + Parameters + ---------- + parquet_column_schema + """ + def __init__(self): + raise ValueError("Construct ParquetColumnSchema with from_column_schema.") + + @staticmethod + cdef from_column_schema(cpp_parquet_metadata.parquet_column_schema column_schema): + cdef ParquetColumnSchema result = ParquetColumnSchema.__new__( + ParquetColumnSchema + ) + result.column_schema = column_schema + return result + + cpdef str name(self): + """ + Returns parquet column name; can be empty. + + Returns + ------- + str + Column name + """ + return self.column_schema.name().decode() + + cpdef int num_children(self): + """ + Returns the number of child columns. + + Returns + ------- + int + Children count + """ + return self.column_schema.num_children() + + cpdef ParquetColumnSchema child(self, int idx): + """ + Returns schema of the child with the given index. + + Parameters + ---------- + idx : int + Child Index + + Returns + ------- + ParquetColumnSchema + Child schema + """ + return ParquetColumnSchema.from_column_schema(self.column_schema.child(idx)) + + cpdef list children(self): + """ + Returns schemas of all child columns. + + Returns + ------- + list[ParquetColumnSchema] + Child schemas. + """ + cdef cpp_parquet_metadata.parquet_column_schema child + return [ + ParquetColumnSchema.from_column_schema(child) + for child in self.column_schema.children() + ] + + +cdef class ParquetSchema: + """ + Schema of a parquet file. + + Parameters + ---------- + parquet_schema + """ + + def __init__(self): + raise ValueError("Construct ParquetSchema with from_schema.") + + @staticmethod + cdef from_schema(cpp_parquet_metadata.parquet_schema schema): + cdef ParquetSchema result = ParquetSchema.__new__(ParquetSchema) + result.schema = schema + return result + + cpdef ParquetColumnSchema root(self): + """ + Returns the schema of the struct column that contains all columns as fields. + + Returns + ------- + ParquetColumnSchema + Root column schema + """ + return ParquetColumnSchema.from_column_schema(self.schema.root()) + + +cdef class ParquetMetadata: + """ + Information about content of a parquet file. + + Parameters + ---------- + parquet_metadata + """ + + def __init__(self): + raise ValueError("Construct ParquetMetadata with from_metadata.") + + @staticmethod + cdef from_metadata(cpp_parquet_metadata.parquet_metadata meta): + cdef ParquetMetadata result = ParquetMetadata.__new__(ParquetMetadata) + result.meta = meta + return result + + cpdef ParquetSchema schema(self): + """ + Returns the parquet schema. + + Returns + ------- + ParquetSchema + Parquet schema + """ + return ParquetSchema.from_schema(self.meta.schema()) + + cpdef int num_rows(self): + """ + Returns the number of rows of the root column. + + Returns + ------- + int + Number of rows + """ + return self.meta.num_rows() + + cpdef int num_rowgroups(self): + """ + Returns the number of rowgroups in the file. + + Returns + ------- + int + Number of row groups. + """ + return self.meta.num_rowgroups() + + cpdef dict metadata(self): + """ + Returns the key-value metadata in the file footer. + + Returns + ------- + dict[str, str] + Key value metadata as a map. + """ + return {key.decode(): val.decode() for key, val in self.meta.metadata()} + + cpdef list rowgroup_metadata(self): + """ + Returns the row group metadata in the file footer. + + Returns + ------- + list[dict[str, int]] + Vector of row group metadata as maps. 
+        """
+        return [
+            {key.decode(): val for key, val in metadata}
+            for metadata in self.meta.rowgroup_metadata()
+        ]
+
+
+cpdef ParquetMetadata read_parquet_metadata(SourceInfo src_info):
+    """
+    Reads the metadata of a parquet dataset.
+
+    Parameters
+    ----------
+    src_info : SourceInfo
+        Dataset source.
+
+    Returns
+    -------
+    ParquetMetadata
+        ParquetMetadata with the parquet schema, number of rows,
+        number of row groups and key-value metadata.
+    """
+    cdef cpp_parquet_metadata.parquet_metadata c_result
+
+    with nogil:
+        c_result = cpp_parquet_metadata.read_parquet_metadata(src_info.c_obj)
+
+    return ParquetMetadata.from_metadata(c_result)
diff --git a/python/pylibcudf/pylibcudf/io/text.pxd b/python/pylibcudf/pylibcudf/io/text.pxd
new file mode 100644
index 00000000000..051e9bc0cde
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/io/text.pxd
@@ -0,0 +1,30 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from libcpp.memory cimport unique_ptr
+from libcpp.string cimport string
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.io.text cimport parse_options, data_chunk_source
+
+cdef class ParseOptions:
+    cdef parse_options c_options
+
+cdef class DataChunkSource:
+    cdef unique_ptr[data_chunk_source] c_source
+    cdef string data_ref
+
+
+cpdef Column multibyte_split(
+    DataChunkSource source,
+    str delimiter,
+    ParseOptions options=*
+)
+
+cpdef DataChunkSource make_source(str data)
+
+cpdef DataChunkSource make_source_from_file(str filename)
+
+cpdef DataChunkSource make_source_from_bgzip_file(
+    str filename,
+    int virtual_begin=*,
+    int virtual_end=*,
+)
diff --git a/python/pylibcudf/pylibcudf/io/text.pyx b/python/pylibcudf/pylibcudf/io/text.pyx
new file mode 100644
index 00000000000..d3cbdc4cd60
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/io/text.pyx
@@ -0,0 +1,202 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from cython.operator cimport dereference
+from libc.stdint cimport uint64_t
+from libcpp.memory cimport unique_ptr
+from libcpp.string cimport string
+from libcpp.utility cimport move
+
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.io cimport text as cpp_text
+
+__all__ = [
+    "DataChunkSource",
+    "ParseOptions",
+    "make_source",
+    "make_source_from_bgzip_file",
+    "make_source_from_file",
+    "multibyte_split",
+]
+
+cdef class ParseOptions:
+    """
+    Parsing options for `multibyte_split`
+
+    Parameters
+    ----------
+    byte_range : list | tuple, default None
+        Only rows starting inside this byte range will be
+        part of the output column.
+
+    strip_delimiters : bool, default False
+        Whether delimiters at the end of rows should
+        be stripped from the output column.
+    """
+    def __init__(
+        self,
+        *,
+        byte_range=None,
+        strip_delimiters=False,
+    ):
+        self.c_options = cpp_text.parse_options()
+        if byte_range is not None:
+            c_byte_range_offset = byte_range[0]
+            c_byte_range_size = byte_range[1]
+            self.c_options.byte_range = cpp_text.byte_range_info(
+                c_byte_range_offset,
+                c_byte_range_size
+            )
+        self.c_options.strip_delimiters = strip_delimiters
+
+
+cdef class DataChunkSource:
+    """
+    Data source for `multibyte_split`
+
+    Parameters
+    ----------
+    data : str
+        Filename or the data itself.
+    """
+
+    def __cinit__(self, str data):
+        # Need to keep a reference alive for make_source
+        self.data_ref = data.encode()
+
+
+cpdef DataChunkSource make_source(str data):
+    """
+    Creates a data source capable of producing device-buffered views
+    of the given string.
+
+    Parameters
+    ----------
+    data : str
+        The host data to be exposed as a data chunk source.
+
+    Returns
+    -------
+    DataChunkSource
+        The data chunk source for the provided host data.
+    """
+    cdef DataChunkSource dcs = DataChunkSource(data)
+    with nogil:
+        dcs.c_source = move(cpp_text.make_source(dcs.data_ref))
+    return dcs
+
+
+cpdef DataChunkSource make_source_from_file(str filename):
+    """
+    Creates a data source capable of producing device-buffered views of the file.
+
+    Parameters
+    ----------
+    filename : str
+        The filename of the file to be exposed as a data chunk source.
+
+    Returns
+    -------
+    DataChunkSource
+        The data chunk source for the provided filename.
+    """
+    cdef DataChunkSource dcs = DataChunkSource(filename)
+    with nogil:
+        dcs.c_source = move(cpp_text.make_source_from_file(dcs.data_ref))
+    return dcs
+
+cpdef DataChunkSource make_source_from_bgzip_file(
+    str filename,
+    int virtual_begin=-1,
+    int virtual_end=-1,
+):
+    """
+    Creates a data source capable of producing device-buffered views of
+    a BGZIP compressed file with virtual record offsets.
+
+    Parameters
+    ----------
+    filename : str
+        The filename of the BGZIP-compressed file to be exposed as a data chunk source.
+
+    virtual_begin : int, default -1
+        The virtual (Tabix) offset of the first byte to be read. Its upper 48 bits
+        describe the offset into the compressed file, its lower 16 bits describe the
+        block-local offset.
+
+    virtual_end : int, default -1
+        The virtual (Tabix) offset one past the last byte to be read.
+
+    Returns
+    -------
+    DataChunkSource
+        The data chunk source for the provided filename.
+    """
+    cdef uint64_t c_virtual_begin
+    cdef uint64_t c_virtual_end
+    cdef DataChunkSource dcs = DataChunkSource(filename)
+
+    if virtual_begin == -1 and virtual_end == -1:
+        with nogil:
+            dcs.c_source = move(cpp_text.make_source_from_bgzip_file(dcs.data_ref))
+    elif virtual_begin != -1 and virtual_end != -1:
+        c_virtual_begin = virtual_begin
+        c_virtual_end = virtual_end
+        with nogil:
+            dcs.c_source = move(
+                cpp_text.make_source_from_bgzip_file(
+                    dcs.data_ref,
+                    c_virtual_begin,
+                    c_virtual_end,
+                )
+            )
+    else:
+        raise ValueError(
+            "virtual_begin and virtual_end must both be set "
+            "or both be left at the default (-1)"
+        )
+    return dcs
+
+cpdef Column multibyte_split(
+    DataChunkSource source,
+    str delimiter,
+    ParseOptions options=None
+):
+    """
+    Splits the source text into a strings column using a multi-byte delimiter.
+
+    For details, see :cpp:func:`cudf::io::text::multibyte_split`
+
+    Parameters
+    ----------
+    source : DataChunkSource
+        The data chunk source to split.
+
+    delimiter : str
+        UTF-8 encoded string for which to find offsets in the source.
+
+    options : ParseOptions
+        The parsing options to use (including byte range).
+
+    Returns
+    -------
+    Column
+        The strings found by splitting the source by the delimiter
+        within the relevant byte range.
+    """
+    cdef unique_ptr[column] c_result
+    cdef unique_ptr[data_chunk_source] c_source = move(source.c_source)
+    cdef string c_delimiter = delimiter.encode()
+
+    if options is None:
+        options = ParseOptions()
+
+    cdef cpp_text.parse_options c_options = options.c_options
+
+    with nogil:
+        c_result = cpp_text.multibyte_split(
+            dereference(c_source),
+            c_delimiter,
+            c_options
+        )
+
+    return Column.from_libcudf(move(c_result))
diff --git a/python/pylibcudf/pylibcudf/io/timezone.pyi b/python/pylibcudf/pylibcudf/io/timezone.pyi
new file mode 100644
index 00000000000..0582800c4af
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/io/timezone.pyi
@@ -0,0 +1,7 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
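Reviewer note: the text API added above composes in two steps: build a DataChunkSource, then split it. A minimal sketch, assuming io/__init__.py re-exports the new text module (payload and delimiter are made up):

# Sketch: split an in-memory string on a multi-byte delimiter.
import pylibcudf as plc

src = plc.io.text.make_source("a||b||c")  # device-buffered view of host data
opts = plc.io.text.ParseOptions(strip_delimiters=True)
col = plc.io.text.multibyte_split(src, "||", options=opts)
# col is a strings Column holding ["a", "b", "c"]; note that
# multibyte_split takes ownership of src's underlying chunk source.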
+ +from pylibcudf.table import Table + +def make_timezone_transition_table( + tzif_dir: str, timezone_name: str +) -> Table: ... diff --git a/python/pylibcudf/pylibcudf/io/timezone.pyx b/python/pylibcudf/pylibcudf/io/timezone.pyx index e02239d7252..af7cf8a4ee5 100644 --- a/python/pylibcudf/pylibcudf/io/timezone.pyx +++ b/python/pylibcudf/pylibcudf/io/timezone.pyx @@ -11,6 +11,7 @@ from pylibcudf.libcudf.table.table cimport table from ..table cimport Table +__all__ = ["make_timezone_transition_table"] cpdef Table make_timezone_transition_table(str tzif_dir, str timezone_name): """ @@ -33,11 +34,9 @@ cpdef Table make_timezone_transition_table(str tzif_dir, str timezone_name): cdef string c_tzname = timezone_name.encode() with nogil: - c_result = move( - cpp_make_timezone_transition_table( - make_optional[string](c_tzdir), - c_tzname - ) + c_result = cpp_make_timezone_transition_table( + make_optional[string](c_tzdir), + c_tzname ) return Table.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/io/types.pyi b/python/pylibcudf/pylibcudf/io/types.pyi new file mode 100644 index 00000000000..a4f4fc13bdc --- /dev/null +++ b/python/pylibcudf/pylibcudf/io/types.pyi @@ -0,0 +1,97 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +import io +import os +from collections.abc import Mapping +from enum import IntEnum +from typing import Any, Literal, TypeAlias, overload + +from pylibcudf.column import Column +from pylibcudf.io.datasource import Datasource +from pylibcudf.table import Table + +class JSONRecoveryMode(IntEnum): + FAIL = ... + RECOVER_WITH_NULL = ... + +class CompressionType(IntEnum): + NONE = ... + AUTO = ... + SNAPPY = ... + GZIP = ... + BZIP2 = ... + BROTLI = ... + ZIP = ... + XZ = ... + ZLIB = ... + LZ4 = ... + LZO = ... + ZSTD = ... + +class ColumnEncoding(IntEnum): + USE_DEFAULT = ... + DICTIONARY = ... + PLAIN = ... + DELTA_BINARY_PACKED = ... + DELTA_LENGTH_BYTE_ARRAY = ... + DELTA_BYTE_ARRAY = ... + BYTE_STREAM_SPLIT = ... + DIRECT = ... + DIRECT_V2 = ... + DICTIONARY_V2 = ... + +class DictionaryPolicy(IntEnum): + NEVER = ... + ADAPTIVE = ... + ALWAYS = ... + +class StatisticsFreq(IntEnum): + STATISTICS_NONE = ... + STATISTICS_ROWGROUP = ... + STATISTICS_PAGE = ... + STATISTICS_COLUMN = ... + +class QuoteStyle(IntEnum): + MINIMAL = ... + ALL = ... + NONNUMERIC = ... + NONE = ... + +ColumnNameSpec: TypeAlias = tuple[str, list[ColumnNameSpec]] +ChildNameSpec: TypeAlias = Mapping[str, ChildNameSpec] + +class TableWithMetadata: + tbl: Table + def __init__( + self, tbl: Table, column_names: list[ColumnNameSpec] + ) -> None: ... + @property + def columns(self) -> list[Column]: ... + @overload + def column_names(self, include_children: Literal[False]) -> list[str]: ... + @overload + def column_names( + self, include_children: Literal[True] + ) -> list[ColumnNameSpec]: ... + @overload + def column_names( + self, include_children: bool = False + ) -> list[str] | list[ColumnNameSpec]: ... + @property + def child_names(self) -> ChildNameSpec: ... + @property + def per_file_user_data(self) -> list[Mapping[str, str]]: ... + +class SourceInfo: + def __init__( + self, sources: list[str] | list[os.PathLike[Any]] | list[Datasource] + ) -> None: ... + +class SinkInfo: + def __init__( + self, + sinks: list[os.PathLike[Any]] + | list[io.StringIO] + | list[io.BytesIO] + | list[io.TextIOBase] + | list[str], + ) -> None: ... 
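Reviewer note: the stubs above pair with two behavioral changes in the types.pyx hunk that follows: SourceInfo stops requiring local files for s3:// paths, and SinkInfo accepts any mix of io.IOBase sinks. A minimal sketch under those assumptions (bucket and key names are placeholders):

# Sketch: the relaxed SourceInfo/SinkInfo validation in this PR.
import io
import pylibcudf as plc

# s3:// URLs now bypass the local os.path.isfile() existence check.
remote = plc.io.SourceInfo(["s3://bucket/key.parquet"])  # placeholder URL

# io.IOBase subclasses may now be mixed freely in one SinkInfo;
# mixing an io.IOBase sink with a filename string still raises ValueError.
sinks = plc.io.SinkInfo([io.BytesIO(), io.StringIO()])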
diff --git a/python/pylibcudf/pylibcudf/io/types.pyx b/python/pylibcudf/pylibcudf/io/types.pyx
index 563a02761da..51d5bda75c7 100644
--- a/python/pylibcudf/pylibcudf/io/types.pyx
+++ b/python/pylibcudf/pylibcudf/io/types.pyx
@@ -20,12 +20,29 @@ import codecs
 import errno
 import io
 import os
+import re
 
 from pylibcudf.libcudf.io.json import \
     json_recovery_mode_t as JSONRecoveryMode  # no-cython-lint
-from pylibcudf.libcudf.io.types import \
-    compression_type as CompressionType  # no-cython-lint
+from pylibcudf.libcudf.io.types import (
+    compression_type as CompressionType,  # no-cython-lint
+    column_encoding as ColumnEncoding,  # no-cython-lint
+    dictionary_policy as DictionaryPolicy,  # no-cython-lint
+    quote_style as QuoteStyle,  # no-cython-lint
+    statistics_freq as StatisticsFreq,  # no-cython-lint
+)
+
+__all__ = [
+    "ColumnEncoding",
+    "CompressionType",
+    "DictionaryPolicy",
+    "JSONRecoveryMode",
+    "QuoteStyle",
+    "SinkInfo",
+    "SourceInfo",
+    "StatisticsFreq",
+    "TableWithMetadata",
+]
 
 cdef class TableWithMetadata:
     """A container holding a table and its associated metadata
@@ -49,6 +66,8 @@ cdef class TableWithMetadata:
 
         self.metadata.schema_info = self._make_column_info(column_names)
 
+    __hash__ = None
+
     cdef vector[column_name_info] _make_column_info(self, list column_names):
         cdef vector[column_name_info] col_name_infos
         cdef column_name_info info
@@ -143,6 +162,8 @@ cdef class SourceInfo:
 
     Mixing different types of sources will raise a `ValueError`.
     """
+    # Regular expression that matches remote file paths supported by libcudf
+    _is_remote_file_pattern = re.compile(r"^s3://", re.IGNORECASE)
 
     def __init__(self, list sources):
         if not sources:
@@ -157,11 +178,12 @@
         for src in sources:
             if not isinstance(src, (os.PathLike, str)):
                 raise ValueError("All sources must be of the same type!")
-            if not os.path.isfile(src):
-                raise FileNotFoundError(errno.ENOENT,
-                                        os.strerror(errno.ENOENT),
-                                        src)
-
+            if not (os.path.isfile(src) or self._is_remote_file_pattern.match(src)):
+                raise FileNotFoundError(
+                    errno.ENOENT, os.strerror(errno.ENOENT), src
+                )
+            # TODO: Keep the sources alive (self.byte_sources = sources)
+            # for str data (e.g. read_json)?
             c_files.push_back(<string> str(src).encode())
 
         self.c_obj = move(source_info(c_files))
@@ -213,6 +235,8 @@
 
         self.c_obj = source_info(c_host_buffers)
 
+    __hash__ = None
+
 
 # Adapts a python io.IOBase object as a libcudf IO data_sink. This lets you
 # write from cudf to any python file-like object (File/BytesIO/SocketIO etc)
@@ -237,18 +261,24 @@ cdef cppclass iobase_data_sink(data_sink):
 
 
 cdef class SinkInfo:
-    """A class containing details on a source to read from.
+    """
+    A class containing details about destinations (sinks) to write data to.
 
-    For details, see :cpp:class:`cudf::io::sink_info`.
+    For more details, see :cpp:class:`cudf::io::sink_info`.
 
     Parameters
     ----------
-    sinks : list of str, PathLike, BytesIO, StringIO
+    sinks : list of str, PathLike, or io.IOBase instances
+        A list of sinks to write data to. Each sink can be:
 
-        A homogeneous list of sinks (this can be a string filename,
-        bytes, or one of the Python I/O classes) to read from.
+        - A string representing a filename.
+        - A PathLike object.
+        - An instance of a Python I/O class that is a subclass of io.IOBase
+          (e.g., io.BytesIO, io.StringIO).
 
-    Mixing different types of sinks will raise a `ValueError`.
+        The list must be homogeneous in type unless all sinks are instances
+        of subclasses of io.IOBase. Mixing different types of sinks
+        (that are not all io.IOBase instances) will raise a ValueError.
     """
 
     def __init__(self, list sinks):
@@ -256,32 +286,42 @@ cdef class SinkInfo:
         cdef vector[string] paths
 
         if not sinks:
-            raise ValueError("Need to pass at least one sink")
+            raise ValueError("At least one sink must be provided.")
 
         if isinstance(sinks[0], os.PathLike):
             sinks = [os.path.expanduser(s) for s in sinks]
 
         cdef object initial_sink_cls = type(sinks[0])
 
-        if not all(isinstance(s, initial_sink_cls) for s in sinks):
-            raise ValueError("All sinks must be of the same type!")
+        if not all(
+            isinstance(s, initial_sink_cls) or (
+                isinstance(sinks[0], io.IOBase) and isinstance(s, io.IOBase)
+            ) for s in sinks
+        ):
+            raise ValueError(
+                "All sinks must be of the same type unless they are all instances "
+                "of subclasses of io.IOBase."
+            )
 
-        if initial_sink_cls in {io.StringIO, io.BytesIO, io.TextIOBase}:
+        if isinstance(sinks[0], io.IOBase):
             data_sinks.reserve(len(sinks))
-            if isinstance(sinks[0], (io.StringIO, io.BytesIO)):
-                for s in sinks:
+            for s in sinks:
+                if isinstance(s, (io.StringIO, io.BytesIO)):
                     self.sink_storage.push_back(
                         unique_ptr[data_sink](new iobase_data_sink(s))
                     )
-            elif isinstance(sinks[0], io.TextIOBase):
-                for s in sinks:
-                    if codecs.lookup(s).name not in ('utf-8', 'ascii'):
+                elif isinstance(s, io.TextIOBase):
+                    if codecs.lookup(s.encoding).name not in ('utf-8', 'ascii'):
                         raise NotImplementedError(f"Unsupported encoding {s.encoding}")
                     self.sink_storage.push_back(
                         unique_ptr[data_sink](new iobase_data_sink(s.buffer))
                     )
-            data_sinks.push_back(self.sink_storage.back().get())
-        elif initial_sink_cls is str:
+                else:
+                    self.sink_storage.push_back(
+                        unique_ptr[data_sink](new iobase_data_sink(s))
+                    )
+                data_sinks.push_back(self.sink_storage.back().get())
+        elif isinstance(sinks[0], str):
             paths.reserve(len(sinks))
             for s in sinks:
                 paths.push_back(<string> s.encode())
@@ -295,3 +335,5 @@
         else:
             # we don't have sinks so we must have paths to sinks
             self.c_obj = sink_info(paths)
+
+    __hash__ = None
diff --git a/python/pylibcudf/pylibcudf/join.pxd b/python/pylibcudf/pylibcudf/join.pxd
index 06969b4a2db..bb9162b466a 100644
--- a/python/pylibcudf/pylibcudf/join.pxd
+++ b/python/pylibcudf/pylibcudf/join.pxd
@@ -3,6 +3,7 @@ from pylibcudf.libcudf.types cimport null_equality
 
 from .column cimport Column
+from .expressions cimport Expression
 from .table cimport Table
 
@@ -37,3 +38,78 @@ cpdef Column left_anti_join(
 )
 
 cpdef Table cross_join(Table left, Table right)
+
+cpdef tuple conditional_inner_join(
+    Table left,
+    Table right,
+    Expression binary_predicate,
+)
+
+cpdef tuple conditional_left_join(
+    Table left,
+    Table right,
+    Expression binary_predicate,
+)
+
+cpdef tuple conditional_full_join(
+    Table left,
+    Table right,
+    Expression binary_predicate,
+)
+
+cpdef Column conditional_left_semi_join(
+    Table left,
+    Table right,
+    Expression binary_predicate,
+)
+
+cpdef Column conditional_left_anti_join(
+    Table left,
+    Table right,
+    Expression binary_predicate,
+)
+
+cpdef tuple mixed_inner_join(
+    Table left_keys,
+    Table right_keys,
+    Table left_conditional,
+    Table right_conditional,
+    Expression binary_predicate,
+    null_equality nulls_equal
+)
+
+cpdef tuple mixed_left_join(
+    Table left_keys,
+    Table right_keys,
+    Table left_conditional,
+    Table right_conditional,
+    Expression binary_predicate,
+    null_equality nulls_equal
+)
+
+cpdef tuple mixed_full_join(
+    Table left_keys,
+    Table right_keys,
+    Table left_conditional,
+    Table right_conditional,
+    Expression
binary_predicate, + null_equality nulls_equal +) + +cpdef Column mixed_left_semi_join( + Table left_keys, + Table right_keys, + Table left_conditional, + Table right_conditional, + Expression binary_predicate, + null_equality nulls_equal +) + +cpdef Column mixed_left_anti_join( + Table left_keys, + Table right_keys, + Table left_conditional, + Table right_conditional, + Expression binary_predicate, + null_equality nulls_equal +) diff --git a/python/pylibcudf/pylibcudf/join.pyi b/python/pylibcudf/pylibcudf/join.pyi new file mode 100644 index 00000000000..f34357baa67 --- /dev/null +++ b/python/pylibcudf/pylibcudf/join.pyi @@ -0,0 +1,78 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.expressions import Expression +from pylibcudf.table import Table +from pylibcudf.types import NullEquality + +def inner_join( + left_keys: Table, right_keys: Table, nulls_equal: NullEquality +) -> tuple[Column, Column]: ... +def left_join( + left_keys: Table, right_keys: Table, nulls_equal: NullEquality +) -> tuple[Column, Column]: ... +def full_join( + left_keys: Table, right_keys: Table, nulls_equal: NullEquality +) -> tuple[Column, Column]: ... +def left_semi_join( + left_keys: Table, right_keys: Table, nulls_equal: NullEquality +) -> Column: ... +def left_anti_join( + left_keys: Table, right_keys: Table, nulls_equal: NullEquality +) -> Column: ... +def cross_join(left: Table, right: Table) -> Table: ... +def conditional_inner_join( + left: Table, right: Table, binary_predicate: Expression +) -> tuple[Column, Column]: ... +def conditional_left_join( + left: Table, right: Table, binary_predicate: Expression +) -> tuple[Column, Column]: ... +def conditional_full_join( + left: Table, right: Table, binary_predicate: Expression +) -> tuple[Column, Column]: ... +def conditional_left_semi_join( + left: Table, right: Table, binary_predicate: Expression +) -> Column: ... +def conditional_left_anti_join( + left: Table, right: Table, binary_predicate: Expression +) -> Column: ... +def mixed_inner_join( + left_keys: Table, + right_keys: Table, + left_conditional: Table, + right_conditional: Table, + binary_predicate: Expression, + nulls_equal: NullEquality, +) -> tuple[Column, Column]: ... +def mixed_left_join( + left_keys: Table, + right_keys: Table, + left_conditional: Table, + right_conditional: Table, + binary_predicate: Expression, + nulls_equal: NullEquality, +) -> tuple[Column, Column]: ... +def mixed_full_join( + left_keys: Table, + right_keys: Table, + left_conditional: Table, + right_conditional: Table, + binary_predicate: Expression, + nulls_equal: NullEquality, +) -> tuple[Column, Column]: ... +def mixed_left_semi_join( + left_keys: Table, + right_keys: Table, + left_conditional: Table, + right_conditional: Table, + binary_predicate: Expression, + nulls_equal: NullEquality, +) -> Column: ... +def mixed_left_anti_join( + left_keys: Table, + right_keys: Table, + left_conditional: Table, + right_conditional: Table, + binary_predicate: Expression, + nulls_equal: NullEquality, +) -> Column: ... 
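Reviewer note: every conditional/mixed join stub above threads an Expression predicate into libcudf's AST join. A minimal sketch of building such a predicate, assuming the spellings in pylibcudf.expressions (left_table and right_table stand in for Tables built elsewhere):

# Sketch: conditional inner join on left.col0 > right.col0.
import pylibcudf as plc
from pylibcudf.expressions import (
    ASTOperator,
    ColumnReference,
    Operation,
    TableReference,
)

predicate = Operation(
    ASTOperator.GREATER,
    ColumnReference(0, TableReference.LEFT),   # column 0 of the left table
    ColumnReference(0, TableReference.RIGHT),  # column 0 of the right table
)
# The join returns gather maps (row-index Columns), not materialized rows.
left_rows, right_rows = plc.join.conditional_inner_join(
    left_table, right_table, predicate
)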
diff --git a/python/pylibcudf/pylibcudf/join.pyx b/python/pylibcudf/pylibcudf/join.pyx index 25664286f19..c2efe05ffc4 100644 --- a/python/pylibcudf/pylibcudf/join.pyx +++ b/python/pylibcudf/pylibcudf/join.pyx @@ -9,11 +9,30 @@ from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.table.table cimport table from pylibcudf.libcudf.types cimport null_equality -from rmm._lib.device_buffer cimport device_buffer +from rmm.librmm.device_buffer cimport device_buffer from .column cimport Column +from .expressions cimport Expression from .table cimport Table +__all__ = [ + "conditional_full_join", + "conditional_inner_join", + "conditional_left_anti_join", + "conditional_left_join", + "conditional_left_semi_join", + "cross_join", + "full_join", + "inner_join", + "left_anti_join", + "left_join", + "left_semi_join", + "mixed_full_join", + "mixed_inner_join", + "mixed_left_anti_join", + "mixed_left_join", + "mixed_left_semi_join", +] cdef Column _column_from_gather_map(cpp_join.gather_map_type gather_map): # helper to convert a gather map to a Column @@ -212,5 +231,409 @@ cpdef Table cross_join(Table left, Table right): """ cdef unique_ptr[table] result with nogil: - result = move(cpp_join.cross_join(left.view(), right.view())) + result = cpp_join.cross_join(left.view(), right.view()) return Table.from_libcudf(move(result)) + + +cpdef tuple conditional_inner_join( + Table left, + Table right, + Expression binary_predicate, +): + """Perform a conditional inner join between two tables. + + For details, see :cpp:func:`conditional_inner_join`. + + Parameters + ---------- + left : Table + The left table to join. + right : Table + The right table to join. + binary_predicate : Expression + Condition to join on. + + Returns + ------- + Tuple[Column, Column] + A tuple containing the row indices from the left and right tables after the + join. + """ + cdef cpp_join.gather_map_pair_type c_result + with nogil: + c_result = cpp_join.conditional_inner_join( + left.view(), right.view(), dereference(binary_predicate.c_obj.get()) + ) + return ( + _column_from_gather_map(move(c_result.first)), + _column_from_gather_map(move(c_result.second)), + ) + + +cpdef tuple conditional_left_join( + Table left, + Table right, + Expression binary_predicate, +): + """Perform a conditional left join between two tables. + + For details, see :cpp:func:`conditional_left_join`. + + Parameters + ---------- + left : Table + The left table to join. + right : Table + The right table to join. + binary_predicate : Expression + Condition to join on. + + Returns + ------- + Tuple[Column, Column] + A tuple containing the row indices from the left and right tables after the + join. + """ + cdef cpp_join.gather_map_pair_type c_result + with nogil: + c_result = cpp_join.conditional_left_join( + left.view(), right.view(), dereference(binary_predicate.c_obj.get()) + ) + return ( + _column_from_gather_map(move(c_result.first)), + _column_from_gather_map(move(c_result.second)), + ) + + +cpdef tuple conditional_full_join( + Table left, + Table right, + Expression binary_predicate, +): + """Perform a conditional full join between two tables. + + For details, see :cpp:func:`conditional_full_join`. + + Parameters + ---------- + left : Table + The left table to join. + right : Table + The right table to join. + binary_predicate : Expression + Condition to join on. + + Returns + ------- + Tuple[Column, Column] + A tuple containing the row indices from the left and right tables after the + join. 
+ """ + cdef cpp_join.gather_map_pair_type c_result + with nogil: + c_result = cpp_join.conditional_full_join( + left.view(), right.view(), dereference(binary_predicate.c_obj.get()) + ) + return ( + _column_from_gather_map(move(c_result.first)), + _column_from_gather_map(move(c_result.second)), + ) + + +cpdef Column conditional_left_semi_join( + Table left, + Table right, + Expression binary_predicate, +): + """Perform a conditional left semi join between two tables. + + For details, see :cpp:func:`conditional_left_semi_join`. + + Parameters + ---------- + left : Table + The left table to join. + right : Table + The right table to join. + binary_predicate : Expression + Condition to join on. + + Returns + ------- + Column + A column containing the row indices from the left table after the join. + """ + cdef cpp_join.gather_map_type c_result + with nogil: + c_result = cpp_join.conditional_left_semi_join( + left.view(), right.view(), dereference(binary_predicate.c_obj.get()) + ) + return _column_from_gather_map(move(c_result)) + + +cpdef Column conditional_left_anti_join( + Table left, + Table right, + Expression binary_predicate, +): + """Perform a conditional left anti join between two tables. + + For details, see :cpp:func:`conditional_left_anti_join`. + + Parameters + ---------- + left : Table + The left table to join. + right : Table + The right table to join. + binary_predicate : Expression + Condition to join on. + + Returns + ------- + Column + A column containing the row indices from the left table after the join. + """ + cdef cpp_join.gather_map_type c_result + with nogil: + c_result = cpp_join.conditional_left_anti_join( + left.view(), right.view(), dereference(binary_predicate.c_obj.get()) + ) + return _column_from_gather_map(move(c_result)) + + +cpdef tuple mixed_inner_join( + Table left_keys, + Table right_keys, + Table left_conditional, + Table right_conditional, + Expression binary_predicate, + null_equality nulls_equal +): + """Perform a mixed inner join between two tables. + + For details, see :cpp:func:`mixed_inner_join`. + + Parameters + ---------- + left_keys : Table + The left table to use for the equality join. + right_keys : Table + The right table to use for the equality join. + left_conditional : Table + The left table to use for the conditional join. + right_conditional : Table + The right table to use for the conditional join. + binary_predicate : Expression + Condition to join on. + nulls_equal : NullEquality + Should nulls compare equal in the equality join? + + Returns + ------- + Tuple[Column, Column] + A tuple containing the row indices from the left and right tables after the + join. + """ + cdef cpp_join.gather_map_pair_type c_result + with nogil: + c_result = cpp_join.mixed_inner_join( + left_keys.view(), + right_keys.view(), + left_conditional.view(), + right_conditional.view(), + dereference(binary_predicate.c_obj.get()), + nulls_equal, + ) + return ( + _column_from_gather_map(move(c_result.first)), + _column_from_gather_map(move(c_result.second)), + ) + + +cpdef tuple mixed_left_join( + Table left_keys, + Table right_keys, + Table left_conditional, + Table right_conditional, + Expression binary_predicate, + null_equality nulls_equal +): + """Perform a mixed left join between two tables. + + For details, see :cpp:func:`mixed_left_join`. + + Parameters + ---------- + left_keys : Table + The left table to use for the equality join. + right_keys : Table + The right table to use for the equality join. 
+ left_conditional : Table + The left table to use for the conditional join. + right_conditional : Table + The right table to use for the conditional join. + binary_predicate : Expression + Condition to join on. + nulls_equal : NullEquality + Should nulls compare equal in the equality join? + + Returns + ------- + Tuple[Column, Column] + A tuple containing the row indices from the left and right tables after the + join. + """ + cdef cpp_join.gather_map_pair_type c_result + with nogil: + c_result = cpp_join.mixed_left_join( + left_keys.view(), + right_keys.view(), + left_conditional.view(), + right_conditional.view(), + dereference(binary_predicate.c_obj.get()), + nulls_equal, + ) + return ( + _column_from_gather_map(move(c_result.first)), + _column_from_gather_map(move(c_result.second)), + ) + + +cpdef tuple mixed_full_join( + Table left_keys, + Table right_keys, + Table left_conditional, + Table right_conditional, + Expression binary_predicate, + null_equality nulls_equal +): + """Perform a mixed full join between two tables. + + For details, see :cpp:func:`mixed_full_join`. + + Parameters + ---------- + left_keys : Table + The left table to use for the equality join. + right_keys : Table + The right table to use for the equality join. + left_conditional : Table + The left table to use for the conditional join. + right_conditional : Table + The right table to use for the conditional join. + binary_predicate : Expression + Condition to join on. + nulls_equal : NullEquality + Should nulls compare equal in the equality join? + + Returns + ------- + Tuple[Column, Column] + A tuple containing the row indices from the left and right tables after the + join. + """ + cdef cpp_join.gather_map_pair_type c_result + with nogil: + c_result = cpp_join.mixed_full_join( + left_keys.view(), + right_keys.view(), + left_conditional.view(), + right_conditional.view(), + dereference(binary_predicate.c_obj.get()), + nulls_equal, + ) + return ( + _column_from_gather_map(move(c_result.first)), + _column_from_gather_map(move(c_result.second)), + ) + + +cpdef Column mixed_left_semi_join( + Table left_keys, + Table right_keys, + Table left_conditional, + Table right_conditional, + Expression binary_predicate, + null_equality nulls_equal +): + """Perform a mixed left semi join between two tables. + + For details, see :cpp:func:`mixed_left_semi_join`. + + Parameters + ---------- + left_keys : Table + The left table to use for the equality join. + right_keys : Table + The right table to use for the equality join. + left_conditional : Table + The left table to use for the conditional join. + right_conditional : Table + The right table to use for the conditional join. + binary_predicate : Expression + Condition to join on. + nulls_equal : NullEquality + Should nulls compare equal in the equality join? + + Returns + ------- + Column + A column containing the row indices from the left table after the join. + """ + cdef cpp_join.gather_map_type c_result + with nogil: + c_result = cpp_join.mixed_left_semi_join( + left_keys.view(), + right_keys.view(), + left_conditional.view(), + right_conditional.view(), + dereference(binary_predicate.c_obj.get()), + nulls_equal, + ) + return _column_from_gather_map(move(c_result)) + + +cpdef Column mixed_left_anti_join( + Table left_keys, + Table right_keys, + Table left_conditional, + Table right_conditional, + Expression binary_predicate, + null_equality nulls_equal +): + """Perform a mixed left anti join between two tables. + + For details, see :cpp:func:`mixed_left_anti_join`. 
+ + Parameters + ---------- + left_keys : Table + The left table to use for the equality join. + right_keys : Table + The right table to use for the equality join. + left_conditional : Table + The left table to use for the conditional join. + right_conditional : Table + The right table to use for the conditional join. + binary_predicate : Expression + Condition to join on. + nulls_equal : NullEquality + Should nulls compare equal in the equality join? + + Returns + ------- + Column + A column containing the row indices from the left table after the join. + """ + cdef cpp_join.gather_map_type c_result + with nogil: + c_result = cpp_join.mixed_left_anti_join( + left_keys.view(), + right_keys.view(), + left_conditional.view(), + right_conditional.view(), + dereference(binary_predicate.c_obj.get()), + nulls_equal, + ) + return _column_from_gather_map(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/json.pxd b/python/pylibcudf/pylibcudf/json.pxd new file mode 100644 index 00000000000..87a87349b8a --- /dev/null +++ b/python/pylibcudf/pylibcudf/json.pxd @@ -0,0 +1,16 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column +from pylibcudf.libcudf.json cimport get_json_object_options +from pylibcudf.scalar cimport Scalar + + +cdef class GetJsonObjectOptions: + cdef get_json_object_options options + + +cpdef Column get_json_object( + Column col, + Scalar json_path, + GetJsonObjectOptions options=* +) diff --git a/python/pylibcudf/pylibcudf/json.pyi b/python/pylibcudf/pylibcudf/json.pyi new file mode 100644 index 00000000000..b93d4876dab --- /dev/null +++ b/python/pylibcudf/pylibcudf/json.pyi @@ -0,0 +1,23 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar + +class GetJsonObjectOptions: + def __init__( + self, + *, + allow_single_quotes: bool = False, + strip_quotes_from_single_strings: bool = True, + missing_fields_as_nulls: bool = False, + ) -> None: ... + def get_allow_single_quotes(self) -> bool: ... + def get_strip_quotes_from_single_strings(self) -> bool: ... + def get_missing_fields_as_nulls(self) -> bool: ... + def set_allow_single_quotes(self, val: bool) -> None: ... + def set_strip_quotes_from_single_strings(self, val: bool) -> None: ... + def set_missing_fields_as_nulls(self, val: bool) -> None: ... + +def get_json_object( + col: Column, json_path: Scalar, options: GetJsonObjectOptions | None = None +) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/json.pyx b/python/pylibcudf/pylibcudf/json.pyx new file mode 100644 index 00000000000..5ec1e1be971 --- /dev/null +++ b/python/pylibcudf/pylibcudf/json.pyx @@ -0,0 +1,155 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
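Reviewer note: the GetJsonObjectOptions stub above maps one-to-one onto the setter/getter pairs implemented below. A minimal usage sketch, assuming pylibcudf.interop.from_arrow to build the inputs:

# Sketch: extract one field per row with a JSONPath expression.
import pyarrow as pa
import pylibcudf as plc

col = plc.interop.from_arrow(pa.array(['{"a": 1}', '{}']))
path = plc.interop.from_arrow(pa.scalar("$.a"))
opts = plc.json.GetJsonObjectOptions(missing_fields_as_nulls=True)
out = plc.json.get_json_object(col, path, opts)
# out is a strings Column; the second row follows the missing-field policy.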
+ +from cython.operator cimport dereference +from libcpp cimport bool +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf cimport json as cpp_json +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.scalar.scalar cimport string_scalar +from pylibcudf.scalar cimport Scalar + +__all__ = ["GetJsonObjectOptions", "get_json_object"] + +cdef class GetJsonObjectOptions: + """Settings for ``get_json_object()``""" + def __init__( + self, + *, + allow_single_quotes=False, + strip_quotes_from_single_strings=True, + missing_fields_as_nulls=False + ): + self.set_allow_single_quotes(allow_single_quotes) + self.set_strip_quotes_from_single_strings( + strip_quotes_from_single_strings + ) + self.set_missing_fields_as_nulls(missing_fields_as_nulls) + + __hash__ = None + + def get_allow_single_quotes(self): + """ + Returns true/false depending on whether single-quotes for representing strings + are allowed. + + Returns + ------- + bool + true if single-quotes are allowed, false otherwise. + """ + return self.options.get_allow_single_quotes() + + def get_strip_quotes_from_single_strings(self): + """ + Returns true/false depending on whether individually returned string values have + their quotes stripped. + + Returns + ------- + bool + true if individually returned string values have their quotes stripped. + """ + return self.options.get_strip_quotes_from_single_strings() + + def get_missing_fields_as_nulls(self): + """ + Whether a field not contained by an object is to be interpreted as null. + + Returns + ------- + bool + true if missing fields are interpreted as null. + """ + return self.options.get_missing_fields_as_nulls() + + def set_allow_single_quotes(self, bool val): + """ + Set whether single-quotes for strings are allowed. + + Parameters + ---------- + val : bool + Whether to allow single quotes + + Returns + ------- + None + """ + self.options.set_allow_single_quotes(val) + + def set_strip_quotes_from_single_strings(self, bool val): + """ + Set whether individually returned string values have their quotes stripped. + + Parameters + ---------- + val : bool + Whether to strip quotes from single strings. + + Returns + ------- + None + """ + self.options.set_strip_quotes_from_single_strings(val) + + def set_missing_fields_as_nulls(self, bool val): + """ + Set whether missing fields are interpreted as null. + + Parameters + ---------- + val : bool + Whether to treat missing fields as nulls. + + Returns + ------- + None + """ + self.options.set_missing_fields_as_nulls(val) + + +cpdef Column get_json_object( + Column col, + Scalar json_path, + GetJsonObjectOptions options=None +): + """ + Apply a JSONPath string to all rows in an input strings column. + + For details, see :cpp:func:`cudf::get_json_object` + + Parameters + ---------- + col : Column + The input strings column. Each row must contain a valid json string. + + json_path : Scalar + The JSONPath string to be applied to each row. + + options : GetJsonObjectOptions + Options for controlling the behavior of the function. + + Returns + ------- + Column + New strings column containing the retrieved json object strings. 
+    """
+    cdef unique_ptr[column] c_result
+    cdef string_scalar* c_json_path = <string_scalar*>(
+        json_path.c_obj.get()
+    )
+    if options is None:
+        options = GetJsonObjectOptions()
+
+    cdef cpp_json.get_json_object_options c_options = options.options
+
+    with nogil:
+        c_result = cpp_json.get_json_object(
+            col.view(),
+            dereference(c_json_path),
+            c_options
+        )
+
+    return Column.from_libcudf(move(c_result))
diff --git a/python/pylibcudf/pylibcudf/labeling.pxd b/python/pylibcudf/pylibcudf/labeling.pxd
index 6f8797ae7d3..b1f9f2e806d 100644
--- a/python/pylibcudf/pylibcudf/labeling.pxd
+++ b/python/pylibcudf/pylibcudf/labeling.pxd
@@ -8,7 +8,7 @@ from .column cimport Column
 cpdef Column label_bins(
     Column input,
     Column left_edges,
-    bool left_inclusive,
+    inclusive left_inclusive,
     Column right_edges,
-    bool right_inclusive
+    inclusive right_inclusive
 )
diff --git a/python/pylibcudf/pylibcudf/labeling.pyi b/python/pylibcudf/pylibcudf/labeling.pyi
new file mode 100644
index 00000000000..c3a75d10baf
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/labeling.pyi
@@ -0,0 +1,17 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from enum import IntEnum
+
+from pylibcudf.column import Column
+
+class Inclusive(IntEnum):
+    YES = ...
+    NO = ...
+
+def label_bins(
+    input: Column,
+    left_edges: Column,
+    left_inclusive: Inclusive,
+    right_edges: Column,
+    right_inclusive: Inclusive,
+) -> Column: ...
diff --git a/python/pylibcudf/pylibcudf/labeling.pyx b/python/pylibcudf/pylibcudf/labeling.pyx
index b3f6a92d85c..cae1830f6b9 100644
--- a/python/pylibcudf/pylibcudf/labeling.pyx
+++ b/python/pylibcudf/pylibcudf/labeling.pyx
@@ -10,13 +10,14 @@ from pylibcudf.libcudf.labeling import inclusive as Inclusive  # no-cython-lint
 
 from .column cimport Column
 
+__all__ = ["Inclusive", "label_bins"]
 
 cpdef Column label_bins(
     Column input,
     Column left_edges,
-    bool left_inclusive,
+    inclusive left_inclusive,
     Column right_edges,
-    bool right_inclusive
+    inclusive right_inclusive
 ):
     """Labels elements based on membership in the specified bins.
 
@@ -28,11 +29,11 @@
         Column of input elements to label according to the specified bins.
     left_edges : Column
         Column of the left edge of each bin.
-    left_inclusive : bool
+    left_inclusive : Inclusive
         Whether or not the left edge is inclusive.
     right_edges : Column
         Column of the right edge of each bin.
-    right_inclusive : bool
+    right_inclusive : Inclusive
         Whether or not the right edge is inclusive.
 
     Returns
@@ -42,26 +43,13 @@
         according to the specified bins.
     """
     cdef unique_ptr[column] c_result
-    cdef inclusive c_left_inclusive = (
-        inclusive.YES
-        if left_inclusive
-        else inclusive.NO
-    )
-    cdef inclusive c_right_inclusive = (
-        inclusive.YES
-        if right_inclusive
-        else inclusive.NO
-    )
-
     with nogil:
-        c_result = move(
-            cpp_labeling.label_bins(
-                input.view(),
-                left_edges.view(),
-                c_left_inclusive,
-                right_edges.view(),
-                c_right_inclusive,
-            )
+        c_result = cpp_labeling.label_bins(
+            input.view(),
+            left_edges.view(),
+            left_inclusive,
+            right_edges.view(),
+            right_inclusive,
         )
 
     return Column.from_libcudf(move(c_result))
diff --git a/python/pylibcudf/pylibcudf/libcudf/CMakeLists.txt b/python/pylibcudf/pylibcudf/libcudf/CMakeLists.txt
index 2167616690f..00669ff579a 100644
--- a/python/pylibcudf/pylibcudf/libcudf/CMakeLists.txt
+++ b/python/pylibcudf/pylibcudf/libcudf/CMakeLists.txt
@@ -12,8 +12,9 @@
 # the License.
# ============================================================================= -set(cython_sources aggregation.pyx binaryop.pyx copying.pyx expressions.pyx labeling.pyx reduce.pyx - replace.pyx round.pyx stream_compaction.pyx types.pyx unary.pyx +set(cython_sources + aggregation.pyx binaryop.pyx copying.pyx datetime.pyx expressions.pyx labeling.pyx reduce.pyx + replace.pyx round.pyx stream_compaction.pyx types.pyx unary.pyx ) set(linked_libraries cudf::cudf) @@ -23,4 +24,5 @@ rapids_cython_create_modules( LINKED_LIBRARIES "${linked_libraries}" ASSOCIATED_TARGETS cudf MODULE_PREFIX cpp ) add_subdirectory(io) +add_subdirectory(lists) add_subdirectory(strings) diff --git a/python/pylibcudf/pylibcudf/libcudf/aggregation.pxd b/python/pylibcudf/pylibcudf/libcudf/aggregation.pxd index 58c579b86de..52d1e572ff3 100644 --- a/python/pylibcudf/pylibcudf/libcudf/aggregation.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/aggregation.pxd @@ -5,6 +5,7 @@ from libcpp cimport bool from libcpp.memory cimport unique_ptr from libcpp.string cimport string from libcpp.vector cimport vector +from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.types cimport ( data_type, interpolation, @@ -94,71 +95,78 @@ cdef extern from "cudf/aggregation.hpp" namespace "cudf" nogil: ZERO_NORMALIZED ONE_NORMALIZED - cdef unique_ptr[T] make_sum_aggregation[T]() except + + cdef unique_ptr[T] make_sum_aggregation[T]() except +libcudf_exception_handler - cdef unique_ptr[T] make_product_aggregation[T]() except + + cdef unique_ptr[T] make_product_aggregation[T]() except +libcudf_exception_handler - cdef unique_ptr[T] make_min_aggregation[T]() except + + cdef unique_ptr[T] make_min_aggregation[T]() except +libcudf_exception_handler - cdef unique_ptr[T] make_max_aggregation[T]() except + + cdef unique_ptr[T] make_max_aggregation[T]() except +libcudf_exception_handler - cdef unique_ptr[T] make_count_aggregation[T](null_policy) except + + cdef unique_ptr[T] make_count_aggregation[T]( + null_policy + ) except +libcudf_exception_handler - cdef unique_ptr[T] make_any_aggregation[T]() except + + cdef unique_ptr[T] make_any_aggregation[T]() except +libcudf_exception_handler - cdef unique_ptr[T] make_all_aggregation[T]() except + + cdef unique_ptr[T] make_all_aggregation[T]() except +libcudf_exception_handler - cdef unique_ptr[T] make_sum_of_squares_aggregation[T]() except + + cdef unique_ptr[T] make_sum_of_squares_aggregation[T]()\ + except +libcudf_exception_handler - cdef unique_ptr[T] make_mean_aggregation[T]() except + + cdef unique_ptr[T] make_mean_aggregation[T]() except +libcudf_exception_handler cdef unique_ptr[T] make_variance_aggregation[T]( - size_type ddof) except + + size_type ddof) except +libcudf_exception_handler - cdef unique_ptr[T] make_std_aggregation[T](size_type ddof) except + + cdef unique_ptr[T] make_std_aggregation[T]( + size_type ddof + ) except +libcudf_exception_handler - cdef unique_ptr[T] make_median_aggregation[T]() except + + cdef unique_ptr[T] make_median_aggregation[T]() except +libcudf_exception_handler cdef unique_ptr[T] make_quantile_aggregation[T]( - vector[double] q, interpolation i) except + + vector[double] q, interpolation i) except +libcudf_exception_handler - cdef unique_ptr[T] make_argmax_aggregation[T]() except + + cdef unique_ptr[T] make_argmax_aggregation[T]() except +libcudf_exception_handler - cdef unique_ptr[T] make_argmin_aggregation[T]() except + + cdef unique_ptr[T] make_argmin_aggregation[T]() except +libcudf_exception_handler - cdef unique_ptr[T] 
make_nunique_aggregation[T](null_policy null_handling) except + + cdef unique_ptr[T] make_nunique_aggregation[T]( + null_policy null_handling + ) except +libcudf_exception_handler cdef unique_ptr[T] make_nth_element_aggregation[T]( size_type n, null_policy null_handling - ) except + + ) except +libcudf_exception_handler cdef unique_ptr[T] make_collect_list_aggregation[T]( null_policy null_handling - ) except + + ) except +libcudf_exception_handler cdef unique_ptr[T] make_collect_set_aggregation[T]( null_policy null_handling, null_equality nulls_equal, nan_equality nans_equal - ) except + + ) except +libcudf_exception_handler cdef unique_ptr[T] make_udf_aggregation[T]( udf_type type, string user_defined_aggregator, - data_type output_type) except + + data_type output_type) except +libcudf_exception_handler cdef unique_ptr[T] make_ewma_aggregation[T]( double com, ewm_history adjust - ) except + + ) except +libcudf_exception_handler cdef unique_ptr[T] make_correlation_aggregation[T]( - correlation_type type, size_type min_periods) except + + correlation_type type, size_type min_periods) except +libcudf_exception_handler cdef unique_ptr[T] make_covariance_aggregation[T]( - size_type min_periods, size_type ddof) except + + size_type min_periods, size_type ddof) except +libcudf_exception_handler cdef unique_ptr[T] make_rank_aggregation[T]( rank_method method, order column_order, null_policy null_handling, null_order null_precedence, - rank_percentage percentage) except + + rank_percentage percentage) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/binaryop.pxd b/python/pylibcudf/pylibcudf/libcudf/binaryop.pxd index d39767b4aa8..607f7c2fa60 100644 --- a/python/pylibcudf/pylibcudf/libcudf/binaryop.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/binaryop.pxd @@ -1,5 +1,4 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. - from libc.stdint cimport int32_t from libcpp cimport bool from libcpp.memory cimport unique_ptr diff --git a/python/pylibcudf/pylibcudf/libcudf/column/column.pxd b/python/pylibcudf/pylibcudf/libcudf/column/column.pxd index 7a369701bbd..0f412ba4765 100644 --- a/python/pylibcudf/pylibcudf/libcudf/column/column.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/column/column.pxd @@ -1,15 +1,15 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. 
- from libcpp cimport bool from libcpp.memory cimport unique_ptr from libcpp.vector cimport vector +from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column_view cimport ( column_view, mutable_column_view, ) from pylibcudf.libcudf.types cimport data_type, size_type -from rmm._lib.device_buffer cimport device_buffer +from rmm.librmm.device_buffer cimport device_buffer cdef extern from "cudf/column/column.hpp" namespace "cudf" nogil: @@ -19,15 +19,15 @@ cdef extern from "cudf/column/column.hpp" namespace "cudf" nogil: vector[unique_ptr[column]] children cdef cppclass column: - column() except + - column(const column& other) except + + column() except +libcudf_exception_handler + column(const column& other) except +libcudf_exception_handler - column(column_view view) except + + column(column_view view) except +libcudf_exception_handler - size_type size() except + - size_type null_count() except + - bool has_nulls() except + - data_type type() except + - column_view view() except + - mutable_column_view mutable_view() except + - column_contents release() except + + size_type size() except +libcudf_exception_handler + size_type null_count() except +libcudf_exception_handler + bool has_nulls() except +libcudf_exception_handler + data_type type() except +libcudf_exception_handler + column_view view() except +libcudf_exception_handler + mutable_column_view mutable_view() except +libcudf_exception_handler + column_contents release() except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/column/column_factories.pxd b/python/pylibcudf/pylibcudf/libcudf/column/column_factories.pxd index f1a326bcd40..162822d2365 100644 --- a/python/pylibcudf/pylibcudf/libcudf/column/column_factories.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/column/column_factories.pxd @@ -1,6 +1,6 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. 
- from libcpp.memory cimport unique_ptr +from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.scalar.scalar cimport scalar from pylibcudf.libcudf.types cimport ( @@ -11,72 +11,84 @@ from pylibcudf.libcudf.types cimport ( type_id, ) -from rmm._lib.device_buffer cimport device_buffer +from rmm.librmm.device_buffer cimport device_buffer cdef extern from "cudf/column/column_factories.hpp" namespace "cudf" nogil: - cdef unique_ptr[column] make_numeric_column(data_type type, - size_type size, - mask_state state) except + + cdef unique_ptr[column] make_numeric_column( + data_type type, + size_type size, + mask_state state + ) except +libcudf_exception_handler - cdef unique_ptr[column] make_numeric_column(data_type type, - size_type size, - device_buffer mask, - size_type null_count) except + + cdef unique_ptr[column] make_numeric_column( + data_type type, + size_type size, + device_buffer mask, + size_type null_count + ) except +libcudf_exception_handler cdef unique_ptr[column] make_fixed_point_column( data_type type, size_type size, - mask_state state) except + + mask_state state) except +libcudf_exception_handler cdef unique_ptr[column] make_fixed_point_column( data_type type, size_type size, device_buffer mask, - size_type null_count) except + + size_type null_count) except +libcudf_exception_handler cdef unique_ptr[column] make_timestamp_column( data_type type, size_type size, - mask_state state) except + + mask_state state) except +libcudf_exception_handler cdef unique_ptr[column] make_timestamp_column( data_type type, size_type size, device_buffer mask, - size_type null_count) except + + size_type null_count) except +libcudf_exception_handler cdef unique_ptr[column] make_duration_column( data_type type, size_type size, - mask_state state) except + + mask_state state) except +libcudf_exception_handler cdef unique_ptr[column] make_duration_column( data_type type, size_type size, device_buffer mask, - size_type null_count) except + + size_type null_count) except +libcudf_exception_handler cdef unique_ptr[column] make_fixed_width_column( data_type type, size_type size, - mask_state state) except + + mask_state state) except +libcudf_exception_handler cdef unique_ptr[column] make_fixed_width_column( data_type type, size_type size, device_buffer mask, - size_type null_count) except + + size_type null_count) except +libcudf_exception_handler - cdef unique_ptr[column] make_column_from_scalar(const scalar& s, - size_type size) except + + cdef unique_ptr[column] make_column_from_scalar( + const scalar& s, + size_type size + ) except +libcudf_exception_handler - cdef unique_ptr[column] make_dictionary_from_scalar(const scalar& s, - size_type size) except + + cdef unique_ptr[column] make_dictionary_from_scalar( + const scalar& s, + size_type size + ) except +libcudf_exception_handler - cdef unique_ptr[column] make_empty_column(type_id id) except + - cdef unique_ptr[column] make_empty_column(data_type type_) except + + cdef unique_ptr[column] make_empty_column( + type_id id + ) except +libcudf_exception_handler + cdef unique_ptr[column] make_empty_column( + data_type type_ + ) except +libcudf_exception_handler cdef unique_ptr[column] make_dictionary_column( unique_ptr[column] keys_column, - unique_ptr[column] indices_column) except + + unique_ptr[column] indices_column) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/column/column_view.pxd 
b/python/pylibcudf/pylibcudf/libcudf/column/column_view.pxd index c0e971eb5bd..105bea7b8ef 100644 --- a/python/pylibcudf/pylibcudf/libcudf/column/column_view.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/column/column_view.pxd @@ -1,29 +1,29 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. - from libcpp cimport bool from libcpp.vector cimport vector +from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.types cimport bitmask_type, data_type, size_type cdef extern from "cudf/column/column_view.hpp" namespace "cudf" nogil: cdef cppclass column_view: - column_view() except + - column_view(const column_view& other) except + + column_view() except +libcudf_exception_handler + column_view(const column_view& other) except +libcudf_exception_handler - column_view& operator=(const column_view&) except + + column_view& operator=(const column_view&) except +libcudf_exception_handler column_view( data_type type, size_type size, const void* data - ) except + + ) except +libcudf_exception_handler column_view( data_type type, size_type size, const void* data, const bitmask_type* null_mask - ) except + + ) except +libcudf_exception_handler column_view( data_type type, @@ -31,7 +31,7 @@ cdef extern from "cudf/column/column_view.hpp" namespace "cudf" nogil: const void* data, const bitmask_type* null_mask, size_type null_count - ) except + + ) except +libcudf_exception_handler column_view( data_type type, @@ -40,7 +40,7 @@ cdef extern from "cudf/column/column_view.hpp" namespace "cudf" nogil: const bitmask_type* null_mask, size_type null_count, size_type offset - ) except + + ) except +libcudf_exception_handler column_view( data_type type, @@ -50,37 +50,41 @@ cdef extern from "cudf/column/column_view.hpp" namespace "cudf" nogil: size_type null_count, size_type offset, vector[column_view] children - ) except + - - const T* data[T]() except + - const T* head[T]() except + - const bitmask_type* null_mask() except + - size_type size() except + - data_type type() except + - bool nullable() except + - size_type null_count() except + - bool has_nulls() except + - size_type offset() except + - size_type num_children() except + - column_view child(size_type) except + + ) except +libcudf_exception_handler + + const T* data[T]() except +libcudf_exception_handler + const T* head[T]() except +libcudf_exception_handler + const bitmask_type* null_mask() except +libcudf_exception_handler + size_type size() except +libcudf_exception_handler + data_type type() except +libcudf_exception_handler + bool nullable() except +libcudf_exception_handler + size_type null_count() except +libcudf_exception_handler + bool has_nulls() except +libcudf_exception_handler + size_type offset() except +libcudf_exception_handler + size_type num_children() except +libcudf_exception_handler + column_view child(size_type) except +libcudf_exception_handler cdef cppclass mutable_column_view: - mutable_column_view() except + - mutable_column_view(const mutable_column_view&) except + - mutable_column_view& operator=(const mutable_column_view&) except + + mutable_column_view() except +libcudf_exception_handler + mutable_column_view( + const mutable_column_view& + ) except +libcudf_exception_handler + mutable_column_view& operator=( + const mutable_column_view& + ) except +libcudf_exception_handler mutable_column_view( data_type type, size_type size, const void* data - ) except + + ) except +libcudf_exception_handler mutable_column_view( data_type type, size_type size, const void* data, const bitmask_type* null_mask - ) 
except + + ) except +libcudf_exception_handler mutable_column_view( data_type type, @@ -88,7 +92,7 @@ cdef extern from "cudf/column/column_view.hpp" namespace "cudf" nogil: const void* data, const bitmask_type* null_mask, size_type null_count - ) except + + ) except +libcudf_exception_handler mutable_column_view( data_type type, @@ -97,22 +101,22 @@ cdef extern from "cudf/column/column_view.hpp" namespace "cudf" nogil: const bitmask_type* null_mask, size_type null_count, size_type offset - ) except + + ) except +libcudf_exception_handler mutable_column_view( data_type type, size_type size, const void* data, const bitmask_type* null_mask, size_type null_count, size_type offset, vector[mutable_column_view] children - ) except + - - T* data[T]() except + - T* head[T]() except + - bitmask_type* null_mask() except + - size_type size() except + - data_type type() except + - bool nullable() except + - size_type null_count() except + - bool has_nulls() except + - size_type offset() except + - size_type num_children() except + - mutable_column_view& child(size_type) except + + ) except +libcudf_exception_handler + + T* data[T]() except +libcudf_exception_handler + T* head[T]() except +libcudf_exception_handler + bitmask_type* null_mask() except +libcudf_exception_handler + size_type size() except +libcudf_exception_handler + data_type type() except +libcudf_exception_handler + bool nullable() except +libcudf_exception_handler + size_type null_count() except +libcudf_exception_handler + bool has_nulls() except +libcudf_exception_handler + size_type offset() except +libcudf_exception_handler + size_type num_children() except +libcudf_exception_handler + mutable_column_view& child(size_type) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/concatenate.pxd b/python/pylibcudf/pylibcudf/libcudf/concatenate.pxd index 92f5a185a54..0a827b21cda 100644 --- a/python/pylibcudf/pylibcudf/libcudf/concatenate.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/concatenate.pxd @@ -1,12 +1,12 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. - from libcpp.memory cimport unique_ptr from libcpp.vector cimport vector +from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column, column_view from pylibcudf.libcudf.table.table cimport table, table_view -from pylibcudf.libcudf.utilities.host_span cimport host_span +from pylibcudf.libcudf.utilities.span cimport host_span -from rmm._lib.device_buffer cimport device_buffer +from rmm.librmm.device_buffer cimport device_buffer cdef extern from "cudf/concatenate.hpp" namespace "cudf" nogil: @@ -15,7 +15,13 @@ cdef extern from "cudf/concatenate.hpp" namespace "cudf" nogil: # constructable from a vector. 
In case they are needed in the future, # host_span versions can be added, e.g: # - # cdef unique_ptr[column] concatenate(host_span[column_view] columns) except + + # cdef unique_ptr[column] concatenate( + # host_span[column_view] columns + # ) except +libcudf_exception_handler - cdef unique_ptr[column] concatenate(const vector[column_view] columns) except + - cdef unique_ptr[table] concatenate(const vector[table_view] tables) except + + cdef unique_ptr[column] concatenate( + const vector[column_view] columns + ) except +libcudf_exception_handler + cdef unique_ptr[table] concatenate( + const vector[table_view] tables + ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/contiguous_split.pxd b/python/pylibcudf/pylibcudf/libcudf/contiguous_split.pxd index cadac6a0022..9df828015eb 100644 --- a/python/pylibcudf/pylibcudf/libcudf/contiguous_split.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/contiguous_split.pxd @@ -1,12 +1,12 @@ # Copyright (c) 2023-2024, NVIDIA CORPORATION. - from libc.stdint cimport uint8_t from libcpp.memory cimport unique_ptr from libcpp.vector cimport vector +from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.table.table_view cimport table_view from pylibcudf.libcudf.types cimport size_type -from rmm._lib.device_buffer cimport device_buffer +from rmm.librmm.device_buffer cimport device_buffer cdef extern from "cudf/contiguous_split.hpp" namespace "cudf" nogil: @@ -21,8 +21,17 @@ cdef extern from "cudf/contiguous_split.hpp" namespace "cudf" nogil: cdef vector[contiguous_split_result] contiguous_split ( table_view input_table, vector[size_type] splits - ) except + + ) except +libcudf_exception_handler + + cdef packed_columns pack ( + const table_view& input + ) except +libcudf_exception_handler - cdef packed_columns pack (const table_view& input) except + + cdef table_view unpack ( + const packed_columns& input + ) except +libcudf_exception_handler - cdef table_view unpack (const packed_columns& input) except + + cdef table_view unpack ( + const uint8_t* metadata, + const uint8_t* gpu_data + ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/copying.pxd b/python/pylibcudf/pylibcudf/libcudf/copying.pxd index 4d4a4ba9b89..5a05284e86a 100644 --- a/python/pylibcudf/pylibcudf/libcudf/copying.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/copying.pxd @@ -1,5 +1,4 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. - from libc.stdint cimport int32_t, int64_t, uint8_t from libcpp cimport bool from libcpp.functional cimport reference_wrapper @@ -16,7 +15,7 @@ from pylibcudf.libcudf.table.table cimport table from pylibcudf.libcudf.table.table_view cimport table_view from pylibcudf.libcudf.types cimport size_type -from rmm._lib.device_buffer cimport device_buffer +from rmm.librmm.device_buffer cimport device_buffer ctypedef const scalar constscalar diff --git a/python/pylibcudf/pylibcudf/libcudf/datetime.pxd b/python/pylibcudf/pylibcudf/libcudf/datetime.pxd index a4465343197..049a1b06c2e 100644 --- a/python/pylibcudf/pylibcudf/libcudf/datetime.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/datetime.pxd @@ -1,56 +1,100 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. 
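The change repeated throughout these bindings is mechanical: every bare `except +` clause becomes `except +libcudf_exception_handler`, naming a custom handler that translates the active C++ exception into a more specific Python one. The following is a minimal, self-contained sketch of the Cython mechanism being relied on; `toy_exception_handler` and `might_throw` are invented for illustration and are not pylibcudf's actual handler.

# distutils: language = c++
cdef extern from *:
    """
    #include <Python.h>
    #include <stdexcept>

    /* Cython calls the named handler from inside its own catch block, so a
       bare `throw;` re-raises the active C++ exception for dispatch. */
    inline void toy_exception_handler()
    {
        try {
            throw;
        } catch (const std::logic_error& e) {
            PyErr_SetString(PyExc_ValueError, e.what());
        } catch (const std::exception& e) {
            PyErr_SetString(PyExc_RuntimeError, e.what());
        }
    }

    inline void might_throw(int v)
    {
        if (v < 0) throw std::logic_error("v must be non-negative");
    }
    """
    void toy_exception_handler()
    void might_throw(int v) except +toy_exception_handler

def check(int v):
    # A bare `except +` maps std::logic_error to RuntimeError by default;
    # the custom handler surfaces it as ValueError instead.
    might_throw(v)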
+from libc.stdint cimport int32_t, uint8_t from libcpp.memory cimport unique_ptr +from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.scalar.scalar cimport scalar cdef extern from "cudf/datetime.hpp" namespace "cudf::datetime" nogil: - cdef unique_ptr[column] extract_year(const column_view& column) except + - cdef unique_ptr[column] extract_month(const column_view& column) except + - cdef unique_ptr[column] extract_day(const column_view& column) except + - cdef unique_ptr[column] extract_weekday(const column_view& column) except + - cdef unique_ptr[column] extract_hour(const column_view& column) except + - cdef unique_ptr[column] extract_minute(const column_view& column) except + - cdef unique_ptr[column] extract_second(const column_view& column) except + + cpdef enum class datetime_component(uint8_t): + YEAR + MONTH + DAY + WEEKDAY + HOUR + MINUTE + SECOND + MILLISECOND + MICROSECOND + NANOSECOND + + cdef unique_ptr[column] extract_year( + const column_view& column + ) except +libcudf_exception_handler + cdef unique_ptr[column] extract_month( + const column_view& column + ) except +libcudf_exception_handler + cdef unique_ptr[column] extract_day( + const column_view& column + ) except +libcudf_exception_handler + cdef unique_ptr[column] extract_weekday( + const column_view& column + ) except +libcudf_exception_handler + cdef unique_ptr[column] extract_hour( + const column_view& column + ) except +libcudf_exception_handler + cdef unique_ptr[column] extract_minute( + const column_view& column + ) except +libcudf_exception_handler + cdef unique_ptr[column] extract_second( + const column_view& column + ) except +libcudf_exception_handler cdef unique_ptr[column] extract_millisecond_fraction( const column_view& column - ) except + + ) except +libcudf_exception_handler cdef unique_ptr[column] extract_microsecond_fraction( const column_view& column - ) except + + ) except +libcudf_exception_handler cdef unique_ptr[column] extract_nanosecond_fraction( const column_view& column - ) except + + ) except +libcudf_exception_handler + cdef unique_ptr[column] extract_datetime_component( + const column_view& column, + datetime_component component + ) except +libcudf_exception_handler - ctypedef enum rounding_frequency "cudf::datetime::rounding_frequency": - DAY "cudf::datetime::rounding_frequency::DAY" - HOUR "cudf::datetime::rounding_frequency::HOUR" - MINUTE "cudf::datetime::rounding_frequency::MINUTE" - SECOND "cudf::datetime::rounding_frequency::SECOND" - MILLISECOND "cudf::datetime::rounding_frequency::MILLISECOND" - MICROSECOND "cudf::datetime::rounding_frequency::MICROSECOND" - NANOSECOND "cudf::datetime::rounding_frequency::NANOSECOND" + cpdef enum class rounding_frequency(int32_t): + DAY + HOUR + MINUTE + SECOND + MILLISECOND + MICROSECOND + NANOSECOND cdef unique_ptr[column] ceil_datetimes( const column_view& column, rounding_frequency freq - ) except + + ) except +libcudf_exception_handler cdef unique_ptr[column] floor_datetimes( const column_view& column, rounding_frequency freq - ) except + + ) except +libcudf_exception_handler cdef unique_ptr[column] round_datetimes( const column_view& column, rounding_frequency freq - ) except + + ) except +libcudf_exception_handler cdef unique_ptr[column] add_calendrical_months( const column_view& timestamps, const column_view& months - ) except + - cdef unique_ptr[column] day_of_year(const column_view& 
column) except + - cdef unique_ptr[column] is_leap_year(const column_view& column) except + + ) except +libcudf_exception_handler + cdef unique_ptr[column] add_calendrical_months( + const column_view& timestamps, + const scalar& months + ) except +libcudf_exception_handler + cdef unique_ptr[column] day_of_year( + const column_view& column + ) except +libcudf_exception_handler + cdef unique_ptr[column] is_leap_year( + const column_view& column + ) except +libcudf_exception_handler cdef unique_ptr[column] last_day_of_month( const column_view& column - ) except + - cdef unique_ptr[column] extract_quarter(const column_view& column) except + - cdef unique_ptr[column] days_in_month(const column_view& column) except + + ) except +libcudf_exception_handler + cdef unique_ptr[column] extract_quarter( + const column_view& column + ) except +libcudf_exception_handler + cdef unique_ptr[column] days_in_month( + const column_view& column + ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/datetime.pyx b/python/pylibcudf/pylibcudf/libcudf/datetime.pyx new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/pylibcudf/pylibcudf/libcudf/experimental.pxd b/python/pylibcudf/pylibcudf/libcudf/experimental.pxd index f280a382a04..764815fba36 100644 --- a/python/pylibcudf/pylibcudf/libcudf/experimental.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/experimental.pxd @@ -1,7 +1,7 @@ # Copyright (c) 2022-2024, NVIDIA CORPORATION. - from libcpp cimport bool from libcpp.string cimport string +from pylibcudf.exception_handler cimport libcudf_exception_handler cdef extern from "cudf/utilities/prefetch.hpp" \ diff --git a/python/pylibcudf/pylibcudf/libcudf/expressions.pxd b/python/pylibcudf/pylibcudf/libcudf/expressions.pxd index 5ba2dff6074..0e42d2bd02c 100644 --- a/python/pylibcudf/pylibcudf/libcudf/expressions.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/expressions.pxd @@ -1,8 +1,8 @@ # Copyright (c) 2022-2024, NVIDIA CORPORATION. - from libc.stdint cimport int32_t from libcpp.memory cimport unique_ptr from libcpp.string cimport string +from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.scalar.scalar cimport ( duration_scalar, @@ -75,15 +75,15 @@ cdef extern from "cudf/ast/expressions.hpp" namespace "cudf::ast" nogil: cdef cppclass literal(expression): # Due to https://github.com/cython/cython/issues/3198, we need to # specify a return type for templated constructors. - literal literal[T](numeric_scalar[T] &) except + - literal literal[T](timestamp_scalar[T] &) except + - literal literal[T](duration_scalar[T] &) except + + literal literal[T](numeric_scalar[T] &) except +libcudf_exception_handler + literal literal[T](timestamp_scalar[T] &) except +libcudf_exception_handler + literal literal[T](duration_scalar[T] &) except +libcudf_exception_handler cdef cppclass column_reference(expression): # Allow for default C++ parameters by declaring multiple constructors # with the default parameters optionally omitted. 
- column_reference(size_type) except + - column_reference(size_type, table_reference) except + + column_reference(size_type) except +libcudf_exception_handler + column_reference(size_type, table_reference) except +libcudf_exception_handler cdef cppclass operation(expression): operation(ast_operator, const expression &) @@ -92,4 +92,4 @@ cdef extern from "cudf/ast/expressions.hpp" namespace "cudf::ast" nogil: cdef cppclass column_name_reference(expression): # column_name_reference is only meant for use in file I/O such as the # Parquet reader. - column_name_reference(string) except + + column_name_reference(string) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/filling.pxd b/python/pylibcudf/pylibcudf/libcudf/filling.pxd index 7bed80050d2..f0bfe8ca80b 100644 --- a/python/pylibcudf/pylibcudf/libcudf/filling.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/filling.pxd @@ -1,7 +1,7 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. - from libcpp cimport bool from libcpp.memory cimport unique_ptr +from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport ( column_view, @@ -19,33 +19,33 @@ cdef extern from "cudf/filling.hpp" namespace "cudf" nogil: size_type begin, size_type end, const scalar & value - ) except + + ) except +libcudf_exception_handler cdef void fill_in_place( const mutable_column_view & destination, size_type beign, size_type end, const scalar & value - ) except + + ) except +libcudf_exception_handler cdef unique_ptr[table] repeat( const table_view & input, const column_view & count, - ) except + + ) except +libcudf_exception_handler cdef unique_ptr[table] repeat( const table_view & input, size_type count - ) except + + ) except +libcudf_exception_handler cdef unique_ptr[column] sequence( size_type size, const scalar & init, const scalar & step - ) except + + ) except +libcudf_exception_handler cdef unique_ptr[column] calendrical_month_sequence( size_type n, const scalar& init, size_type months, - ) except + + ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/fixed_point/__init__.pxd b/python/pylibcudf/pylibcudf/libcudf/fixed_point/__init__.pxd new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/pylibcudf/pylibcudf/libcudf/fixed_point/fixed_point.pxd b/python/pylibcudf/pylibcudf/libcudf/fixed_point/fixed_point.pxd new file mode 100644 index 00000000000..a4461f34ab2 --- /dev/null +++ b/python/pylibcudf/pylibcudf/libcudf/fixed_point/fixed_point.pxd @@ -0,0 +1,8 @@ +# Copyright (c) 2021-2024, NVIDIA CORPORATION. +from libc.stdint cimport int32_t +from pylibcudf.exception_handler cimport libcudf_exception_handler + + +cdef extern from "cudf/fixed_point/fixed_point.hpp" namespace "numeric" nogil: + cdef cppclass scale_type: + scale_type(int32_t) diff --git a/python/pylibcudf/pylibcudf/libcudf/groupby.pxd b/python/pylibcudf/pylibcudf/libcudf/groupby.pxd index 848462131fe..cbbc174d7bf 100644 --- a/python/pylibcudf/pylibcudf/libcudf/groupby.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/groupby.pxd @@ -1,10 +1,10 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. 
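Besides the exception-handler change, the datetime.pxd hunk above replaces a `ctypedef enum` that spelled out each enumerator's qualified C++ name as a verbatim string with `cpdef enum class`, which binds the scoped C++ enum directly and also generates a Python-visible enum. A toy sketch of the pattern, with an invented `color` enum:

# distutils: language = c++
from libc.stdint cimport int32_t

cdef extern from *:
    """
    #include <cstdint>
    enum class color : std::int32_t { RED, GREEN, BLUE };
    """
    cpdef enum class color(int32_t):
        RED
        GREEN
        BLUE

def describe(color c):
    # cpdef also generates a Python enum, so callers can pass color.BLUE;
    # the cast recovers the underlying int32_t value (BLUE -> 2).
    return <int32_t>c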
- from libcpp cimport bool from libcpp.functional cimport reference_wrapper from libcpp.memory cimport unique_ptr from libcpp.pair cimport pair from libcpp.vector cimport vector +from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.aggregation cimport ( groupby_aggregation, groupby_scan_aggregation, @@ -22,7 +22,6 @@ from pylibcudf.libcudf.types cimport ( size_type, sorted, ) -from pylibcudf.libcudf.utilities.host_span cimport host_span # workaround for https://github.com/cython/cython/issues/3885 ctypedef const scalar constscalar @@ -32,12 +31,12 @@ cdef extern from "cudf/groupby.hpp" \ namespace "cudf::groupby" nogil: cdef cppclass aggregation_request: - aggregation_request() except + + aggregation_request() except +libcudf_exception_handler column_view values vector[unique_ptr[groupby_aggregation]] aggregations cdef cppclass scan_request: - scan_request() except + + scan_request() except +libcudf_exception_handler column_view values vector[unique_ptr[groupby_scan_aggregation]] aggregations @@ -51,24 +50,24 @@ cdef extern from "cudf/groupby.hpp" \ unique_ptr[table] values cdef cppclass groupby: - groupby(const table_view& keys) except + + groupby(const table_view& keys) except +libcudf_exception_handler groupby( const table_view& keys, null_policy include_null_keys - ) except + + ) except +libcudf_exception_handler groupby( const table_view& keys, null_policy include_null_keys, sorted keys_are_sorted, - ) except + + ) except +libcudf_exception_handler groupby( const table_view& keys, null_policy include_null_keys, sorted keys_are_sorted, const vector[order]& column_order, - ) except + + ) except +libcudf_exception_handler groupby( const table_view& keys, @@ -76,21 +75,21 @@ cdef extern from "cudf/groupby.hpp" \ sorted keys_are_sorted, const vector[order]& column_order, const vector[null_order]& null_precedence - ) except + + ) except +libcudf_exception_handler pair[ unique_ptr[table], vector[aggregation_result] ] aggregate( const vector[aggregation_request]& requests, - ) except + + ) except +libcudf_exception_handler pair[ unique_ptr[table], vector[aggregation_result] ] scan( const vector[scan_request]& requests, - ) except + + ) except +libcudf_exception_handler pair[ unique_ptr[table], @@ -99,12 +98,12 @@ cdef extern from "cudf/groupby.hpp" \ const table_view values, const vector[size_type] offset, const vector[reference_wrapper[constscalar]] fill_values - ) except + + ) except +libcudf_exception_handler - groups get_groups() except + - groups get_groups(table_view values) except + + groups get_groups() except +libcudf_exception_handler + groups get_groups(table_view values) except +libcudf_exception_handler pair[unique_ptr[table], unique_ptr[table]] replace_nulls( const table_view& values, const vector[replace_policy] replace_policy - ) except + + ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/hash.pxd b/python/pylibcudf/pylibcudf/libcudf/hash.pxd index 51678ba69d8..4e8a01b41a5 100644 --- a/python/pylibcudf/pylibcudf/libcudf/hash.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/hash.pxd @@ -1,8 +1,8 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. 
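groupby.pxd keeps the `ctypedef const scalar constscalar` workaround for cython/cython#3885: a `const`-qualified type cannot be written directly as a template argument (here, inside `reference_wrapper[...]`), so a named const typedef stands in for it. A toy sketch of the same workaround with an invented `thing` type:

# distutils: language = c++
from libcpp.functional cimport reference_wrapper
from libcpp.vector cimport vector

cdef extern from *:
    """
    struct thing { int v; };
    """
    cdef cppclass thing:
        int v

# cython/cython#3885: `reference_wrapper[const thing]` does not parse, so a
# const typedef is declared and used as the template argument instead.
ctypedef const thing constthing

cdef void takes_const_refs(vector[reference_wrapper[constthing]] refs):
    pass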
- from libc.stdint cimport uint32_t, uint64_t from libcpp.memory cimport unique_ptr from libcpp.vector cimport vector +from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.table.table cimport table from pylibcudf.libcudf.table.table_view cimport table_view @@ -10,36 +10,44 @@ from pylibcudf.libcudf.table.table_view cimport table_view cdef extern from "cudf/hashing.hpp" namespace "cudf::hashing" nogil: - cdef unique_ptr[column] murmurhash3_x86_32 "cudf::hashing::murmurhash3_x86_32" ( + cdef unique_ptr[column] murmurhash3_x86_32( const table_view& input, const uint32_t seed - ) except + + ) except +libcudf_exception_handler - cdef unique_ptr[column] md5 "cudf::hashing::md5" ( + cdef unique_ptr[table] murmurhash3_x64_128( + const table_view& input, + const uint64_t seed + ) except +libcudf_exception_handler + + cdef unique_ptr[column] md5( const table_view& input - ) except + + ) except +libcudf_exception_handler - cdef unique_ptr[column] sha1 "cudf::hashing::sha1" ( + cdef unique_ptr[column] sha1( const table_view& input - ) except + + ) except +libcudf_exception_handler - cdef unique_ptr[column] sha224 "cudf::hashing::sha224" ( + cdef unique_ptr[column] sha224( const table_view& input - ) except + + ) except +libcudf_exception_handler - cdef unique_ptr[column] sha256 "cudf::hashing::sha256" ( + cdef unique_ptr[column] sha256( const table_view& input - ) except + + ) except +libcudf_exception_handler - cdef unique_ptr[column] sha384 "cudf::hashing::sha384" ( + cdef unique_ptr[column] sha384( const table_view& input - ) except + + ) except +libcudf_exception_handler - cdef unique_ptr[column] sha512 "cudf::hashing::sha512" ( + cdef unique_ptr[column] sha512( const table_view& input - ) except + + ) except +libcudf_exception_handler - cdef unique_ptr[column] xxhash_64 "cudf::hashing::xxhash_64" ( + cdef unique_ptr[column] xxhash_64( const table_view& input, const uint64_t seed - ) except + + ) except +libcudf_exception_handler + +cdef extern from "cudf/hashing.hpp" namespace "cudf" nogil: + cdef uint32_t DEFAULT_HASH_SEED diff --git a/python/pylibcudf/pylibcudf/libcudf/hash.pyx b/python/pylibcudf/pylibcudf/libcudf/hash.pyx new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/pylibcudf/pylibcudf/libcudf/interop.pxd b/python/pylibcudf/pylibcudf/libcudf/interop.pxd index 30b97fdec34..8953357a087 100644 --- a/python/pylibcudf/pylibcudf/libcudf/interop.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/interop.pxd @@ -1,8 +1,8 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. - from libcpp.memory cimport shared_ptr, unique_ptr from libcpp.string cimport string from libcpp.vector cimport vector +from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.scalar.scalar cimport scalar @@ -12,19 +12,19 @@ from pylibcudf.libcudf.table.table_view cimport table_view cdef extern from "dlpack/dlpack.h" nogil: ctypedef struct DLManagedTensor: - void(*deleter)(DLManagedTensor*) except + + void(*deleter)(DLManagedTensor*) except +libcudf_exception_handler # The Arrow structs are not namespaced. 
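The hash.pxd hunk above drops the verbatim C-name strings such as `murmurhash3_x86_32 "cudf::hashing::murmurhash3_x86_32"`: they are redundant once the enclosing extern block carries `namespace "cudf::hashing"`, which already qualifies every declared name. A toy illustration of the two equivalent spellings, using an invented `demo::answer`:

# distutils: language = c++
cdef extern from *:
    """
    namespace demo { inline int answer() { return 42; } }
    """
    # Verbatim-name spelling: the qualified C++ name is repeated as a string.
    int answer_verbatim "demo::answer" () except +

cdef extern from * namespace "demo":
    # Namespaced block: the short name resolves to demo::answer on its own,
    # which is why hash.pxd can drop its verbatim strings.
    int answer() except +

def check():
    assert answer_verbatim() == answer() == 42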
cdef extern from "cudf/interop.hpp" nogil: cdef struct ArrowSchema: - void (*release)(ArrowSchema*) noexcept nogil + void (*release)(ArrowSchema*) noexcept cdef struct ArrowArray: - void (*release)(ArrowArray*) noexcept nogil + void (*release)(ArrowArray*) noexcept cdef struct ArrowArrayStream: - void (*release)(ArrowArrayStream*) noexcept nogil + void (*release)(ArrowArrayStream*) noexcept cdef struct ArrowDeviceArray: ArrowArray array @@ -32,23 +32,27 @@ cdef extern from "cudf/interop.hpp" nogil: cdef extern from "cudf/interop.hpp" namespace "cudf" \ nogil: - cdef unique_ptr[table] from_dlpack(const DLManagedTensor* tensor - ) except + + cdef unique_ptr[table] from_dlpack( + const DLManagedTensor* managed_tensor + ) except +libcudf_exception_handler - DLManagedTensor* to_dlpack(table_view input_table - ) except + + DLManagedTensor* to_dlpack( + const table_view& input + ) except +libcudf_exception_handler cdef cppclass column_metadata: - column_metadata() except + - column_metadata(string name_) except + + column_metadata() except +libcudf_exception_handler + column_metadata(string name_) except +libcudf_exception_handler string name vector[column_metadata] children_meta - cdef unique_ptr[table] from_arrow_stream(ArrowArrayStream* input) except + + cdef unique_ptr[table] from_arrow_stream( + ArrowArrayStream* input + ) except +libcudf_exception_handler cdef unique_ptr[column] from_arrow_column( const ArrowSchema* schema, const ArrowArray* input - ) except + + ) except +libcudf_exception_handler cdef extern from *: @@ -82,5 +86,7 @@ cdef extern from *: cdef ArrowSchema *to_arrow_schema_raw( const table_view& tbl, const vector[column_metadata]& metadata, - ) except + nogil - cdef ArrowArray* to_arrow_host_raw(const table_view& tbl) except + nogil + ) except +libcudf_exception_handler nogil + cdef ArrowArray* to_arrow_host_raw( + const table_view& tbl + ) except +libcudf_exception_handler nogil diff --git a/python/pylibcudf/pylibcudf/libcudf/io/avro.pxd b/python/pylibcudf/pylibcudf/libcudf/io/avro.pxd index 2d76e2f6c80..cac55640ac9 100644 --- a/python/pylibcudf/pylibcudf/libcudf/io/avro.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/io/avro.pxd @@ -1,8 +1,8 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. 
- cimport pylibcudf.libcudf.io.types as cudf_io_types from libcpp.string cimport string from libcpp.vector cimport vector +from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.types cimport size_type @@ -10,34 +10,40 @@ cdef extern from "cudf/io/avro.hpp" \ namespace "cudf::io" nogil: cdef cppclass avro_reader_options: - avro_reader_options() except + - cudf_io_types.source_info get_source() except + - vector[string] get_columns() except + - size_type get_skip_rows() except + - size_type get_num_rows() except + + avro_reader_options() except +libcudf_exception_handler + cudf_io_types.source_info get_source() except +libcudf_exception_handler + vector[string] get_columns() except +libcudf_exception_handler + size_type get_skip_rows() except +libcudf_exception_handler + size_type get_num_rows() except +libcudf_exception_handler # setters - void set_columns(vector[string] col_names) except + - void set_skip_rows(size_type val) except + - void set_num_rows(size_type val) except + + void set_columns(vector[string] col_names) except +libcudf_exception_handler + void set_skip_rows(size_type val) except +libcudf_exception_handler + void set_num_rows(size_type val) except +libcudf_exception_handler @staticmethod avro_reader_options_builder builder( cudf_io_types.source_info src - ) except + + ) except +libcudf_exception_handler cdef cppclass avro_reader_options_builder: - avro_reader_options_builder() except + + avro_reader_options_builder() except +libcudf_exception_handler avro_reader_options_builder( cudf_io_types.source_info src - ) except + - avro_reader_options_builder& columns(vector[string] col_names) except + - avro_reader_options_builder& skip_rows(size_type val) except + - avro_reader_options_builder& num_rows(size_type val) except + - - avro_reader_options build() except + + ) except +libcudf_exception_handler + avro_reader_options_builder& columns( + vector[string] col_names + ) except +libcudf_exception_handler + avro_reader_options_builder& skip_rows( + size_type val + ) except +libcudf_exception_handler + avro_reader_options_builder& num_rows( + size_type val + ) except +libcudf_exception_handler + + avro_reader_options build() except +libcudf_exception_handler cdef cudf_io_types.table_with_metadata read_avro( avro_reader_options &options - ) except + + ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/io/csv.pxd b/python/pylibcudf/pylibcudf/libcudf/io/csv.pxd index 73a6d98650c..7ca158016a2 100644 --- a/python/pylibcudf/pylibcudf/libcudf/io/csv.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/io/csv.pxd @@ -1,5 +1,4 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. 
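avro.pxd, like the csv, json, and orc bindings that follow, wraps libcudf's fluent options-builder pattern: `builder()` is a static method, each setter returns the builder by reference so the calls chain, and `build()` yields the options object consumed by the reader. A hypothetical .pyx usage sketch, assuming the cimport paths shown in this diff:

from pylibcudf.libcudf.io.avro cimport (
    avro_reader_options,
    avro_reader_options_builder,
    read_avro,
)
cimport pylibcudf.libcudf.io.types as cudf_io_types

cdef void read_head(cudf_io_types.source_info src, int n):
    # Each setter returns avro_reader_options_builder&, so the chain reads
    # top to bottom and ends with build().
    cdef avro_reader_options opts = (
        avro_reader_options.builder(src)
        .skip_rows(0)
        .num_rows(n)
        .build()
    )
    cdef cudf_io_types.table_with_metadata result = read_avro(opts)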
- cimport pylibcudf.libcudf.io.types as cudf_io_types cimport pylibcudf.libcudf.table.table_view as cudf_table_view from libc.stdint cimport uint8_t @@ -8,6 +7,7 @@ from libcpp.map cimport map from libcpp.memory cimport shared_ptr, unique_ptr from libcpp.string cimport string from libcpp.vector cimport vector +from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.types cimport data_type, size_type @@ -15,227 +15,319 @@ cdef extern from "cudf/io/csv.hpp" \ namespace "cudf::io" nogil: cdef cppclass csv_reader_options: - csv_reader_options() except + + csv_reader_options() except +libcudf_exception_handler # Getter - cudf_io_types.source_info get_source() except + + cudf_io_types.source_info get_source() except +libcudf_exception_handler # Reader settings - cudf_io_types.compression_type get_compression() except + - size_t get_byte_range_offset() except + - size_t get_byte_range_size() except + - vector[string] get_names() except + - string get_prefix() except + - bool is_enabled_mangle_dupe_cols() except + + cudf_io_types.compression_type get_compression()\ + except +libcudf_exception_handler + size_t get_byte_range_offset() except +libcudf_exception_handler + size_t get_byte_range_size() except +libcudf_exception_handler + vector[string] get_names() except +libcudf_exception_handler + string get_prefix() except +libcudf_exception_handler + bool is_enabled_mangle_dupe_cols() except +libcudf_exception_handler # Filter settings - vector[string] get_use_cols_names() except + - vector[int] get_use_cols_indexes() except + - size_type get_nrows() except + - size_type get_skiprows() except + - size_type get_skipfooter() except + - size_type get_header() except + + vector[string] get_use_cols_names() except +libcudf_exception_handler + vector[int] get_use_cols_indexes() except +libcudf_exception_handler + size_type get_nrows() except +libcudf_exception_handler + size_type get_skiprows() except +libcudf_exception_handler + size_type get_skipfooter() except +libcudf_exception_handler + size_type get_header() except +libcudf_exception_handler # Parsing settings - char get_lineterminator() except + - char get_delimiter() except + - char get_thousands() except + - char get_decimal() except + - char get_comment() except + - bool is_enabled_windowslinetermination() except + - bool is_enabled_delim_whitespace() except + - bool is_enabled_skipinitialspace() except + - bool is_enabled_skip_blank_lines() except + - cudf_io_types.quote_style get_quoting() except + - char get_quotechar() except + - bool is_enabled_doublequote() except + - bool is_enabled_updated_quotes_detection() except + - vector[string] get_parse_dates_names() except + - vector[int] get_parse_dates_indexes() except + - vector[string] get_parse_hex_names() except + - vector[int] get_parse_hex_indexes() except + + char get_lineterminator() except +libcudf_exception_handler + char get_delimiter() except +libcudf_exception_handler + char get_thousands() except +libcudf_exception_handler + char get_decimal() except +libcudf_exception_handler + char get_comment() except +libcudf_exception_handler + bool is_enabled_windowslinetermination() except +libcudf_exception_handler + bool is_enabled_delim_whitespace() except +libcudf_exception_handler + bool is_enabled_skipinitialspace() except +libcudf_exception_handler + bool is_enabled_skip_blank_lines() except +libcudf_exception_handler + cudf_io_types.quote_style get_quoting() except +libcudf_exception_handler + char get_quotechar() except +libcudf_exception_handler + 
bool is_enabled_doublequote() except +libcudf_exception_handler + bool is_enabled_updated_quotes_detection() except +libcudf_exception_handler + vector[string] get_parse_dates_names() except +libcudf_exception_handler + vector[int] get_parse_dates_indexes() except +libcudf_exception_handler + vector[string] get_parse_hex_names() except +libcudf_exception_handler + vector[int] get_parse_hex_indexes() except +libcudf_exception_handler # Conversion settings - vector[string] get_dtype() except + - vector[string] get_true_values() except + - vector[string] get_false_values() except + - vector[string] get_na_values() except + - bool is_enabled_keep_default_na() except + - bool is_enabled_na_filter() except + - bool is_enabled_dayfirst() except + + vector[string] get_dtype() except +libcudf_exception_handler + vector[string] get_true_values() except +libcudf_exception_handler + vector[string] get_false_values() except +libcudf_exception_handler + vector[string] get_na_values() except +libcudf_exception_handler + bool is_enabled_keep_default_na() except +libcudf_exception_handler + bool is_enabled_na_filter() except +libcudf_exception_handler + bool is_enabled_dayfirst() except +libcudf_exception_handler # setter # Reader settings - void set_compression(cudf_io_types.compression_type comp) except + - void set_byte_range_offset(size_t val) except + - void set_byte_range_size(size_t val) except + - void set_names(vector[string] val) except + - void set_prefix(string pfx) except + - void set_mangle_dupe_cols(bool val) except + + void set_compression( + cudf_io_types.compression_type comp + ) except +libcudf_exception_handler + void set_byte_range_offset(size_t val) except +libcudf_exception_handler + void set_byte_range_size(size_t val) except +libcudf_exception_handler + void set_names(vector[string] val) except +libcudf_exception_handler + void set_prefix(string pfx) except +libcudf_exception_handler + void set_mangle_dupe_cols(bool val) except +libcudf_exception_handler # Filter settings - void set_use_cols_names(vector[string] col_names) except + - void set_use_cols_indexes(vector[int] col_ind) except + - void set_nrows(size_type n_rows) except + - void set_skiprows(size_type val) except + - void set_skipfooter(size_type val) except + - void set_header(size_type hdr) except + + void set_use_cols_names( + vector[string] col_names + ) except +libcudf_exception_handler + void set_use_cols_indexes( + vector[int] col_ind + ) except +libcudf_exception_handler + void set_nrows(size_type n_rows) except +libcudf_exception_handler + void set_skiprows(size_type val) except +libcudf_exception_handler + void set_skipfooter(size_type val) except +libcudf_exception_handler + void set_header(size_type hdr) except +libcudf_exception_handler # Parsing settings - void set_lineterminator(char val) except + - void set_delimiter(char val) except + - void set_thousands(char val) except + - void set_decimal(char val) except + - void set_comment(char val) except + - void enable_windowslinetermination(bool val) except + - void enable_delim_whitespace(bool val) except + - void enable_skipinitialspace(bool val) except + - void enable_skip_blank_lines(bool val) except + - void set_quoting(cudf_io_types.quote_style style) except + - void set_quotechar(char val) except + - void set_doublequote(bool val) except + - void set_detect_whitespace_around_quotes(bool val) except + - void set_parse_dates(vector[string]) except + - void set_parse_dates(vector[int]) except + - void set_parse_hex(vector[string]) except + - void 
set_parse_hex(vector[int]) except + + void set_lineterminator(char val) except +libcudf_exception_handler + void set_delimiter(char val) except +libcudf_exception_handler + void set_thousands(char val) except +libcudf_exception_handler + void set_decimal(char val) except +libcudf_exception_handler + void set_comment(char val) except +libcudf_exception_handler + void enable_windowslinetermination(bool val) except +libcudf_exception_handler + void enable_delim_whitespace(bool val) except +libcudf_exception_handler + void enable_skipinitialspace(bool val) except +libcudf_exception_handler + void enable_skip_blank_lines(bool val) except +libcudf_exception_handler + void set_quoting( + cudf_io_types.quote_style style + ) except +libcudf_exception_handler + void set_quotechar(char val) except +libcudf_exception_handler + void set_doublequote(bool val) except +libcudf_exception_handler + void set_detect_whitespace_around_quotes( + bool val + ) except +libcudf_exception_handler + void set_parse_dates(vector[string]) except +libcudf_exception_handler + void set_parse_dates(vector[int]) except +libcudf_exception_handler + void set_parse_hex(vector[string]) except +libcudf_exception_handler + void set_parse_hex(vector[int]) except +libcudf_exception_handler # Conversion settings - void set_dtypes(vector[data_type] types) except + - void set_dtypes(map[string, data_type] types) except + - void set_true_values(vector[string] vals) except + - void set_false_values(vector[string] vals) except + - void set_na_values(vector[string] vals) except + - void enable_keep_default_na(bool val) except + - void enable_na_filter(bool val) except + - void enable_dayfirst(bool val) except + - void set_timestamp_type(data_type type) except + + void set_dtypes(vector[data_type] types) except +libcudf_exception_handler + void set_dtypes(map[string, data_type] types) except +libcudf_exception_handler + void set_true_values(vector[string] vals) except +libcudf_exception_handler + void set_false_values(vector[string] vals) except +libcudf_exception_handler + void set_na_values(vector[string] vals) except +libcudf_exception_handler + void enable_keep_default_na(bool val) except +libcudf_exception_handler + void enable_na_filter(bool val) except +libcudf_exception_handler + void enable_dayfirst(bool val) except +libcudf_exception_handler + void set_timestamp_type(data_type type) except +libcudf_exception_handler @staticmethod csv_reader_options_builder builder( cudf_io_types.source_info src - ) except + + ) except +libcudf_exception_handler cdef cppclass csv_reader_options_builder: - csv_reader_options_builder() except + + csv_reader_options_builder() except +libcudf_exception_handler csv_reader_options_builder( cudf_io_types.source_info src - ) except + + ) except +libcudf_exception_handler csv_reader_options_builder& source( cudf_io_types.source_info info - ) except + + ) except +libcudf_exception_handler # Reader settings csv_reader_options_builder& compression( cudf_io_types.compression_type comp - ) except + - csv_reader_options_builder& byte_range_offset(size_t val) except + - csv_reader_options_builder& byte_range_size(size_t val) except + - csv_reader_options_builder& names(vector[string] val) except + - csv_reader_options_builder& prefix(string pfx) except + - csv_reader_options_builder& mangle_dupe_cols(bool val) except + + ) except +libcudf_exception_handler + csv_reader_options_builder& byte_range_offset( + size_t val + ) except +libcudf_exception_handler + csv_reader_options_builder& byte_range_size( + size_t val 
+ ) except +libcudf_exception_handler + csv_reader_options_builder& names( + vector[string] val + ) except +libcudf_exception_handler + csv_reader_options_builder& prefix( + string pfx + ) except +libcudf_exception_handler + csv_reader_options_builder& mangle_dupe_cols( + bool val + ) except +libcudf_exception_handler # Filter settings csv_reader_options_builder& use_cols_names( vector[string] col_names - ) except + + ) except +libcudf_exception_handler csv_reader_options_builder& use_cols_indexes( vector[int] col_ind - ) except + - csv_reader_options_builder& nrows(size_type n_rows) except + - csv_reader_options_builder& skiprows(size_type val) except + - csv_reader_options_builder& skipfooter(size_type val) except + - csv_reader_options_builder& header(size_type hdr) except + + ) except +libcudf_exception_handler + csv_reader_options_builder& nrows( + size_type n_rows + ) except +libcudf_exception_handler + csv_reader_options_builder& skiprows( + size_type val + ) except +libcudf_exception_handler + csv_reader_options_builder& skipfooter( + size_type val + ) except +libcudf_exception_handler + csv_reader_options_builder& header( + size_type hdr + ) except +libcudf_exception_handler # Parsing settings - csv_reader_options_builder& lineterminator(char val) except + - csv_reader_options_builder& delimiter(char val) except + - csv_reader_options_builder& thousands(char val) except + - csv_reader_options_builder& decimal(char val) except + - csv_reader_options_builder& comment(char val) except + - csv_reader_options_builder& windowslinetermination(bool val) except + - csv_reader_options_builder& delim_whitespace(bool val) except + - csv_reader_options_builder& skipinitialspace(bool val) except + - csv_reader_options_builder& skip_blank_lines(bool val) except + + csv_reader_options_builder& lineterminator( + char val + ) except +libcudf_exception_handler + csv_reader_options_builder& delimiter( + char val + ) except +libcudf_exception_handler + csv_reader_options_builder& thousands( + char val + ) except +libcudf_exception_handler + csv_reader_options_builder& decimal( + char val + ) except +libcudf_exception_handler + csv_reader_options_builder& comment( + char val + ) except +libcudf_exception_handler + csv_reader_options_builder& windowslinetermination( + bool val + ) except +libcudf_exception_handler + csv_reader_options_builder& delim_whitespace( + bool val + ) except +libcudf_exception_handler + csv_reader_options_builder& skipinitialspace( + bool val + ) except +libcudf_exception_handler + csv_reader_options_builder& skip_blank_lines( + bool val + ) except +libcudf_exception_handler csv_reader_options_builder& quoting( cudf_io_types.quote_style style - ) except + - csv_reader_options_builder& quotechar(char val) except + - csv_reader_options_builder& doublequote(bool val) except + - csv_reader_options_builder& detect_whitespace_around_quotes(bool val) except + - csv_reader_options_builder& parse_dates(vector[string]) except + - csv_reader_options_builder& parse_dates(vector[int]) except + + ) except +libcudf_exception_handler + csv_reader_options_builder& quotechar( + char val + ) except +libcudf_exception_handler + csv_reader_options_builder& doublequote( + bool val + ) except +libcudf_exception_handler + csv_reader_options_builder& detect_whitespace_around_quotes( + bool val + ) except +libcudf_exception_handler + csv_reader_options_builder& parse_dates( + vector[string] + ) except +libcudf_exception_handler + csv_reader_options_builder& parse_dates( + vector[int] + ) except 
+libcudf_exception_handler # Conversion settings - csv_reader_options_builder& dtypes(vector[string] types) except + - csv_reader_options_builder& dtypes(vector[data_type] types) except + + csv_reader_options_builder& dtypes( + vector[string] types) except +libcudf_exception_handler + csv_reader_options_builder& dtypes( + vector[data_type] types + ) except +libcudf_exception_handler csv_reader_options_builder& dtypes( map[string, data_type] types - ) except + - csv_reader_options_builder& true_values(vector[string] vals) except + - csv_reader_options_builder& false_values(vector[string] vals) except + - csv_reader_options_builder& na_values(vector[string] vals) except + - csv_reader_options_builder& keep_default_na(bool val) except + - csv_reader_options_builder& na_filter(bool val) except + - csv_reader_options_builder& dayfirst(bool val) except + - csv_reader_options_builder& timestamp_type(data_type type) except + + ) except +libcudf_exception_handler + csv_reader_options_builder& true_values( + vector[string] vals + ) except +libcudf_exception_handler + csv_reader_options_builder& false_values( + vector[string] vals + ) except +libcudf_exception_handler + csv_reader_options_builder& na_values( + vector[string] vals + ) except +libcudf_exception_handler + csv_reader_options_builder& keep_default_na( + bool val + ) except +libcudf_exception_handler + csv_reader_options_builder& na_filter( + bool val + ) except +libcudf_exception_handler + csv_reader_options_builder& dayfirst( + bool val + ) except +libcudf_exception_handler + csv_reader_options_builder& timestamp_type( + data_type type + ) except +libcudf_exception_handler - csv_reader_options build() except + + csv_reader_options build() except +libcudf_exception_handler cdef cudf_io_types.table_with_metadata read_csv( csv_reader_options &options - ) except + + ) except +libcudf_exception_handler cdef cppclass csv_writer_options: - csv_writer_options() except + - - cudf_io_types.sink_info get_sink() except + - cudf_table_view.table_view get_table() except + - cudf_io_types.table_metadata get_metadata() except + - string get_na_rep() except + - bool is_enabled_include_header() except + - size_type get_rows_per_chunk() except + - string get_line_terminator() except + - char get_inter_column_delimiter() except + - string get_true_value() except + - string get_false_value() except + - vector[string] get_names() except + + csv_writer_options() except +libcudf_exception_handler + + cudf_io_types.sink_info get_sink() except +libcudf_exception_handler + cudf_table_view.table_view get_table() except +libcudf_exception_handler + cudf_io_types.table_metadata get_metadata() except +libcudf_exception_handler + string get_na_rep() except +libcudf_exception_handler + bool is_enabled_include_header() except +libcudf_exception_handler + size_type get_rows_per_chunk() except +libcudf_exception_handler + string get_line_terminator() except +libcudf_exception_handler + char get_inter_column_delimiter() except +libcudf_exception_handler + string get_true_value() except +libcudf_exception_handler + string get_false_value() except +libcudf_exception_handler + vector[string] get_names() except +libcudf_exception_handler # setter - void set_metadata(cudf_io_types.table_metadata* val) except + - void set_na_rep(string val) except + - void enable_include_header(bool val) except + - void set_rows_per_chunk(size_type val) except + - void set_line_terminator(string term) except + - void set_inter_column_delimiter(char delim) except + - void set_true_value(string 
val) except + - void set_false_value(string val) except + - void set_names(vector[string] val) except + + void set_metadata( + cudf_io_types.table_metadata* val + ) except +libcudf_exception_handler + void set_na_rep(string val) except +libcudf_exception_handler + void enable_include_header(bool val) except +libcudf_exception_handler + void set_rows_per_chunk(size_type val) except +libcudf_exception_handler + void set_line_terminator(string term) except +libcudf_exception_handler + void set_inter_column_delimiter(char delim) except +libcudf_exception_handler + void set_true_value(string val) except +libcudf_exception_handler + void set_false_value(string val) except +libcudf_exception_handler + void set_names(vector[string] val) except +libcudf_exception_handler @staticmethod csv_writer_options_builder builder( cudf_io_types.sink_info sink, cudf_table_view.table_view table - ) except + + ) except +libcudf_exception_handler cdef cppclass csv_writer_options_builder: - csv_writer_options_builder() except + + csv_writer_options_builder() except +libcudf_exception_handler csv_writer_options_builder( cudf_io_types.sink_info sink, cudf_table_view.table_view table - ) except + + ) except +libcudf_exception_handler - csv_writer_options_builder& names(vector[string] val) except + - csv_writer_options_builder& na_rep(string val) except + - csv_writer_options_builder& include_header(bool val) except + - csv_writer_options_builder& rows_per_chunk(size_type val) except + - csv_writer_options_builder& line_terminator(string term) except + - csv_writer_options_builder& inter_column_delimiter(char delim) except + - csv_writer_options_builder& true_value(string val) except + - csv_writer_options_builder& false_value(string val) except + + csv_writer_options_builder& names( + vector[string] val + ) except +libcudf_exception_handler + csv_writer_options_builder& na_rep( + string val + ) except +libcudf_exception_handler + csv_writer_options_builder& include_header( + bool val + ) except +libcudf_exception_handler + csv_writer_options_builder& rows_per_chunk( + size_type val + ) except +libcudf_exception_handler + csv_writer_options_builder& line_terminator( + string term + ) except +libcudf_exception_handler + csv_writer_options_builder& inter_column_delimiter( + char delim + ) except +libcudf_exception_handler + csv_writer_options_builder& true_value( + string val + ) except +libcudf_exception_handler + csv_writer_options_builder& false_value( + string val + ) except +libcudf_exception_handler - csv_writer_options build() except + + csv_writer_options build() except +libcudf_exception_handler - cdef void write_csv(csv_writer_options args) except + + cdef void write_csv(csv_writer_options args) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/io/data_sink.pxd b/python/pylibcudf/pylibcudf/libcudf/io/data_sink.pxd index e939a47d7f9..00f35bbf4e4 100644 --- a/python/pylibcudf/pylibcudf/libcudf/io/data_sink.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/io/data_sink.pxd @@ -1,4 +1,5 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. 
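csv.pxd also shows how C++ overload sets are bound: `set_parse_dates`, `set_parse_hex`, and the three `dtypes` builders are simply declared repeatedly with different parameter types, and Cython resolves the overload from the static types at the call site. A toy sketch with invented `width` overloads:

# distutils: language = c++
from libcpp.string cimport string
from libcpp.vector cimport vector

cdef extern from *:
    """
    #include <cstddef>
    #include <string>
    #include <vector>
    inline std::size_t width(const std::vector<std::string>& n) { return n.size(); }
    inline std::size_t width(const std::vector<int>& i) { return i.size(); }
    """
    size_t width(vector[string] names) except +
    size_t width(vector[int] indexes) except +

def check():
    cdef vector[string] names = [b"a", b"b"]
    cdef vector[int] indexes = [1, 2, 3]
    # The static types select the overload, exactly as in C++.
    assert width(names) == 2
    assert width(indexes) == 3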
+from pylibcudf.exception_handler cimport libcudf_exception_handler cdef extern from "cudf/io/data_sink.hpp" \ diff --git a/python/pylibcudf/pylibcudf/libcudf/io/datasource.pxd b/python/pylibcudf/pylibcudf/libcudf/io/datasource.pxd index c69aa65bd3c..cda7d940c91 100644 --- a/python/pylibcudf/pylibcudf/libcudf/io/datasource.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/io/datasource.pxd @@ -1,4 +1,5 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. +from pylibcudf.exception_handler cimport libcudf_exception_handler cdef extern from "cudf/io/datasource.hpp" \ diff --git a/python/pylibcudf/pylibcudf/libcudf/io/json.pxd b/python/pylibcudf/pylibcudf/libcudf/io/json.pxd index 1c74f8ca3ac..a7ca6978621 100644 --- a/python/pylibcudf/pylibcudf/libcudf/io/json.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/io/json.pxd @@ -1,5 +1,4 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. - cimport pylibcudf.libcudf.io.types as cudf_io_types cimport pylibcudf.libcudf.table.table_view as cudf_table_view from libc.stdint cimport int32_t, uint8_t @@ -8,6 +7,7 @@ from libcpp.map cimport map from libcpp.memory cimport shared_ptr, unique_ptr from libcpp.string cimport string from libcpp.vector cimport vector +from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.types cimport data_type, size_type @@ -23,133 +23,154 @@ cdef extern from "cudf/io/json.hpp" \ RECOVER_WITH_NULL cdef cppclass json_reader_options: - json_reader_options() except + - cudf_io_types.source_info get_source() except + - vector[string] get_dtypes() except + - cudf_io_types.compression_type get_compression() except + - size_t get_byte_range_offset() except + - size_t get_byte_range_size() except + - bool is_enabled_lines() except + - bool is_enabled_mixed_types_as_string() except + - bool is_enabled_prune_columns() except + - bool is_enabled_dayfirst() except + - bool is_enabled_experimental() except + + json_reader_options() except +libcudf_exception_handler + cudf_io_types.source_info get_source() except +libcudf_exception_handler + vector[string] get_dtypes() except +libcudf_exception_handler + cudf_io_types.compression_type get_compression()\ + except +libcudf_exception_handler + size_t get_byte_range_offset() except +libcudf_exception_handler + size_t get_byte_range_size() except +libcudf_exception_handler + bool is_enabled_lines() except +libcudf_exception_handler + bool is_enabled_mixed_types_as_string() except +libcudf_exception_handler + bool is_enabled_prune_columns() except +libcudf_exception_handler + bool is_enabled_dayfirst() except +libcudf_exception_handler + bool is_enabled_experimental() except +libcudf_exception_handler # setter - void set_dtypes(vector[data_type] types) except + - void set_dtypes(map[string, schema_element] types) except + + void set_dtypes( + vector[data_type] types + ) except +libcudf_exception_handler + void set_dtypes( + map[string, schema_element] types + ) except +libcudf_exception_handler void set_compression( cudf_io_types.compression_type compression - ) except + - void set_byte_range_offset(size_t offset) except + - void set_byte_range_size(size_t size) except + - void enable_lines(bool val) except + - void enable_mixed_types_as_string(bool val) except + - void enable_prune_columns(bool val) except + - void enable_dayfirst(bool val) except + - void enable_experimental(bool val) except + - void enable_keep_quotes(bool val) except + + ) except +libcudf_exception_handler + void set_byte_range_offset(size_t offset) except 
+libcudf_exception_handler + void set_byte_range_size(size_t size) except +libcudf_exception_handler + void enable_lines(bool val) except +libcudf_exception_handler + void enable_mixed_types_as_string(bool val) except +libcudf_exception_handler + void enable_prune_columns(bool val) except +libcudf_exception_handler + void enable_dayfirst(bool val) except +libcudf_exception_handler + void enable_experimental(bool val) except +libcudf_exception_handler + void enable_keep_quotes(bool val) except +libcudf_exception_handler @staticmethod json_reader_options_builder builder( cudf_io_types.source_info src - ) except + + ) except +libcudf_exception_handler cdef cppclass json_reader_options_builder: - json_reader_options_builder() except + + json_reader_options_builder() except +libcudf_exception_handler json_reader_options_builder( cudf_io_types.source_info src - ) except + + ) except +libcudf_exception_handler json_reader_options_builder& dtypes( vector[string] types - ) except + + ) except +libcudf_exception_handler json_reader_options_builder& dtypes( vector[data_type] types - ) except + + ) except +libcudf_exception_handler json_reader_options_builder& dtypes( map[string, schema_element] types - ) except + + ) except +libcudf_exception_handler json_reader_options_builder& compression( cudf_io_types.compression_type compression - ) except + + ) except +libcudf_exception_handler json_reader_options_builder& byte_range_offset( size_t offset - ) except + + ) except +libcudf_exception_handler json_reader_options_builder& byte_range_size( size_t size - ) except + + ) except +libcudf_exception_handler json_reader_options_builder& lines( bool val - ) except + + ) except +libcudf_exception_handler json_reader_options_builder& mixed_types_as_string( bool val - ) except + + ) except +libcudf_exception_handler json_reader_options_builder& prune_columns( bool val - ) except + + ) except +libcudf_exception_handler json_reader_options_builder& dayfirst( bool val - ) except + + ) except +libcudf_exception_handler json_reader_options_builder& keep_quotes( bool val - ) except + + ) except +libcudf_exception_handler json_reader_options_builder& recovery_mode( json_recovery_mode_t val - ) except + + ) except +libcudf_exception_handler - json_reader_options build() except + + json_reader_options build() except +libcudf_exception_handler cdef cudf_io_types.table_with_metadata read_json( - json_reader_options &options) except + + json_reader_options &options) except +libcudf_exception_handler cdef cppclass json_writer_options: - json_writer_options() except + - cudf_io_types.sink_info get_sink() except + - cudf_table_view.table_view get_table() except + - string get_na_rep() except + - bool is_enabled_include_nulls() except + - bool is_enabled_lines() except + - bool is_enabled_experimental() except + - size_type get_rows_per_chunk() except + - string get_true_value() except + - string get_false_value() except + + json_writer_options() except +libcudf_exception_handler + cudf_io_types.sink_info get_sink() except +libcudf_exception_handler + cudf_table_view.table_view get_table() except +libcudf_exception_handler + string get_na_rep() except +libcudf_exception_handler + bool is_enabled_include_nulls() except +libcudf_exception_handler + bool is_enabled_lines() except +libcudf_exception_handler + bool is_enabled_experimental() except +libcudf_exception_handler + size_type get_rows_per_chunk() except +libcudf_exception_handler + string get_true_value() except +libcudf_exception_handler + string get_false_value() except 
+libcudf_exception_handler # setter - void set_table(cudf_table_view.table_view tbl) except + - void set_metadata(cudf_io_types.table_metadata meta) except + - void set_na_rep(string val) except + - void enable_include_nulls(bool val) except + - void enable_lines(bool val) except + - void set_rows_per_chunk(size_type val) except + - void set_true_value(string val) except + - void set_false_value(string val) except + + void set_table( + cudf_table_view.table_view tbl + ) except +libcudf_exception_handler + void set_metadata( + cudf_io_types.table_metadata meta + ) except +libcudf_exception_handler + void set_na_rep(string val) except +libcudf_exception_handler + void enable_include_nulls(bool val) except +libcudf_exception_handler + void enable_lines(bool val) except +libcudf_exception_handler + void set_rows_per_chunk(size_type val) except +libcudf_exception_handler + void set_true_value(string val) except +libcudf_exception_handler + void set_false_value(string val) except +libcudf_exception_handler @staticmethod json_writer_options_builder builder( cudf_io_types.sink_info sink, cudf_table_view.table_view tbl - ) except + + ) except +libcudf_exception_handler cdef cppclass json_writer_options_builder: - json_writer_options_builder() except + + json_writer_options_builder() except +libcudf_exception_handler json_writer_options_builder( cudf_io_types.source_info src, cudf_table_view.table_view tbl - ) except + + ) except +libcudf_exception_handler json_writer_options_builder& table( cudf_table_view.table_view tbl - ) except + + ) except +libcudf_exception_handler json_writer_options_builder& metadata( cudf_io_types.table_metadata meta - ) except + - json_writer_options_builder& na_rep(string val) except + - json_writer_options_builder& include_nulls(bool val) except + - json_writer_options_builder& lines(bool val) except + - json_writer_options_builder& rows_per_chunk(size_type val) except + - json_writer_options_builder& true_value(string val) except + - json_writer_options_builder& false_value(string val) except + - - json_writer_options build() except + + ) except +libcudf_exception_handler + json_writer_options_builder& na_rep( + string val + ) except +libcudf_exception_handler + json_writer_options_builder& include_nulls( + bool val + ) except +libcudf_exception_handler + json_writer_options_builder& lines( + bool val + ) except +libcudf_exception_handler + json_writer_options_builder& rows_per_chunk( + size_type val + ) except +libcudf_exception_handler + json_writer_options_builder& true_value( + string val + ) except +libcudf_exception_handler + json_writer_options_builder& false_value( + string val + ) except +libcudf_exception_handler + + json_writer_options build() except +libcudf_exception_handler cdef cudf_io_types.table_with_metadata write_json( - json_writer_options &options) except + + json_writer_options &options) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/io/orc.pxd b/python/pylibcudf/pylibcudf/libcudf/io/orc.pxd index dca24c7f665..f5485da1d51 100644 --- a/python/pylibcudf/pylibcudf/libcudf/io/orc.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/io/orc.pxd @@ -1,5 +1,4 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. 
- cimport pylibcudf.libcudf.io.types as cudf_io_types cimport pylibcudf.libcudf.table.table_view as cudf_table_view from libc.stdint cimport int64_t, uint8_t @@ -9,6 +8,7 @@ from libcpp.memory cimport shared_ptr, unique_ptr from libcpp.optional cimport optional from libcpp.string cimport string from libcpp.vector cimport vector +from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.types cimport data_type, size_type @@ -16,160 +16,206 @@ cdef extern from "cudf/io/orc.hpp" \ namespace "cudf::io" nogil: cdef cppclass orc_reader_options: - orc_reader_options() except + - - cudf_io_types.source_info get_source() except + - vector[vector[size_type]] get_stripes() except + - int64_t get_skip_rows() except + - optional[int64_t] get_num_rows() except + - bool is_enabled_use_index() except + - bool is_enabled_use_np_dtypes() except + - data_type get_timestamp_type() except + - bool is_enabled_decimals_as_float64() except + - int get_forced_decimals_scale() except + - - void set_columns(vector[string] col_names) except + - void set_stripes(vector[vector[size_type]] strps) except + - void set_skip_rows(int64_t rows) except + - void set_num_rows(int64_t nrows) except + - void enable_use_index(bool val) except + - void enable_use_np_dtypes(bool val) except + - void set_timestamp_type(data_type type) except + - void set_decimal128_columns(vector[string] val) except + + orc_reader_options() except +libcudf_exception_handler + + cudf_io_types.source_info get_source() except +libcudf_exception_handler + vector[vector[size_type]] get_stripes() except +libcudf_exception_handler + int64_t get_skip_rows() except +libcudf_exception_handler + optional[int64_t] get_num_rows() except +libcudf_exception_handler + bool is_enabled_use_index() except +libcudf_exception_handler + bool is_enabled_use_np_dtypes() except +libcudf_exception_handler + data_type get_timestamp_type() except +libcudf_exception_handler + bool is_enabled_decimals_as_float64() except +libcudf_exception_handler + int get_forced_decimals_scale() except +libcudf_exception_handler + + void set_columns(vector[string] col_names) except +libcudf_exception_handler + void set_stripes( + vector[vector[size_type]] strps + ) except +libcudf_exception_handler + void set_skip_rows(int64_t rows) except +libcudf_exception_handler + void set_num_rows(int64_t nrows) except +libcudf_exception_handler + void enable_use_index(bool val) except +libcudf_exception_handler + void enable_use_np_dtypes(bool val) except +libcudf_exception_handler + void set_timestamp_type(data_type type) except +libcudf_exception_handler + void set_decimal128_columns( + vector[string] val + ) except +libcudf_exception_handler @staticmethod orc_reader_options_builder builder( cudf_io_types.source_info src - ) except + + ) except +libcudf_exception_handler cdef cppclass orc_reader_options_builder: - orc_reader_options_builder() except + - orc_reader_options_builder(cudf_io_types.source_info &src) except + - - orc_reader_options_builder& columns(vector[string] col_names) except + + orc_reader_options_builder() except +libcudf_exception_handler + orc_reader_options_builder( + cudf_io_types.source_info &src + ) except +libcudf_exception_handler + + orc_reader_options_builder& columns( + vector[string] col_names + ) except +libcudf_exception_handler orc_reader_options_builder& \ - stripes(vector[vector[size_type]] strps) except + - orc_reader_options_builder& skip_rows(int64_t rows) except + - orc_reader_options_builder& num_rows(int64_t nrows) except + - 
orc_reader_options_builder& use_index(bool val) except + - orc_reader_options_builder& use_np_dtypes(bool val) except + - orc_reader_options_builder& timestamp_type(data_type type) except + - - orc_reader_options build() except + + stripes(vector[vector[size_type]] strps) except +libcudf_exception_handler + orc_reader_options_builder& skip_rows( + int64_t rows + ) except +libcudf_exception_handler + orc_reader_options_builder& num_rows( + int64_t nrows + ) except +libcudf_exception_handler + orc_reader_options_builder& use_index( + bool val + ) except +libcudf_exception_handler + orc_reader_options_builder& use_np_dtypes( + bool val + ) except +libcudf_exception_handler + orc_reader_options_builder& timestamp_type( + data_type type + ) except +libcudf_exception_handler + + orc_reader_options build() except +libcudf_exception_handler cdef cudf_io_types.table_with_metadata read_orc( orc_reader_options opts - ) except + + ) except +libcudf_exception_handler cdef cppclass orc_writer_options: orc_writer_options() - cudf_io_types.sink_info get_sink() except + - cudf_io_types.compression_type get_compression() except + - bool is_enabled_statistics() except + - size_t get_stripe_size_bytes() except + - size_type get_stripe_size_rows() except + - size_type get_row_index_stride() except + - cudf_table_view.table_view get_table() except + + cudf_io_types.sink_info get_sink() except +libcudf_exception_handler + cudf_io_types.compression_type get_compression()\ + except +libcudf_exception_handler + bool is_enabled_statistics() except +libcudf_exception_handler + size_t get_stripe_size_bytes() except +libcudf_exception_handler + size_type get_stripe_size_rows() except +libcudf_exception_handler + size_type get_row_index_stride() except +libcudf_exception_handler + cudf_table_view.table_view get_table() except +libcudf_exception_handler const optional[cudf_io_types.table_input_metadata]& get_metadata( - ) except + + ) except +libcudf_exception_handler # setter - void set_compression(cudf_io_types.compression_type comp) except + - void enable_statistics(bool val) except + - void set_stripe_size_bytes(size_t val) except + - void set_stripe_size_rows(size_type val) except + - void set_row_index_stride(size_type val) except + - void set_table(cudf_table_view.table_view tbl) except + - void set_metadata(cudf_io_types.table_input_metadata meta) except + - void set_key_value_metadata(map[string, string] kvm) except + + void set_compression( + cudf_io_types.compression_type comp + ) except +libcudf_exception_handler + void enable_statistics(bool val) except +libcudf_exception_handler + void set_stripe_size_bytes(size_t val) except +libcudf_exception_handler + void set_stripe_size_rows(size_type val) except +libcudf_exception_handler + void set_row_index_stride(size_type val) except +libcudf_exception_handler + void set_table(cudf_table_view.table_view tbl) except +libcudf_exception_handler + void set_metadata( + cudf_io_types.table_input_metadata meta + ) except +libcudf_exception_handler + void set_key_value_metadata( + map[string, string] kvm + ) except +libcudf_exception_handler @staticmethod orc_writer_options_builder builder( cudf_io_types.sink_info &sink, cudf_table_view.table_view &tbl - ) except + + ) except +libcudf_exception_handler cdef cppclass orc_writer_options_builder: # setter orc_writer_options_builder& compression( cudf_io_types.compression_type comp - ) except + + ) except +libcudf_exception_handler orc_writer_options_builder& enable_statistics( cudf_io_types.statistics_freq val - ) except + - 
orc_writer_options_builder& stripe_size_bytes(size_t val) except + - orc_writer_options_builder& stripe_size_rows(size_type val) except + - orc_writer_options_builder& row_index_stride(size_type val) except + + ) except +libcudf_exception_handler + orc_writer_options_builder& stripe_size_bytes( + size_t val + ) except +libcudf_exception_handler + orc_writer_options_builder& stripe_size_rows( + size_type val + ) except +libcudf_exception_handler + orc_writer_options_builder& row_index_stride( + size_type val + ) except +libcudf_exception_handler orc_writer_options_builder& table( cudf_table_view.table_view tbl - ) except + + ) except +libcudf_exception_handler orc_writer_options_builder& metadata( cudf_io_types.table_input_metadata meta - ) except + + ) except +libcudf_exception_handler orc_writer_options_builder& key_value_metadata( map[string, string] kvm - ) except + + ) except +libcudf_exception_handler - orc_writer_options build() except + + orc_writer_options build() except +libcudf_exception_handler - cdef void write_orc(orc_writer_options options) except + + cdef void write_orc( + orc_writer_options options + ) except +libcudf_exception_handler cdef cppclass chunked_orc_writer_options: - chunked_orc_writer_options() except + - cudf_io_types.sink_info get_sink() except + - cudf_io_types.compression_type get_compression() except + - bool enable_statistics() except + - size_t stripe_size_bytes() except + - size_type stripe_size_rows() except + - size_type row_index_stride() except + - cudf_table_view.table_view get_table() except + + chunked_orc_writer_options() except +libcudf_exception_handler + cudf_io_types.sink_info get_sink() except +libcudf_exception_handler + cudf_io_types.compression_type get_compression()\ + except +libcudf_exception_handler + bool enable_statistics() except +libcudf_exception_handler + size_t stripe_size_bytes() except +libcudf_exception_handler + size_type stripe_size_rows() except +libcudf_exception_handler + size_type row_index_stride() except +libcudf_exception_handler + cudf_table_view.table_view get_table() except +libcudf_exception_handler const optional[cudf_io_types.table_input_metadata]& get_metadata( - ) except + + ) except +libcudf_exception_handler # setter - void set_compression(cudf_io_types.compression_type comp) except + - void enable_statistics(bool val) except + - void set_stripe_size_bytes(size_t val) except + - void set_stripe_size_rows(size_type val) except + - void set_row_index_stride(size_type val) except + - void set_table(cudf_table_view.table_view tbl) except + + void set_compression( + cudf_io_types.compression_type comp + ) except +libcudf_exception_handler + void enable_statistics(bool val) except +libcudf_exception_handler + void set_stripe_size_bytes(size_t val) except +libcudf_exception_handler + void set_stripe_size_rows(size_type val) except +libcudf_exception_handler + void set_row_index_stride(size_type val) except +libcudf_exception_handler + void set_table(cudf_table_view.table_view tbl) except +libcudf_exception_handler void set_metadata( cudf_io_types.table_input_metadata meta - ) except + - void set_key_value_metadata(map[string, string] kvm) except + + ) except +libcudf_exception_handler + void set_key_value_metadata( + map[string, string] kvm + ) except +libcudf_exception_handler @staticmethod chunked_orc_writer_options_builder builder( cudf_io_types.sink_info &sink - ) except + + ) except +libcudf_exception_handler cdef cppclass chunked_orc_writer_options_builder: # setter chunked_orc_writer_options_builder& 
compression( cudf_io_types.compression_type comp - ) except + + ) except +libcudf_exception_handler chunked_orc_writer_options_builder& enable_statistics( cudf_io_types.statistics_freq val - ) except + - orc_writer_options_builder& stripe_size_bytes(size_t val) except + - orc_writer_options_builder& stripe_size_rows(size_type val) except + - orc_writer_options_builder& row_index_stride(size_type val) except + + ) except +libcudf_exception_handler + orc_writer_options_builder& stripe_size_bytes( + size_t val + ) except +libcudf_exception_handler + orc_writer_options_builder& stripe_size_rows( + size_type val + ) except +libcudf_exception_handler + orc_writer_options_builder& row_index_stride( + size_type val + ) except +libcudf_exception_handler chunked_orc_writer_options_builder& table( cudf_table_view.table_view tbl - ) except + + ) except +libcudf_exception_handler chunked_orc_writer_options_builder& metadata( cudf_io_types.table_input_metadata meta - ) except + + ) except +libcudf_exception_handler chunked_orc_writer_options_builder& key_value_metadata( map[string, string] kvm - ) except + + ) except +libcudf_exception_handler - chunked_orc_writer_options build() except + + chunked_orc_writer_options build() except +libcudf_exception_handler cdef cppclass orc_chunked_writer: - orc_chunked_writer() except + - orc_chunked_writer(chunked_orc_writer_options args) except + + orc_chunked_writer() except +libcudf_exception_handler + orc_chunked_writer( + chunked_orc_writer_options args + ) except +libcudf_exception_handler orc_chunked_writer& write( cudf_table_view.table_view table_, - ) except + - void close() except + + ) except +libcudf_exception_handler + void close() except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/io/orc_metadata.pxd b/python/pylibcudf/pylibcudf/libcudf/io/orc_metadata.pxd index 9302ffe2f80..38954d22676 100644 --- a/python/pylibcudf/pylibcudf/libcudf/io/orc_metadata.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/io/orc_metadata.pxd @@ -1,10 +1,10 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. - from libc.stdint cimport int32_t, int64_t, uint32_t, uint64_t from libcpp cimport bool from libcpp.optional cimport optional from libcpp.string cimport string from libcpp.vector cimport vector +from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.io cimport types as cudf_io_types from pylibcudf.variant cimport monostate, variant @@ -69,4 +69,4 @@ cdef extern from "cudf/io/orc_metadata.hpp" \ cdef parsed_orc_statistics read_parsed_orc_statistics( cudf_io_types.source_info src_info - ) except + + ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/io/parquet.pxd b/python/pylibcudf/pylibcudf/libcudf/io/parquet.pxd index de6a6c1e82d..110c9d4a0b9 100644 --- a/python/pylibcudf/pylibcudf/libcudf/io/parquet.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/io/parquet.pxd @@ -1,5 +1,4 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. 
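A note on usage, since orc.pxd above declares the full chunked-writer surface: libcudf's streaming writers share one shape, construct a single writer from the chunked options, call `write` once per table chunk, then `close` to finalize the file footer. A rough sketch from a hypothetical `.pyx` module (the function name and the origin of `opts` and the two table_views are assumptions):

    from cython.operator cimport dereference
    from libcpp.memory cimport unique_ptr
    from pylibcudf.libcudf.io.orc cimport (
        chunked_orc_writer_options,
        orc_chunked_writer,
    )
    from pylibcudf.libcudf.table.table_view cimport table_view

    cdef void write_in_two_chunks(chunked_orc_writer_options opts,
                                  table_view first,
                                  table_view second) except *:
        # One writer per output file: write() appends rows, close()
        # writes the ORC footer.
        cdef unique_ptr[orc_chunked_writer] writer
        writer.reset(new orc_chunked_writer(opts))
        dereference(writer).write(first)
        dereference(writer).write(second)
        dereference(writer).close()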
- from libc.stdint cimport int64_t, uint8_t from libcpp cimport bool from libcpp.functional cimport reference_wrapper @@ -8,6 +7,7 @@ from libcpp.memory cimport shared_ptr, unique_ptr from libcpp.optional cimport optional from libcpp.string cimport string from libcpp.vector cimport vector +from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.expressions cimport expression from pylibcudf.libcudf.io.types cimport ( compression_type, @@ -25,232 +25,241 @@ from pylibcudf.libcudf.types cimport data_type, size_type cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: cdef cppclass parquet_reader_options: - parquet_reader_options() except + - source_info get_source_info() except + - vector[vector[size_type]] get_row_groups() except + - const optional[reference_wrapper[expression]]& get_filter() except + - data_type get_timestamp_type() except + - bool is_enabled_use_pandas_metadata() except + - bool is_enabled_arrow_schema() except + - bool is_enabled_allow_mismatched_pq_schemas() except + + parquet_reader_options() except +libcudf_exception_handler + source_info get_source_info() except +libcudf_exception_handler + vector[vector[size_type]] get_row_groups() except +libcudf_exception_handler + const optional[reference_wrapper[expression]]& get_filter()\ + except +libcudf_exception_handler + data_type get_timestamp_type() except +libcudf_exception_handler + bool is_enabled_use_pandas_metadata() except +libcudf_exception_handler + bool is_enabled_arrow_schema() except +libcudf_exception_handler + bool is_enabled_allow_mismatched_pq_schemas() except +libcudf_exception_handler # setter - void set_filter(expression &filter) except + - void set_columns(vector[string] col_names) except + - void set_num_rows(size_type val) except + - void set_row_groups(vector[vector[size_type]] row_grp) except + - void set_skip_rows(int64_t val) except + - void enable_use_arrow_schema(bool val) except + - void enable_allow_mismatched_pq_schemas(bool val) except + - void enable_use_pandas_metadata(bool val) except + - void set_timestamp_type(data_type type) except + + void set_filter(expression &filter) except +libcudf_exception_handler + void set_columns(vector[string] col_names) except +libcudf_exception_handler + void set_num_rows(size_type val) except +libcudf_exception_handler + void set_row_groups( + vector[vector[size_type]] row_grp + ) except +libcudf_exception_handler + void set_skip_rows(int64_t val) except +libcudf_exception_handler + void enable_use_arrow_schema(bool val) except +libcudf_exception_handler + void enable_allow_mismatched_pq_schemas( + bool val + ) except +libcudf_exception_handler + void enable_use_pandas_metadata(bool val) except +libcudf_exception_handler + void set_timestamp_type(data_type type) except +libcudf_exception_handler @staticmethod parquet_reader_options_builder builder( source_info src - ) except + + ) except +libcudf_exception_handler cdef cppclass parquet_reader_options_builder: - parquet_reader_options_builder() except + + parquet_reader_options_builder() except +libcudf_exception_handler parquet_reader_options_builder( source_info src - ) except + + ) except +libcudf_exception_handler parquet_reader_options_builder& columns( vector[string] col_names - ) except + + ) except +libcudf_exception_handler parquet_reader_options_builder& row_groups( vector[vector[size_type]] row_grp - ) except + + ) except +libcudf_exception_handler parquet_reader_options_builder& convert_strings_to_categories( bool val - ) except + + ) except 
+libcudf_exception_handler parquet_reader_options_builder& use_pandas_metadata( bool val - ) except + + ) except +libcudf_exception_handler parquet_reader_options_builder& use_arrow_schema( bool val - ) except + + ) except +libcudf_exception_handler parquet_reader_options_builder& allow_mismatched_pq_schemas( bool val - ) except + + ) except +libcudf_exception_handler parquet_reader_options_builder& timestamp_type( data_type type - ) except + + ) except +libcudf_exception_handler parquet_reader_options_builder& filter( const expression & f - ) except + - parquet_reader_options build() except + + ) except +libcudf_exception_handler + parquet_reader_options build() except +libcudf_exception_handler cdef table_with_metadata read_parquet( - parquet_reader_options args) except + + parquet_reader_options args) except +libcudf_exception_handler cdef cppclass parquet_writer_options_base: - parquet_writer_options_base() except + - sink_info get_sink_info() except + - compression_type get_compression() except + - statistics_freq get_stats_level() except + + parquet_writer_options_base() except +libcudf_exception_handler + sink_info get_sink_info() except +libcudf_exception_handler + compression_type get_compression() except +libcudf_exception_handler + statistics_freq get_stats_level() except +libcudf_exception_handler const optional[table_input_metadata]& get_metadata( - ) except + - size_t get_row_group_size_bytes() except + - size_type get_row_group_size_rows() except + - size_t get_max_page_size_bytes() except + - size_type get_max_page_size_rows() except + - size_t get_max_dictionary_size() except + - bool is_enabled_write_arrow_schema() except + + ) except +libcudf_exception_handler + size_t get_row_group_size_bytes() except +libcudf_exception_handler + size_type get_row_group_size_rows() except +libcudf_exception_handler + size_t get_max_page_size_bytes() except +libcudf_exception_handler + size_type get_max_page_size_rows() except +libcudf_exception_handler + size_t get_max_dictionary_size() except +libcudf_exception_handler + bool is_enabled_write_arrow_schema() except +libcudf_exception_handler void set_metadata( table_input_metadata m - ) except + + ) except +libcudf_exception_handler void set_key_value_metadata( vector[map[string, string]] kvm - ) except + + ) except +libcudf_exception_handler void set_stats_level( statistics_freq sf - ) except + + ) except +libcudf_exception_handler void set_compression( compression_type compression - ) except + + ) except +libcudf_exception_handler void set_int96_timestamps( bool enabled - ) except + + ) except +libcudf_exception_handler void set_utc_timestamps( bool enabled - ) except + - void set_row_group_size_bytes(size_t val) except + - void set_row_group_size_rows(size_type val) except + - void set_max_page_size_bytes(size_t val) except + - void set_max_page_size_rows(size_type val) except + - void set_max_dictionary_size(size_t val) except + - void enable_write_v2_headers(bool val) except + - void enable_write_arrow_schema(bool val) except + - void set_dictionary_policy(dictionary_policy policy) except + + ) except +libcudf_exception_handler + void set_row_group_size_bytes(size_t val) except +libcudf_exception_handler + void set_row_group_size_rows(size_type val) except +libcudf_exception_handler + void set_max_page_size_bytes(size_t val) except +libcudf_exception_handler + void set_max_page_size_rows(size_type val) except +libcudf_exception_handler + void set_max_dictionary_size(size_t val) except +libcudf_exception_handler + void 
enable_write_v2_headers(bool val) except +libcudf_exception_handler + void enable_write_arrow_schema(bool val) except +libcudf_exception_handler + void set_dictionary_policy( + dictionary_policy policy + ) except +libcudf_exception_handler cdef cppclass parquet_writer_options(parquet_writer_options_base): - parquet_writer_options() except + - table_view get_table() except + - string get_column_chunks_file_paths() except + + parquet_writer_options() except +libcudf_exception_handler + table_view get_table() except +libcudf_exception_handler + string get_column_chunks_file_paths() except +libcudf_exception_handler void set_partitions( vector[partition_info] partitions - ) except + + ) except +libcudf_exception_handler void set_column_chunks_file_paths( vector[string] column_chunks_file_paths - ) except + + ) except +libcudf_exception_handler @staticmethod parquet_writer_options_builder builder( sink_info sink_, table_view table_ - ) except + + ) except +libcudf_exception_handler cdef cppclass parquet_writer_options_builder_base[BuilderT, OptionsT]: - parquet_writer_options_builder_base() except + + parquet_writer_options_builder_base() except +libcudf_exception_handler BuilderT& metadata( table_input_metadata m - ) except + + ) except +libcudf_exception_handler BuilderT& key_value_metadata( vector[map[string, string]] kvm - ) except + + ) except +libcudf_exception_handler BuilderT& stats_level( statistics_freq sf - ) except + + ) except +libcudf_exception_handler BuilderT& compression( compression_type compression - ) except + + ) except +libcudf_exception_handler BuilderT& int96_timestamps( bool enabled - ) except + + ) except +libcudf_exception_handler BuilderT& utc_timestamps( bool enabled - ) except + + ) except +libcudf_exception_handler BuilderT& write_arrow_schema( bool enabled - ) except + + ) except +libcudf_exception_handler BuilderT& row_group_size_bytes( size_t val - ) except + + ) except +libcudf_exception_handler BuilderT& row_group_size_rows( size_type val - ) except + + ) except +libcudf_exception_handler BuilderT& max_page_size_bytes( size_t val - ) except + + ) except +libcudf_exception_handler BuilderT& max_page_size_rows( size_type val - ) except + + ) except +libcudf_exception_handler BuilderT& max_dictionary_size( size_t val - ) except + + ) except +libcudf_exception_handler BuilderT& write_v2_headers( bool val - ) except + + ) except +libcudf_exception_handler BuilderT& dictionary_policy( dictionary_policy val - ) except + - OptionsT build() except + + ) except +libcudf_exception_handler + OptionsT build() except +libcudf_exception_handler cdef cppclass parquet_writer_options_builder( parquet_writer_options_builder_base[parquet_writer_options_builder, parquet_writer_options]): - parquet_writer_options_builder() except + + parquet_writer_options_builder() except +libcudf_exception_handler parquet_writer_options_builder( sink_info sink_, table_view table_ - ) except + + ) except +libcudf_exception_handler parquet_writer_options_builder& partitions( vector[partition_info] partitions - ) except + + ) except +libcudf_exception_handler parquet_writer_options_builder& column_chunks_file_paths( vector[string] column_chunks_file_paths - ) except + + ) except +libcudf_exception_handler cdef unique_ptr[vector[uint8_t]] write_parquet( parquet_writer_options args - ) except + + ) except +libcudf_exception_handler cdef cppclass chunked_parquet_writer_options(parquet_writer_options_base): - chunked_parquet_writer_options() except + + chunked_parquet_writer_options() except 
+libcudf_exception_handler @staticmethod chunked_parquet_writer_options_builder builder( sink_info sink_, - ) except + + ) except +libcudf_exception_handler cdef cppclass chunked_parquet_writer_options_builder( parquet_writer_options_builder_base[chunked_parquet_writer_options_builder, chunked_parquet_writer_options] ): - chunked_parquet_writer_options_builder() except + + chunked_parquet_writer_options_builder() except +libcudf_exception_handler chunked_parquet_writer_options_builder( sink_info sink_, - ) except + + ) except +libcudf_exception_handler cdef cppclass parquet_chunked_writer: - parquet_chunked_writer() except + - parquet_chunked_writer(chunked_parquet_writer_options args) except + + parquet_chunked_writer() except +libcudf_exception_handler + parquet_chunked_writer( + chunked_parquet_writer_options args + ) except +libcudf_exception_handler parquet_chunked_writer& write( table_view table_, - ) except + + ) except +libcudf_exception_handler parquet_chunked_writer& write( const table_view& table_, const vector[partition_info]& partitions, - ) except + + ) except +libcudf_exception_handler unique_ptr[vector[uint8_t]] close( vector[string] column_chunks_file_paths, - ) except + + ) except +libcudf_exception_handler cdef cppclass chunked_parquet_reader: - chunked_parquet_reader() except + + chunked_parquet_reader() except +libcudf_exception_handler chunked_parquet_reader( size_t chunk_read_limit, - const parquet_reader_options& options) except + + const parquet_reader_options& options) except +libcudf_exception_handler chunked_parquet_reader( size_t chunk_read_limit, size_t pass_read_limit, - const parquet_reader_options& options) except + - bool has_next() except + - table_with_metadata read_chunk() except + + const parquet_reader_options& options) except +libcudf_exception_handler + bool has_next() except +libcudf_exception_handler + table_with_metadata read_chunk() except +libcudf_exception_handler cdef unique_ptr[vector[uint8_t]] merge_row_group_metadata( const vector[unique_ptr[vector[uint8_t]]]& metadata_list - ) except + + ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/io/parquet_metadata.pxd b/python/pylibcudf/pylibcudf/libcudf/io/parquet_metadata.pxd index 8e6da56c9a6..cdc87093f3a 100644 --- a/python/pylibcudf/pylibcudf/libcudf/io/parquet_metadata.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/io/parquet_metadata.pxd @@ -1,31 +1,34 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
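One structural detail in parquet.pxd above deserves a note: `parquet_writer_options_builder_base[BuilderT, OptionsT]` uses Cython's square-bracket syntax for C++ class templates, mirroring the CRTP-style base in libcudf where each concrete builder chains as its own type and builds its own options type. A self-contained sketch of the bracket syntax, with a hypothetical `box.hpp` header and names (purely for illustration):

    # distutils: language = c++
    from libcpp.string cimport string

    cdef extern from "box.hpp" nogil:  # hypothetical header
        cdef cppclass box[T]:          # i.e. template <typename T> class box
            box() except +
            void put(T value) except +
            T take() except +

    def roundtrip(bytes payload):
        # Instantiates box<std::string>; Cython coerces bytes to and
        # from std::string at the call boundary.
        cdef box[string] b
        b.put(payload)
        return b.take()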
- -cimport pylibcudf.libcudf.io.types as cudf_io_types from libc.stdint cimport int64_t from libcpp.string cimport string from libcpp.unordered_map cimport unordered_map from libcpp.vector cimport vector +from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.types cimport size_type +from pylibcudf.libcudf.io.types cimport source_info cdef extern from "cudf/io/parquet_metadata.hpp" namespace "cudf::io" nogil: cdef cppclass parquet_column_schema: - parquet_column_schema() except+ - string name() except+ - size_type num_children() except+ - parquet_column_schema child(int idx) except+ - vector[parquet_column_schema] children() except+ + parquet_column_schema() except +libcudf_exception_handler + string name() except +libcudf_exception_handler + size_type num_children() except +libcudf_exception_handler + parquet_column_schema child(int idx) except +libcudf_exception_handler + vector[parquet_column_schema] children() except +libcudf_exception_handler cdef cppclass parquet_schema: - parquet_schema() except+ - parquet_column_schema root() except+ + parquet_schema() except +libcudf_exception_handler + parquet_column_schema root() except +libcudf_exception_handler cdef cppclass parquet_metadata: - parquet_metadata() except+ - parquet_schema schema() except+ - int64_t num_rows() except+ - size_type num_rowgroups() except+ - unordered_map[string, string] metadata() except+ - vector[unordered_map[string, int64_t]] rowgroup_metadata() except+ + parquet_metadata() except +libcudf_exception_handler + parquet_schema schema() except +libcudf_exception_handler + int64_t num_rows() except +libcudf_exception_handler + size_type num_rowgroups() except +libcudf_exception_handler + unordered_map[string, string] metadata() except +libcudf_exception_handler + vector[unordered_map[string, int64_t]] rowgroup_metadata()\ + except +libcudf_exception_handler - cdef parquet_metadata read_parquet_metadata(cudf_io_types.source_info src) except+ + cdef parquet_metadata read_parquet_metadata( + source_info src_info + ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/io/text.pxd b/python/pylibcudf/pylibcudf/libcudf/io/text.pxd index 14397ef970d..b49fede21b3 100644 --- a/python/pylibcudf/pylibcudf/libcudf/io/text.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/io/text.pxd @@ -1,9 +1,9 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. 
- from libc.stdint cimport uint64_t from libcpp cimport bool from libcpp.memory cimport unique_ptr from libcpp.string cimport string +from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column @@ -11,27 +11,37 @@ cdef extern from "cudf/io/text/byte_range_info.hpp" \ namespace "cudf::io::text" nogil: cdef cppclass byte_range_info: - byte_range_info() except + - byte_range_info(size_t offset, size_t size) except + + byte_range_info() except +libcudf_exception_handler + byte_range_info( + size_t offset, size_t size + ) except +libcudf_exception_handler cdef extern from "cudf/io/text/data_chunk_source.hpp" \ namespace "cudf::io::text" nogil: cdef cppclass data_chunk_source: - data_chunk_source() except + + data_chunk_source() except +libcudf_exception_handler cdef extern from "cudf/io/text/data_chunk_source_factories.hpp" \ namespace "cudf::io::text" nogil: - unique_ptr[data_chunk_source] make_source(string data) except + + unique_ptr[data_chunk_source] make_source( + string data + ) except +libcudf_exception_handler unique_ptr[data_chunk_source] \ - make_source_from_file(string filename) except + + make_source_from_file( + string filename + ) except +libcudf_exception_handler unique_ptr[data_chunk_source] \ - make_source_from_bgzip_file(string filename) except + + make_source_from_bgzip_file( + string filename + ) except +libcudf_exception_handler unique_ptr[data_chunk_source] \ - make_source_from_bgzip_file(string filename, - uint64_t virtual_begin, - uint64_t virtual_end) except + + make_source_from_bgzip_file( + string filename, + uint64_t virtual_begin, + uint64_t virtual_end + ) except +libcudf_exception_handler cdef extern from "cudf/io/text/multibyte_split.hpp" \ @@ -41,8 +51,10 @@ cdef extern from "cudf/io/text/multibyte_split.hpp" \ byte_range_info byte_range bool strip_delimiters - parse_options() except + + parse_options() except +libcudf_exception_handler - unique_ptr[column] multibyte_split(data_chunk_source source, - string delimiter, - parse_options options) except + + unique_ptr[column] multibyte_split( + data_chunk_source source, + string delimiter, + parse_options options + ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/io/timezone.pxd b/python/pylibcudf/pylibcudf/libcudf/io/timezone.pxd index 676901efcec..b59692ebdac 100644 --- a/python/pylibcudf/pylibcudf/libcudf/io/timezone.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/io/timezone.pxd @@ -1,9 +1,9 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. - from libcpp cimport bool from libcpp.memory cimport unique_ptr from libcpp.optional cimport optional from libcpp.string cimport string +from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.table.table cimport table @@ -11,4 +11,4 @@ cdef extern from "cudf/timezone.hpp" namespace "cudf" nogil: unique_ptr[table] make_timezone_transition_table( optional[string] tzif_dir, string timezone_name - ) except + + ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/io/types.pxd b/python/pylibcudf/pylibcudf/libcudf/io/types.pxd index 5f3be2f0727..e02cb79e10d 100644 --- a/python/pylibcudf/pylibcudf/libcudf/io/types.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/io/types.pxd @@ -1,5 +1,4 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. 
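For orientation, the `cudf::io::text` declarations above compose in one direction: build a `data_chunk_source` (from a host string, a plain file, or a bgzip file), then pass it to `multibyte_split` together with a delimiter and `parse_options`. A rough sketch of a caller in a hypothetical `.pyx` module (the function name is an assumption; stream and memory-resource arguments are omitted):

    from cython.operator cimport dereference
    from libcpp.memory cimport unique_ptr
    from libcpp.utility cimport move
    from pylibcudf.libcudf.column.column cimport column
    from pylibcudf.libcudf.io.text cimport (
        data_chunk_source,
        make_source,
        multibyte_split,
        parse_options,
    )

    cdef unique_ptr[column] split_on_newlines(bytes data):
        # Wrap the host buffer, then emit one string row per line.
        cdef unique_ptr[data_chunk_source] src = make_source(data)
        cdef parse_options opts
        opts.strip_delimiters = True
        return move(multibyte_split(dereference(src), b"\n", opts))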
- cimport pylibcudf.libcudf.io.data_sink as cudf_io_data_sink cimport pylibcudf.libcudf.io.datasource as cudf_io_datasource cimport pylibcudf.libcudf.table.table_view as cudf_table_view @@ -10,6 +9,7 @@ from libcpp.memory cimport unique_ptr from libcpp.string cimport string from libcpp.unordered_map cimport unordered_map from libcpp.vector cimport vector +from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.table.table cimport table from pylibcudf.libcudf.types cimport size_type @@ -72,7 +72,7 @@ cdef extern from "cudf/io/types.hpp" \ vector[column_name_info] children cdef cppclass table_metadata: - table_metadata() except + + table_metadata() except +libcudf_exception_handler map[string, string] user_data vector[unordered_map[string, string]] per_file_user_data @@ -97,8 +97,10 @@ cdef extern from "cudf/io/types.hpp" \ string get_name() cdef cppclass table_input_metadata: - table_input_metadata() except + - table_input_metadata(const cudf_table_view.table_view& table) except + + table_input_metadata() except +libcudf_exception_handler + table_input_metadata( + const cudf_table_view.table_view& table + ) except +libcudf_exception_handler vector[column_in_metadata] column_metadata @@ -107,7 +109,9 @@ cdef extern from "cudf/io/types.hpp" \ size_type num_rows partition_info() - partition_info(size_type start_row, size_type num_rows) except + + partition_info( + size_type start_row, size_type num_rows + ) except +libcudf_exception_handler cdef cppclass host_buffer: const char* data @@ -117,21 +121,33 @@ cdef extern from "cudf/io/types.hpp" \ host_buffer(const char* data, size_t size) cdef cppclass source_info: - const vector[string]& filepaths() except + - - source_info() except + - source_info(const vector[string] &filepaths) except + - source_info(const vector[host_buffer] &host_buffers) except + - source_info(cudf_io_datasource.datasource *source) except + - source_info(const vector[cudf_io_datasource.datasource*] &datasources) except + + const vector[string]& filepaths() except +libcudf_exception_handler + + source_info() except +libcudf_exception_handler + source_info( + const vector[string] &filepaths + ) except +libcudf_exception_handler + source_info( + const vector[host_buffer] &host_buffers + ) except +libcudf_exception_handler + source_info( + cudf_io_datasource.datasource *source + ) except +libcudf_exception_handler + source_info( + const vector[cudf_io_datasource.datasource*] &datasources + ) except +libcudf_exception_handler cdef cppclass sink_info: const vector[string]& filepaths() const vector[cudf_io_data_sink.data_sink *]& user_sinks() - sink_info() except + - sink_info(string file_path) except + - sink_info(vector[string] file_path) except + - sink_info(vector[char] * buffer) except + - sink_info(cudf_io_data_sink.data_sink * user_sink) except + - sink_info(vector[cudf_io_data_sink.data_sink *] user_sink) except + + sink_info() except +libcudf_exception_handler + sink_info(string file_path) except +libcudf_exception_handler + sink_info(vector[string] file_path) except +libcudf_exception_handler + sink_info(vector[char] * buffer) except +libcudf_exception_handler + sink_info( + cudf_io_data_sink.data_sink * user_sink + ) except +libcudf_exception_handler + sink_info( + vector[cudf_io_data_sink.data_sink *] user_sink + ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/join.pxd b/python/pylibcudf/pylibcudf/libcudf/join.pxd index 6f6c145b23c..5a36b05fd9f 100644 --- 
a/python/pylibcudf/pylibcudf/libcudf/join.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/join.pxd @@ -1,15 +1,19 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. +from libc.stddef cimport size_t from libcpp cimport bool from libcpp.memory cimport unique_ptr +from libcpp.optional cimport optional from libcpp.pair cimport pair from libcpp.vector cimport vector +from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.expressions cimport expression from pylibcudf.libcudf.table.table cimport table from pylibcudf.libcudf.table.table_view cimport table_view from pylibcudf.libcudf.types cimport null_equality, size_type -from rmm._lib.device_uvector cimport device_uvector +from rmm.librmm.device_uvector cimport device_uvector ctypedef unique_ptr[device_uvector[size_type]] gather_map_type ctypedef pair[gather_map_type, gather_map_type] gather_map_pair_type @@ -18,59 +22,169 @@ cdef extern from "cudf/join.hpp" namespace "cudf" nogil: cdef gather_map_pair_type inner_join( const table_view left_keys, const table_view right_keys, - ) except + + ) except +libcudf_exception_handler cdef gather_map_pair_type left_join( const table_view left_keys, const table_view right_keys, - ) except + + ) except +libcudf_exception_handler cdef gather_map_pair_type full_join( const table_view left_keys, const table_view right_keys, - ) except + + ) except +libcudf_exception_handler cdef gather_map_type left_semi_join( const table_view left_keys, const table_view right_keys, - ) except + + ) except +libcudf_exception_handler cdef gather_map_type left_anti_join( const table_view left_keys, const table_view right_keys, - ) except + + ) except +libcudf_exception_handler cdef gather_map_pair_type inner_join( const table_view left_keys, const table_view right_keys, null_equality nulls_equal, - ) except + + ) except +libcudf_exception_handler cdef gather_map_pair_type left_join( const table_view left_keys, const table_view right_keys, null_equality nulls_equal, - ) except + + ) except +libcudf_exception_handler cdef gather_map_pair_type full_join( const table_view left_keys, const table_view right_keys, null_equality nulls_equal, - ) except + + ) except +libcudf_exception_handler cdef gather_map_type left_semi_join( const table_view left_keys, const table_view right_keys, null_equality nulls_equal, - ) except + + ) except +libcudf_exception_handler cdef gather_map_type left_anti_join( const table_view left_keys, const table_view right_keys, null_equality nulls_equal, - ) except + + ) except +libcudf_exception_handler cdef unique_ptr[table] cross_join( const table_view left, const table_view right, ) except + + + cdef gather_map_pair_type conditional_inner_join( + const table_view left, + const table_view right, + const expression binary_predicate, + ) except +libcudf_exception_handler + + cdef gather_map_pair_type conditional_inner_join( + const table_view left, + const table_view right, + const expression binary_predicate, + optional[size_t] output_size + ) except +libcudf_exception_handler + + cdef gather_map_pair_type conditional_left_join( + const table_view left, + const table_view right, + const expression binary_predicate, + ) except +libcudf_exception_handler + + cdef gather_map_pair_type conditional_left_join( + const table_view left, + const table_view right, + const expression binary_predicate, + optional[size_t] output_size + ) except +libcudf_exception_handler + + cdef gather_map_pair_type conditional_full_join( + const table_view 
left, + const table_view right, + const expression binary_predicate, + ) except +libcudf_exception_handler + + cdef gather_map_pair_type conditional_full_join( + const table_view left, + const table_view right, + const expression binary_predicate, + optional[size_t] output_size + ) except +libcudf_exception_handler + + cdef gather_map_type conditional_left_semi_join( + const table_view left, + const table_view right, + const expression binary_predicate, + ) except +libcudf_exception_handler + + cdef gather_map_type conditional_left_semi_join( + const table_view left, + const table_view right, + const expression binary_predicate, + optional[size_t] output_size + ) except +libcudf_exception_handler + + cdef gather_map_type conditional_left_anti_join( + const table_view left, + const table_view right, + const expression binary_predicate, + ) except +libcudf_exception_handler + + cdef gather_map_type conditional_left_anti_join( + const table_view left, + const table_view right, + const expression binary_predicate, + optional[size_t] output_size + ) except +libcudf_exception_handler + + cdef gather_map_pair_type mixed_inner_join( + const table_view left_equality, + const table_view right_equality, + const table_view left_conditional, + const table_view right_conditional, + const expression binary_predicate, + null_equality compare_nulls + ) except +libcudf_exception_handler + + cdef gather_map_pair_type mixed_left_join( + const table_view left_equality, + const table_view right_equality, + const table_view left_conditional, + const table_view right_conditional, + const expression binary_predicate, + null_equality compare_nulls + ) except +libcudf_exception_handler + + cdef gather_map_pair_type mixed_full_join( + const table_view left_equality, + const table_view right_equality, + const table_view left_conditional, + const table_view right_conditional, + const expression binary_predicate, + null_equality compare_nulls + ) except +libcudf_exception_handler + + cdef gather_map_type mixed_left_semi_join( + const table_view left_equality, + const table_view right_equality, + const table_view left_conditional, + const table_view right_conditional, + const expression binary_predicate, + null_equality compare_nulls + ) except +libcudf_exception_handler + + cdef gather_map_type mixed_left_anti_join( + const table_view left_equality, + const table_view right_equality, + const table_view left_conditional, + const table_view right_conditional, + const expression binary_predicate, + null_equality compare_nulls + ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/json.pxd b/python/pylibcudf/pylibcudf/libcudf/json.pxd new file mode 100644 index 00000000000..d5bdd6d299a --- /dev/null +++ b/python/pylibcudf/pylibcudf/libcudf/json.pxd @@ -0,0 +1,29 @@ +# Copyright (c) 2021-2024, NVIDIA CORPORATION. 
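Two things are happening in join.pxd above: the cimports now track rmm's reorganisation (`device_uvector` moves from the private `rmm._lib` to `rmm.librmm`), and each conditional join gains a second overload whose trailing `optional[size_t] output_size` lets a caller pass a precomputed output size instead of having the join recompute it. On the Cython side, `libcpp.optional` wraps `std::optional`, so the engaged and empty cases look as follows (hypothetical `estimate` declaration, purely for illustration):

    from libcpp.optional cimport optional

    cdef extern from "est.hpp" nogil:  # hypothetical header
        size_t estimate(optional[size_t] hint) except +

    cdef void demo() except *:
        cdef size_t a = estimate(optional[size_t](128))  # engaged: holds 128
        cdef size_t b = estimate(optional[size_t]())     # empty optional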
+from libcpp cimport bool +from libcpp.memory cimport unique_ptr +from libcpp.string cimport string +from pylibcudf.exception_handler cimport libcudf_exception_handler +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.scalar.scalar cimport scalar, string_scalar + + +cdef extern from "cudf/json/json.hpp" namespace "cudf" nogil: + cdef cppclass get_json_object_options: + get_json_object_options() except +libcudf_exception_handler + # getters + bool get_allow_single_quotes() except +libcudf_exception_handler + bool get_strip_quotes_from_single_strings() except +libcudf_exception_handler + bool get_missing_fields_as_nulls() except +libcudf_exception_handler + # setters + void set_allow_single_quotes(bool val) except +libcudf_exception_handler + void set_strip_quotes_from_single_strings( + bool val + ) except +libcudf_exception_handler + void set_missing_fields_as_nulls(bool val) except +libcudf_exception_handler + + cdef unique_ptr[column] get_json_object( + column_view col, + string_scalar json_path, + get_json_object_options options, + ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/labeling.pxd b/python/pylibcudf/pylibcudf/libcudf/labeling.pxd index 400c4282f7a..e5dbec879ce 100644 --- a/python/pylibcudf/pylibcudf/libcudf/labeling.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/labeling.pxd @@ -1,6 +1,7 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION. from libcpp cimport int from libcpp.memory cimport unique_ptr +from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view @@ -16,4 +17,4 @@ cdef extern from "cudf/labeling/label_bins.hpp" namespace "cudf" nogil: inclusive left_inclusive, const column_view &right_edges, inclusive right_inclusive - ) except + + ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/lists/CMakeLists.txt b/python/pylibcudf/pylibcudf/libcudf/lists/CMakeLists.txt new file mode 100644 index 00000000000..c896db2c85a --- /dev/null +++ b/python/pylibcudf/pylibcudf/libcudf/lists/CMakeLists.txt @@ -0,0 +1,23 @@ +# ============================================================================= +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. 
+# ============================================================================= + +set(cython_sources combine.pyx contains.pyx) + +set(linked_libraries cudf::cudf) + +rapids_cython_create_modules( + CXX + SOURCE_FILES "${cython_sources}" + LINKED_LIBRARIES "${linked_libraries}" ASSOCIATED_TARGETS cudf MODULE_PREFIX cpp_lists +) diff --git a/python/pylibcudf/pylibcudf/libcudf/lists/combine.pxd b/python/pylibcudf/pylibcudf/libcudf/lists/combine.pxd index d077958ce03..3e4c88d62b0 100644 --- a/python/pylibcudf/pylibcudf/libcudf/lists/combine.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/lists/combine.pxd @@ -1,6 +1,8 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION. +from libc.stdint cimport int32_t from libcpp.memory cimport unique_ptr +from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.table.table_view cimport table_view @@ -9,20 +11,19 @@ from pylibcudf.libcudf.table.table_view cimport table_view cdef extern from "cudf/lists/combine.hpp" namespace \ "cudf::lists" nogil: - ctypedef enum concatenate_null_policy: - IGNORE "cudf::lists::concatenate_null_policy::IGNORE" - NULLIFY_OUTPUT_ROW \ - "cudf::lists::concatenate_null_policy::NULLIFY_OUTPUT_ROW" + cpdef enum class concatenate_null_policy(int32_t): + IGNORE + NULLIFY_OUTPUT_ROW cdef unique_ptr[column] concatenate_rows( const table_view input_table - ) except + + ) except +libcudf_exception_handler cdef unique_ptr[column] concatenate_list_elements( const table_view input_table, - ) except + + ) except +libcudf_exception_handler cdef unique_ptr[column] concatenate_list_elements( const column_view input_table, concatenate_null_policy null_policy - ) except + + ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/lists/combine.pyx b/python/pylibcudf/pylibcudf/libcudf/lists/combine.pyx new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/pylibcudf/pylibcudf/libcudf/lists/contains.pxd b/python/pylibcudf/pylibcudf/libcudf/lists/contains.pxd index 81a5ad46389..13a32d46c7a 100644 --- a/python/pylibcudf/pylibcudf/libcudf/lists/contains.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/lists/contains.pxd @@ -1,5 +1,4 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION. - from libc.stdint cimport int32_t from libcpp.memory cimport unique_ptr from pylibcudf.exception_handler cimport libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/lists/contains.pyx b/python/pylibcudf/pylibcudf/libcudf/lists/contains.pyx new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/pylibcudf/pylibcudf/libcudf/lists/count_elements.pxd b/python/pylibcudf/pylibcudf/libcudf/lists/count_elements.pxd index e283551ed0c..64c75ccabd3 100644 --- a/python/pylibcudf/pylibcudf/libcudf/lists/count_elements.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/lists/count_elements.pxd @@ -1,9 +1,11 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION. 
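Besides the exception-handler change, combine.pxd above also modernizes its enum binding: the old `ctypedef enum` needed explicit C-name strings such as `"cudf::lists::concatenate_null_policy::IGNORE"`, whereas `cpdef enum class concatenate_null_policy(int32_t)` binds the scoped C++ enum directly and additionally generates a Python-visible enum in the module. A minimal sketch of the idiom, with a hypothetical header and names:

    from libc.stdint cimport int32_t

    cdef extern from "mode.hpp" namespace "demo" nogil:  # hypothetical
        cpdef enum class fill_mode(int32_t):
            ZEROS
            ONES

    def default_mode():
        # Because the enum is cpdef, Python callers can also write:
        #   from this_module import fill_mode; fill_mode.ONES
        return fill_mode.ZEROS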
- from libcpp.memory cimport unique_ptr +from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.lists.lists_column_view cimport lists_column_view cdef extern from "cudf/lists/count_elements.hpp" namespace "cudf::lists" nogil: - cdef unique_ptr[column] count_elements(const lists_column_view&) except + + cdef unique_ptr[column] count_elements( + const lists_column_view& + ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/lists/explode.pxd b/python/pylibcudf/pylibcudf/libcudf/lists/explode.pxd index c64b2715cca..adec02caad1 100644 --- a/python/pylibcudf/pylibcudf/libcudf/lists/explode.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/lists/explode.pxd @@ -1,6 +1,6 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION. - from libcpp.memory cimport unique_ptr +from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.table.table cimport table from pylibcudf.libcudf.table.table_view cimport table_view from pylibcudf.libcudf.types cimport size_type @@ -10,4 +10,4 @@ cdef extern from "cudf/lists/explode.hpp" namespace "cudf" nogil: cdef unique_ptr[table] explode_outer( const table_view, size_type explode_column_idx, - ) except + + ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/lists/extract.pxd b/python/pylibcudf/pylibcudf/libcudf/lists/extract.pxd index 2ea060d87de..046bb51c68e 100644 --- a/python/pylibcudf/pylibcudf/libcudf/lists/extract.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/lists/extract.pxd @@ -1,6 +1,6 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION. - from libcpp.memory cimport unique_ptr +from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column, column_view from pylibcudf.libcudf.lists.lists_column_view cimport lists_column_view from pylibcudf.libcudf.types cimport size_type @@ -10,8 +10,8 @@ cdef extern from "cudf/lists/extract.hpp" namespace "cudf::lists" nogil: cdef unique_ptr[column] extract_list_element( const lists_column_view&, size_type - ) except + + ) except +libcudf_exception_handler cdef unique_ptr[column] extract_list_element( const lists_column_view&, const column_view& - ) except + + ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/lists/filling.pxd b/python/pylibcudf/pylibcudf/libcudf/lists/filling.pxd index 54f5a8409b6..35e2559d902 100644 --- a/python/pylibcudf/pylibcudf/libcudf/lists/filling.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/lists/filling.pxd @@ -1,6 +1,6 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION. 
- from libcpp.memory cimport unique_ptr +from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view @@ -9,10 +9,10 @@ cdef extern from "cudf/lists/filling.hpp" namespace "cudf::lists" nogil: cdef unique_ptr[column] sequences( const column_view& starts, const column_view& sizes, - ) except + + ) except +libcudf_exception_handler cdef unique_ptr[column] sequences( const column_view& starts, const column_view& steps, const column_view& sizes, - ) except + + ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/lists/gather.pxd b/python/pylibcudf/pylibcudf/libcudf/lists/gather.pxd index a762c6aa333..69d5eda7e7e 100644 --- a/python/pylibcudf/pylibcudf/libcudf/lists/gather.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/lists/gather.pxd @@ -1,6 +1,6 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION. - from libcpp.memory cimport unique_ptr +from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.lists.lists_column_view cimport lists_column_view @@ -9,4 +9,4 @@ cdef extern from "cudf/lists/gather.hpp" namespace "cudf::lists" nogil: cdef unique_ptr[column] segmented_gather( const lists_column_view& source_column, const lists_column_view& gather_map_list - ) except + + ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/lists/lists_column_view.pxd b/python/pylibcudf/pylibcudf/libcudf/lists/lists_column_view.pxd index f43340a78b0..917245b3bef 100644 --- a/python/pylibcudf/pylibcudf/libcudf/lists/lists_column_view.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/lists/lists_column_view.pxd @@ -1,5 +1,5 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. - +from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column_view cimport ( column_view, mutable_column_view, @@ -9,13 +9,19 @@ from pylibcudf.libcudf.types cimport size_type cdef extern from "cudf/lists/lists_column_view.hpp" namespace "cudf" nogil: cdef cppclass lists_column_view(column_view): - lists_column_view() except + - lists_column_view(const lists_column_view& lists_column) except + - lists_column_view(const column_view& lists_column) except + - lists_column_view& operator=(const lists_column_view&) except + - column_view parent() except + - column_view offsets() except + - column_view child() except + + lists_column_view() except +libcudf_exception_handler + lists_column_view( + const lists_column_view& lists_column + ) except +libcudf_exception_handler + lists_column_view( + const column_view& lists_column + ) except +libcudf_exception_handler + lists_column_view& operator=( + const lists_column_view& + ) except +libcudf_exception_handler + column_view parent() except +libcudf_exception_handler + column_view offsets() except +libcudf_exception_handler + column_view child() except +libcudf_exception_handler cdef enum: offsets_column_index "cudf::lists_column_view::offsets_column_index" diff --git a/python/pylibcudf/pylibcudf/libcudf/lists/reverse.pxd b/python/pylibcudf/pylibcudf/libcudf/lists/reverse.pxd index 43b671ebfa0..1ae3b4409ef 100644 --- a/python/pylibcudf/pylibcudf/libcudf/lists/reverse.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/lists/reverse.pxd @@ -1,6 +1,6 @@ # Copyright (c) 2024, NVIDIA CORPORATION.
- from libcpp.memory cimport unique_ptr +from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.lists.lists_column_view cimport lists_column_view @@ -8,4 +8,4 @@ from pylibcudf.libcudf.lists.lists_column_view cimport lists_column_view cdef extern from "cudf/lists/reverse.hpp" namespace "cudf::lists" nogil: cdef unique_ptr[column] reverse( const lists_column_view& lists_column, - ) except + + ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/lists/set_operations.pxd b/python/pylibcudf/pylibcudf/libcudf/lists/set_operations.pxd index 266f04ef6b3..1f4855bdbf3 100644 --- a/python/pylibcudf/pylibcudf/libcudf/lists/set_operations.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/lists/set_operations.pxd @@ -1,6 +1,6 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION. - from libcpp.memory cimport unique_ptr +from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.lists.lists_column_view cimport lists_column_view from pylibcudf.libcudf.types cimport nan_equality, null_equality @@ -12,25 +12,25 @@ cdef extern from "cudf/lists/set_operations.hpp" namespace "cudf::lists" nogil: const lists_column_view& rhs, null_equality nulls_equal, nan_equality nans_equal - ) except + + ) except +libcudf_exception_handler cdef unique_ptr[column] have_overlap( const lists_column_view& lhs, const lists_column_view& rhs, null_equality nulls_equal, nan_equality nans_equal - ) except + + ) except +libcudf_exception_handler cdef unique_ptr[column] intersect_distinct( const lists_column_view& lhs, const lists_column_view& rhs, null_equality nulls_equal, nan_equality nans_equal - ) except + + ) except +libcudf_exception_handler cdef unique_ptr[column] union_distinct( const lists_column_view& lhs, const lists_column_view& rhs, null_equality nulls_equal, nan_equality nans_equal - ) except + + ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/lists/sorting.pxd b/python/pylibcudf/pylibcudf/libcudf/lists/sorting.pxd index ea45f999c47..344b55b402f 100644 --- a/python/pylibcudf/pylibcudf/libcudf/lists/sorting.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/lists/sorting.pxd @@ -1,6 +1,6 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION. - from libcpp.memory cimport unique_ptr +from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.lists.lists_column_view cimport lists_column_view from pylibcudf.libcudf.types cimport null_order, order @@ -11,10 +11,10 @@ cdef extern from "cudf/lists/sorting.hpp" namespace "cudf::lists" nogil: const lists_column_view source_column, order column_order, null_order null_precedence - ) except + + ) except +libcudf_exception_handler cdef unique_ptr[column] stable_sort_lists( const lists_column_view source_column, order column_order, null_order null_precedence - ) except + + ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/lists/stream_compaction.pxd b/python/pylibcudf/pylibcudf/libcudf/lists/stream_compaction.pxd index d9df7c3ca2e..8341ac69bf5 100644 --- a/python/pylibcudf/pylibcudf/libcudf/lists/stream_compaction.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/lists/stream_compaction.pxd @@ -1,6 +1,6 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION. 
- from libcpp.memory cimport unique_ptr +from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.lists.lists_column_view cimport lists_column_view from pylibcudf.libcudf.types cimport nan_equality, null_equality @@ -11,10 +11,10 @@ cdef extern from "cudf/lists/stream_compaction.hpp" \ cdef unique_ptr[column] apply_boolean_mask( const lists_column_view& lists_column, const lists_column_view& boolean_mask, - ) except + + ) except +libcudf_exception_handler cdef unique_ptr[column] distinct( const lists_column_view& lists_column, null_equality nulls_equal, nan_equality nans_equal - ) except + + ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/merge.pxd b/python/pylibcudf/pylibcudf/libcudf/merge.pxd index 6930b7a0d06..f546ae3bbdd 100644 --- a/python/pylibcudf/pylibcudf/libcudf/merge.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/merge.pxd @@ -1,8 +1,8 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. - cimport pylibcudf.libcudf.types as libcudf_types from libcpp.memory cimport unique_ptr from libcpp.vector cimport vector +from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.table.table cimport table from pylibcudf.libcudf.table.table_view cimport table_view @@ -13,4 +13,4 @@ cdef extern from "cudf/merge.hpp" namespace "cudf" nogil: vector[libcudf_types.size_type] key_cols, vector[libcudf_types.order] column_order, vector[libcudf_types.null_order] null_precedence, - ) except + + ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/null_mask.pxd b/python/pylibcudf/pylibcudf/libcudf/null_mask.pxd index 5f582091b06..5b49ddc3bbe 100644 --- a/python/pylibcudf/pylibcudf/libcudf/null_mask.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/null_mask.pxd @@ -1,32 +1,32 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. - from libc.stdint cimport int32_t from libcpp.pair cimport pair +from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.table.table_view cimport table_view from pylibcudf.libcudf.types cimport bitmask_type, mask_state, size_type -from rmm._lib.device_buffer cimport device_buffer +from rmm.librmm.device_buffer cimport device_buffer cdef extern from "cudf/null_mask.hpp" namespace "cudf" nogil: cdef device_buffer copy_bitmask "cudf::copy_bitmask" ( column_view view - ) except + + ) except +libcudf_exception_handler cdef size_t bitmask_allocation_size_bytes ( size_type number_of_bits, size_t padding_boundary - ) except + + ) except +libcudf_exception_handler cdef size_t bitmask_allocation_size_bytes ( size_type number_of_bits - ) except + + ) except +libcudf_exception_handler cdef device_buffer create_null_mask ( size_type size, mask_state state - ) except + + ) except +libcudf_exception_handler cdef pair[device_buffer, size_type] bitmask_and( table_view view diff --git a/python/pylibcudf/pylibcudf/libcudf/nvtext/byte_pair_encode.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/byte_pair_encode.pxd index fd768d22704..c835c8249ca 100644 --- a/python/pylibcudf/pylibcudf/libcudf/nvtext/byte_pair_encode.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/nvtext/byte_pair_encode.pxd @@ -1,7 +1,7 @@ # Copyright (c) 2023-2024, NVIDIA CORPORATION. 
- from libcpp.memory cimport unique_ptr from libcpp.string cimport string +from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.scalar.scalar cimport string_scalar @@ -14,10 +14,10 @@ cdef extern from "nvtext/byte_pair_encoding.hpp" namespace "nvtext" nogil: cdef unique_ptr[bpe_merge_pairs] load_merge_pairs( const column_view &merge_pairs - ) except + + ) except +libcudf_exception_handler cdef unique_ptr[column] byte_pair_encoding( const column_view &strings, const bpe_merge_pairs &merge_pairs, const string_scalar &separator - ) except + + ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/nvtext/edit_distance.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/edit_distance.pxd index d459372fb8f..fbb1c0b2f4c 100644 --- a/python/pylibcudf/pylibcudf/libcudf/nvtext/edit_distance.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/nvtext/edit_distance.pxd @@ -1,7 +1,7 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. - from libcpp cimport bool from libcpp.memory cimport unique_ptr +from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view @@ -11,8 +11,8 @@ cdef extern from "nvtext/edit_distance.hpp" namespace "nvtext" nogil: cdef unique_ptr[column] edit_distance( const column_view & strings, const column_view & targets - ) except + + ) except +libcudf_exception_handler cdef unique_ptr[column] edit_distance_matrix( const column_view & strings - ) except + + ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/nvtext/generate_ngrams.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/generate_ngrams.pxd index eefae746662..c7bd4da5441 100644 --- a/python/pylibcudf/pylibcudf/libcudf/nvtext/generate_ngrams.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/nvtext/generate_ngrams.pxd @@ -1,6 +1,6 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. - from libcpp.memory cimport unique_ptr +from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.scalar.scalar cimport string_scalar @@ -13,14 +13,14 @@ cdef extern from "nvtext/generate_ngrams.hpp" namespace "nvtext" nogil: const column_view &strings, size_type ngrams, const string_scalar & separator - ) except + + ) except +libcudf_exception_handler cdef unique_ptr[column] generate_character_ngrams( const column_view &strings, size_type ngrams - ) except + + ) except +libcudf_exception_handler cdef unique_ptr[column] hash_character_ngrams( const column_view &strings, size_type ngrams - ) except + + ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/nvtext/jaccard.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/jaccard.pxd index 16c5f7f575e..d40943f2649 100644 --- a/python/pylibcudf/pylibcudf/libcudf/nvtext/jaccard.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/nvtext/jaccard.pxd @@ -1,6 +1,6 @@ # Copyright (c) 2023-2024, NVIDIA CORPORATION. 
- from libcpp.memory cimport unique_ptr +from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.types cimport size_type @@ -12,4 +12,4 @@ cdef extern from "nvtext/jaccard.hpp" namespace "nvtext" nogil: const column_view &input1, const column_view &input2, size_type width - ) except + + ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd index f2dd22f43aa..8570531dfde 100644 --- a/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd @@ -1,31 +1,61 @@ # Copyright (c) 2023-2024, NVIDIA CORPORATION. - +from libc.stdint cimport uint32_t, uint64_t from libcpp.memory cimport unique_ptr +from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.scalar.scalar cimport numeric_scalar from pylibcudf.libcudf.types cimport size_type cdef extern from "nvtext/minhash.hpp" namespace "nvtext" nogil: + cdef unique_ptr[column] minhash( + const column_view &strings, + const numeric_scalar[uint32_t] seed, + const size_type width, + ) except +libcudf_exception_handler + cdef unique_ptr[column] minhash( const column_view &strings, const column_view &seeds, const size_type width, + ) except +libcudf_exception_handler + + cdef unique_ptr[column] minhash_permuted( + const column_view &strings, + const uint32_t seed, + const column_view &a, + const column_view &b, + const size_type width, ) except + cdef unique_ptr[column] minhash64( const column_view &strings, const column_view &seeds, const size_type width, + ) except +libcudf_exception_handler + + cdef unique_ptr[column] minhash64( + const column_view &strings, + const numeric_scalar[uint64_t] seed, + const size_type width, + ) except +libcudf_exception_handler + + cdef unique_ptr[column] minhash64_permuted( + const column_view &strings, + const uint64_t seed, + const column_view &a, + const column_view &b, + const size_type width, ) except + cdef unique_ptr[column] word_minhash( const column_view &input, const column_view &seeds - ) except + + ) except +libcudf_exception_handler cdef unique_ptr[column] word_minhash64( const column_view &input, const column_view &seeds - ) except + + ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/nvtext/ngrams_tokenize.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/ngrams_tokenize.pxd index 89f6e5edfc4..fae8fd1e59a 100644 --- a/python/pylibcudf/pylibcudf/libcudf/nvtext/ngrams_tokenize.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/nvtext/ngrams_tokenize.pxd @@ -1,6 +1,6 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. 
- from libcpp.memory cimport unique_ptr +from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.scalar.scalar cimport string_scalar @@ -14,4 +14,4 @@ cdef extern from "nvtext/ngrams_tokenize.hpp" namespace "nvtext" nogil: size_type ngrams, const string_scalar & delimiter, const string_scalar & separator - ) except + + ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/nvtext/normalize.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/normalize.pxd index cbf121920e1..f8b082c8429 100644 --- a/python/pylibcudf/pylibcudf/libcudf/nvtext/normalize.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/nvtext/normalize.pxd @@ -1,7 +1,7 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. - from libcpp cimport bool from libcpp.memory cimport unique_ptr +from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view @@ -10,9 +10,9 @@ cdef extern from "nvtext/normalize.hpp" namespace "nvtext" nogil: cdef unique_ptr[column] normalize_spaces( const column_view & strings - ) except + + ) except +libcudf_exception_handler cdef unique_ptr[column] normalize_characters( const column_view & strings, bool do_lower_case - ) except + + ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/nvtext/replace.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/replace.pxd index 6bcfa1d9380..82983aaf618 100644 --- a/python/pylibcudf/pylibcudf/libcudf/nvtext/replace.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/nvtext/replace.pxd @@ -1,6 +1,6 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. - from libcpp.memory cimport unique_ptr +from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.scalar.scalar cimport string_scalar @@ -14,11 +14,11 @@ cdef extern from "nvtext/replace.hpp" namespace "nvtext" nogil: const column_view & targets, const column_view & replacements, const string_scalar & delimiter - ) except + + ) except +libcudf_exception_handler cdef unique_ptr[column] filter_tokens( const column_view & strings, size_type min_token_length, const string_scalar & replacement, const string_scalar & delimiter - ) except + + ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/nvtext/stemmer.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/stemmer.pxd index 673bffa28ae..1f944d95701 100644 --- a/python/pylibcudf/pylibcudf/libcudf/nvtext/stemmer.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/nvtext/stemmer.pxd @@ -1,29 +1,30 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. 
- from libc.stdint cimport int32_t +from libcpp cimport bool from libcpp.memory cimport unique_ptr +from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.types cimport size_type cdef extern from "nvtext/stemmer.hpp" namespace "nvtext" nogil: - ctypedef enum letter_type: - CONSONANT 'nvtext::letter_type::CONSONANT' - VOWEL 'nvtext::letter_type::VOWEL' + cpdef enum class letter_type: + CONSONANT + VOWEL cdef unique_ptr[column] porter_stemmer_measure( const column_view & strings - ) except + + ) except +libcudf_exception_handler cdef unique_ptr[column] is_letter( column_view source_strings, letter_type ltype, - size_type character_index) except + + size_type character_index) except +libcudf_exception_handler cdef unique_ptr[column] is_letter( column_view source_strings, letter_type ltype, - column_view indices) except + + column_view indices) except +libcudf_exception_handler ctypedef int32_t underlying_type_t_letter_type diff --git a/python/pylibcudf/pylibcudf/libcudf/nvtext/subword_tokenize.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/subword_tokenize.pxd index aabac0a617b..1ac69c87c4b 100644 --- a/python/pylibcudf/pylibcudf/libcudf/nvtext/subword_tokenize.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/nvtext/subword_tokenize.pxd @@ -1,22 +1,22 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. - from libc.stdint cimport uint16_t, uint32_t from libcpp cimport bool from libcpp.memory cimport unique_ptr from libcpp.string cimport string +from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view cdef extern from "nvtext/subword_tokenize.hpp" namespace "nvtext" nogil: - cdef cppclass tokenizer_result "nvtext::tokenizer_result": + cdef cppclass tokenizer_result: uint32_t nrows_tensor uint32_t sequence_length unique_ptr[column] tensor_token_ids unique_ptr[column] tensor_attention_mask unique_ptr[column] tensor_metadata - cdef struct hashed_vocabulary "nvtext::hashed_vocabulary": + cdef cppclass hashed_vocabulary: uint16_t first_token_id uint16_t separator_token_id uint16_t unknown_token_id @@ -26,19 +26,21 @@ cdef extern from "nvtext/subword_tokenize.hpp" namespace "nvtext" nogil: unique_ptr[column] table unique_ptr[column] bin_coefficients unique_ptr[column] bin_offsets + unique_ptr[column] cp_metadata + unique_ptr[column] aux_cp_table cdef unique_ptr[hashed_vocabulary] load_vocabulary_file( const string &filename_hashed_vocabulary - ) except + + ) except +libcudf_exception_handler cdef tokenizer_result subword_tokenize( const column_view & strings, - hashed_vocabulary & hashed_vocablary_obj, + hashed_vocabulary & hashed_vocabulary_obj, uint32_t max_sequence_length, uint32_t stride, bool do_lower, bool do_truncate - ) except + + ) except +libcudf_exception_handler cdef tokenizer_result subword_tokenize( const column_view &strings, @@ -47,7 +49,7 @@ cdef extern from "nvtext/subword_tokenize.hpp" namespace "nvtext" nogil: uint32_t stride, bool do_lower, bool do_truncate - ) except + + ) except +libcudf_exception_handler cdef extern from "" namespace "std" nogil: cdef tokenizer_result move(tokenizer_result) diff --git a/python/pylibcudf/pylibcudf/libcudf/nvtext/tokenize.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/tokenize.pxd index 34c054cf36f..a8f9d0451bc 100644 --- a/python/pylibcudf/pylibcudf/libcudf/nvtext/tokenize.pxd +++ 
b/python/pylibcudf/pylibcudf/libcudf/nvtext/tokenize.pxd @@ -1,6 +1,6 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. - from libcpp.memory cimport unique_ptr +from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.scalar.scalar cimport string_scalar @@ -12,43 +12,43 @@ cdef extern from "nvtext/tokenize.hpp" namespace "nvtext" nogil: cdef unique_ptr[column] tokenize( const column_view & strings, const string_scalar & delimiter - ) except + + ) except +libcudf_exception_handler cdef unique_ptr[column] tokenize( const column_view & strings, const column_view & delimiters - ) except + + ) except +libcudf_exception_handler cdef unique_ptr[column] count_tokens( const column_view & strings, const string_scalar & delimiter - ) except + + ) except +libcudf_exception_handler cdef unique_ptr[column] count_tokens( const column_view & strings, const column_view & delimiters - ) except + + ) except +libcudf_exception_handler cdef unique_ptr[column] character_tokenize( const column_view & strings - ) except + + ) except +libcudf_exception_handler cdef unique_ptr[column] detokenize( const column_view & strings, const column_view & row_indices, const string_scalar & separator - ) except + + ) except +libcudf_exception_handler cdef struct tokenize_vocabulary "nvtext::tokenize_vocabulary": pass cdef unique_ptr[tokenize_vocabulary] load_vocabulary( const column_view & strings - ) except + + ) except +libcudf_exception_handler cdef unique_ptr[column] tokenize_with_vocabulary( const column_view & strings, const tokenize_vocabulary & vocabulary, const string_scalar & delimiter, size_type default_id - ) except + + ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/partitioning.pxd b/python/pylibcudf/pylibcudf/libcudf/partitioning.pxd index 89bddbffab5..04566b6e40a 100644 --- a/python/pylibcudf/pylibcudf/libcudf/partitioning.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/partitioning.pxd @@ -1,10 +1,10 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. 
- cimport pylibcudf.libcudf.types as libcudf_types from libc.stdint cimport uint32_t from libcpp.memory cimport unique_ptr from libcpp.pair cimport pair from libcpp.vector cimport vector +from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.table.table cimport table @@ -17,18 +17,18 @@ cdef extern from "cudf/partitioning.hpp" namespace "cudf" nogil: const table_view& input, const vector[libcudf_types.size_type]& columns_to_hash, int num_partitions - ) except + + ) except +libcudf_exception_handler cdef pair[unique_ptr[table], vector[libcudf_types.size_type]] \ partition "cudf::partition" ( const table_view& t, const column_view& partition_map, int num_partitions - ) except + + ) except +libcudf_exception_handler cdef pair[unique_ptr[table], vector[libcudf_types.size_type]] \ round_robin_partition "cudf::round_robin_partition" ( const table_view& input, int num_partitions, int start_partition - ) except + + ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/quantiles.pxd b/python/pylibcudf/pylibcudf/libcudf/quantiles.pxd index cf2350fc36c..8f60302e776 100644 --- a/python/pylibcudf/pylibcudf/libcudf/quantiles.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/quantiles.pxd @@ -1,8 +1,8 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. - from libcpp cimport bool from libcpp.memory cimport unique_ptr from libcpp.vector cimport vector +from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.table.table cimport table @@ -24,7 +24,7 @@ cdef extern from "cudf/quantiles.hpp" namespace "cudf" nogil: interpolation interp, column_view ordered_indices, bool exact, - ) except + + ) except +libcudf_exception_handler cdef unique_ptr[table] quantiles ( table_view source_table, @@ -33,4 +33,4 @@ cdef extern from "cudf/quantiles.hpp" namespace "cudf" nogil: sorted is_input_sorted, vector[order] column_order, vector[null_order] null_precedence, - ) except + + ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/reduce.pxd b/python/pylibcudf/pylibcudf/libcudf/reduce.pxd index 6d2f4bd23d1..ad79187b733 100644 --- a/python/pylibcudf/pylibcudf/libcudf/reduce.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/reduce.pxd @@ -1,8 +1,8 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. 
- from libcpp cimport bool from libcpp.memory cimport unique_ptr from libcpp.utility cimport pair +from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.aggregation cimport reduce_aggregation, scan_aggregation from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view @@ -15,7 +15,7 @@ cdef extern from "cudf/reduction.hpp" namespace "cudf" nogil: column_view col, const reduce_aggregation& agg, data_type type - ) except + + ) except +libcudf_exception_handler cpdef enum class scan_type(bool): INCLUSIVE "cudf::scan_type::INCLUSIVE", @@ -25,9 +25,9 @@ cdef extern from "cudf/reduction.hpp" namespace "cudf" nogil: column_view col, const scan_aggregation& agg, scan_type inclusive - ) except + + ) except +libcudf_exception_handler cdef pair[unique_ptr[scalar], unique_ptr[scalar]] cpp_minmax "cudf::minmax" ( column_view col - ) except + + ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/replace.pxd b/python/pylibcudf/pylibcudf/libcudf/replace.pxd index 4ac44fc946e..bef5a25367b 100644 --- a/python/pylibcudf/pylibcudf/libcudf/replace.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/replace.pxd @@ -1,7 +1,7 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. - from libcpp cimport bool from libcpp.memory cimport unique_ptr +from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport ( column_view, @@ -18,32 +18,32 @@ cdef extern from "cudf/replace.hpp" namespace "cudf" nogil: cdef unique_ptr[column] replace_nulls( column_view source_column, - column_view replacement_column) except + + column_view replacement_column) except +libcudf_exception_handler cdef unique_ptr[column] replace_nulls( column_view source_column, - scalar replacement) except + + scalar replacement) except +libcudf_exception_handler cdef unique_ptr[column] replace_nulls( column_view source_column, - replace_policy replace_policy) except + + replace_policy replace_policy) except +libcudf_exception_handler cdef unique_ptr[column] find_and_replace_all( column_view source_column, column_view values_to_replace, - column_view replacement_values) except + + column_view replacement_values) except +libcudf_exception_handler cdef unique_ptr[column] clamp( column_view source_column, scalar lo, scalar lo_replace, - scalar hi, scalar hi_replace) except + + scalar hi, scalar hi_replace) except +libcudf_exception_handler cdef unique_ptr[column] clamp( column_view source_column, - scalar lo, scalar hi) except + + scalar lo, scalar hi) except +libcudf_exception_handler cdef unique_ptr[column] normalize_nans_and_zeros( - column_view source_column) except + + column_view source_column) except +libcudf_exception_handler cdef void normalize_nans_and_zeros( - mutable_column_view source_column) except + + mutable_column_view source_column) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/reshape.pxd b/python/pylibcudf/pylibcudf/libcudf/reshape.pxd index 446a082ab1b..92ab4773940 100644 --- a/python/pylibcudf/pylibcudf/libcudf/reshape.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/reshape.pxd @@ -1,6 +1,6 @@ # Copyright (c) 2019-2024, NVIDIA CORPORATION. 
- from libcpp.memory cimport unique_ptr +from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.table.table cimport table from pylibcudf.libcudf.table.table_view cimport table_view @@ -10,7 +10,7 @@ from pylibcudf.libcudf.types cimport size_type cdef extern from "cudf/reshape.hpp" namespace "cudf" nogil: cdef unique_ptr[column] interleave_columns( table_view source_table - ) except + + ) except +libcudf_exception_handler cdef unique_ptr[table] tile( table_view source_table, size_type count - ) except + + ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/rolling.pxd b/python/pylibcudf/pylibcudf/libcudf/rolling.pxd index 9e76faa0eba..0fd7eeb73c0 100644 --- a/python/pylibcudf/pylibcudf/libcudf/rolling.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/rolling.pxd @@ -1,6 +1,6 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. - from libcpp.memory cimport unique_ptr +from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.aggregation cimport rolling_aggregation from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view @@ -13,11 +13,11 @@ cdef extern from "cudf/rolling.hpp" namespace "cudf" nogil: column_view preceding_window, column_view following_window, size_type min_periods, - rolling_aggregation& agg) except + + rolling_aggregation& agg) except +libcudf_exception_handler cdef unique_ptr[column] rolling_window( column_view source, size_type preceding_window, size_type following_window, size_type min_periods, - rolling_aggregation& agg) except + + rolling_aggregation& agg) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/round.pxd b/python/pylibcudf/pylibcudf/libcudf/round.pxd index 1b65133f275..efd9e3de25d 100644 --- a/python/pylibcudf/pylibcudf/libcudf/round.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/round.pxd @@ -1,7 +1,7 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION. - from libc.stdint cimport int32_t from libcpp.memory cimport unique_ptr +from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view @@ -16,4 +16,4 @@ cdef extern from "cudf/round.hpp" namespace "cudf" nogil: const column_view& input, int32_t decimal_places, rounding_method method, - ) except + + ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/scalar/scalar.pxd b/python/pylibcudf/pylibcudf/libcudf/scalar/scalar.pxd index 4b40a8a26f6..2c67dc325af 100644 --- a/python/pylibcudf/pylibcudf/libcudf/scalar/scalar.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/scalar/scalar.pxd @@ -1,74 +1,76 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. 
- from libc.stdint cimport int32_t, int64_t from libcpp cimport bool from libcpp.string cimport string +from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.fixed_point.fixed_point cimport scale_type from pylibcudf.libcudf.table.table_view cimport table_view from pylibcudf.libcudf.types cimport data_type -from pylibcudf.libcudf.wrappers.decimals cimport scale_type cdef extern from "cudf/scalar/scalar.hpp" namespace "cudf" nogil: cdef cppclass scalar: - scalar() except + - scalar(scalar other) except + - data_type type() except + - void set_valid_async(bool is_valid) except + - bool is_valid() except + + scalar() except +libcudf_exception_handler + scalar(scalar other) except +libcudf_exception_handler + data_type type() except +libcudf_exception_handler + void set_valid_async(bool is_valid) except +libcudf_exception_handler + bool is_valid() except +libcudf_exception_handler cdef cppclass numeric_scalar[T](scalar): - numeric_scalar() except + - numeric_scalar(numeric_scalar other) except + - numeric_scalar(T value) except + - numeric_scalar(T value, bool is_valid) except + - void set_value(T value) except + - T value() except + + numeric_scalar() except +libcudf_exception_handler + numeric_scalar(numeric_scalar other) except +libcudf_exception_handler + numeric_scalar(T value) except +libcudf_exception_handler + numeric_scalar(T value, bool is_valid) except +libcudf_exception_handler + void set_value(T value) except +libcudf_exception_handler + T value() except +libcudf_exception_handler cdef cppclass timestamp_scalar[T](scalar): - timestamp_scalar() except + - timestamp_scalar(timestamp_scalar other) except + - timestamp_scalar(int64_t value) except + - timestamp_scalar(int64_t value, bool is_valid) except + - timestamp_scalar(int32_t value) except + - timestamp_scalar(int32_t value, bool is_valid) except + - int64_t ticks_since_epoch_64 "ticks_since_epoch"() except + - int32_t ticks_since_epoch_32 "ticks_since_epoch"() except + - T value() except + + timestamp_scalar() except +libcudf_exception_handler + timestamp_scalar(timestamp_scalar other) except +libcudf_exception_handler + timestamp_scalar(int64_t value) except +libcudf_exception_handler + timestamp_scalar(int64_t value, bool is_valid) except +libcudf_exception_handler + timestamp_scalar(int32_t value) except +libcudf_exception_handler + timestamp_scalar(int32_t value, bool is_valid) except +libcudf_exception_handler + int64_t ticks_since_epoch_64 "ticks_since_epoch"()\ + except +libcudf_exception_handler + int32_t ticks_since_epoch_32 "ticks_since_epoch"()\ + except +libcudf_exception_handler + T value() except +libcudf_exception_handler cdef cppclass duration_scalar[T](scalar): - duration_scalar() except + - duration_scalar(duration_scalar other) except + - duration_scalar(int64_t value) except + - duration_scalar(int64_t value, bool is_valid) except + - duration_scalar(int32_t value) except + - duration_scalar(int32_t value, bool is_valid) except + - int64_t ticks "count"() except + - T value() except + + duration_scalar() except +libcudf_exception_handler + duration_scalar(duration_scalar other) except +libcudf_exception_handler + duration_scalar(int64_t value) except +libcudf_exception_handler + duration_scalar(int64_t value, bool is_valid) except +libcudf_exception_handler + duration_scalar(int32_t value) except +libcudf_exception_handler + duration_scalar(int32_t value, bool is_valid) except +libcudf_exception_handler + 
int64_t ticks "count"() except +libcudf_exception_handler + T value() except +libcudf_exception_handler cdef cppclass string_scalar(scalar): - string_scalar() except + - string_scalar(string st) except + - string_scalar(string st, bool is_valid) except + - string_scalar(string_scalar other) except + - string to_string() except + + string_scalar() except +libcudf_exception_handler + string_scalar(string st) except +libcudf_exception_handler + string_scalar(string st, bool is_valid) except +libcudf_exception_handler + string_scalar(string_scalar other) except +libcudf_exception_handler + string to_string() except +libcudf_exception_handler cdef cppclass fixed_point_scalar[T](scalar): - fixed_point_scalar() except + + fixed_point_scalar() except +libcudf_exception_handler fixed_point_scalar(int64_t value, scale_type scale, - bool is_valid) except + + bool is_valid) except +libcudf_exception_handler fixed_point_scalar(data_type value, scale_type scale, - bool is_valid) except + - int64_t value() except + + bool is_valid) except +libcudf_exception_handler + int64_t value() except +libcudf_exception_handler # TODO: Figure out how to add an int32 overload of value() cdef cppclass list_scalar(scalar): - list_scalar(column_view col) except + - list_scalar(column_view col, bool is_valid) except + - column_view view() except + + list_scalar(column_view col) except +libcudf_exception_handler + list_scalar(column_view col, bool is_valid) except +libcudf_exception_handler + column_view view() except +libcudf_exception_handler cdef cppclass struct_scalar(scalar): - struct_scalar(table_view cols, bool valid) except + - table_view view() except + + struct_scalar(table_view cols, bool valid) except +libcudf_exception_handler + table_view view() except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/scalar/scalar_factories.pxd b/python/pylibcudf/pylibcudf/libcudf/scalar/scalar_factories.pxd index ee4b47935b2..9fb907970de 100644 --- a/python/pylibcudf/pylibcudf/libcudf/scalar/scalar_factories.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/scalar/scalar_factories.pxd @@ -1,13 +1,19 @@ # Copyright (c) 2024, NVIDIA CORPORATION. - from libcpp.memory cimport unique_ptr from libcpp.string cimport string +from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.scalar.scalar cimport scalar cdef extern from "cudf/scalar/scalar_factories.hpp" namespace "cudf" nogil: - cdef unique_ptr[scalar] make_string_scalar(const string & _string) except + - cdef unique_ptr[scalar] make_fixed_width_scalar[T](T value) except + + cdef unique_ptr[scalar] make_string_scalar( + const string & _string + ) except +libcudf_exception_handler + cdef unique_ptr[scalar] make_fixed_width_scalar[T]( + T value + ) except +libcudf_exception_handler - cdef unique_ptr[scalar] make_empty_scalar_like(const column_view &) except + + cdef unique_ptr[scalar] make_empty_scalar_like( + const column_view & + ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/search.pxd b/python/pylibcudf/pylibcudf/libcudf/search.pxd index 5a6ad5384c9..5ec06858baa 100644 --- a/python/pylibcudf/pylibcudf/libcudf/search.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/search.pxd @@ -1,8 +1,8 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. 
- cimport pylibcudf.libcudf.types as libcudf_types from libcpp.memory cimport unique_ptr from libcpp.vector cimport vector +from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.table.table_view cimport table_view @@ -15,16 +15,16 @@ cdef extern from "cudf/search.hpp" namespace "cudf" nogil: table_view needles, vector[libcudf_types.order] column_order, vector[libcudf_types.null_order] null_precedence, - ) except + + ) except +libcudf_exception_handler cdef unique_ptr[column] upper_bound( table_view haystack, table_view needles, vector[libcudf_types.order] column_order, vector[libcudf_types.null_order] null_precedence, - ) except + + ) except +libcudf_exception_handler cdef unique_ptr[column] contains( column_view haystack, column_view needles, - ) except + + ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/sorting.pxd b/python/pylibcudf/pylibcudf/libcudf/sorting.pxd index 9e899855486..342545a0eec 100644 --- a/python/pylibcudf/pylibcudf/libcudf/sorting.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/sorting.pxd @@ -1,9 +1,9 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. - cimport pylibcudf.libcudf.types as libcudf_types from libcpp cimport bool from libcpp.memory cimport unique_ptr from libcpp.vector cimport vector +from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.aggregation cimport rank_method from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view @@ -15,12 +15,14 @@ cdef extern from "cudf/sorting.hpp" namespace "cudf" nogil: cdef unique_ptr[column] sorted_order( table_view source_table, vector[libcudf_types.order] column_order, - vector[libcudf_types.null_order] null_precedence) except + + vector[libcudf_types.null_order] null_precedence + ) except +libcudf_exception_handler cdef unique_ptr[column] stable_sorted_order( table_view source_table, vector[libcudf_types.order] column_order, - vector[libcudf_types.null_order] null_precedence) except + + vector[libcudf_types.null_order] null_precedence + ) except +libcudf_exception_handler cdef unique_ptr[column] rank( column_view input_view, @@ -28,45 +30,52 @@ cdef extern from "cudf/sorting.hpp" namespace "cudf" nogil: libcudf_types.order column_order, libcudf_types.null_policy null_handling, libcudf_types.null_order null_precedence, - bool percentage) except + + bool percentage) except +libcudf_exception_handler cdef bool is_sorted( const table_view& table, vector[libcudf_types.order] column_order, - vector[libcudf_types.null_order] null_precedence) except + + vector[libcudf_types.null_order] null_precedence + ) except +libcudf_exception_handler cdef unique_ptr[table] segmented_sort_by_key( const table_view& values, const table_view& keys, const column_view& segment_offsets, vector[libcudf_types.order] column_order, - vector[libcudf_types.null_order] null_precedence) except + + vector[libcudf_types.null_order] null_precedence + ) except +libcudf_exception_handler cdef unique_ptr[table] stable_segmented_sort_by_key( const table_view& values, const table_view& keys, const column_view& segment_offsets, vector[libcudf_types.order] column_order, - vector[libcudf_types.null_order] null_precedence) except + + vector[libcudf_types.null_order] null_precedence + ) except +libcudf_exception_handler cdef unique_ptr[table] sort_by_key( const table_view& values, const 
table_view& keys, vector[libcudf_types.order] column_order, - vector[libcudf_types.null_order] null_precedence) except + + vector[libcudf_types.null_order] null_precedence + ) except +libcudf_exception_handler cdef unique_ptr[table] stable_sort_by_key( const table_view& values, const table_view& keys, vector[libcudf_types.order] column_order, - vector[libcudf_types.null_order] null_precedence) except + + vector[libcudf_types.null_order] null_precedence + ) except +libcudf_exception_handler cdef unique_ptr[table] sort( table_view source_table, vector[libcudf_types.order] column_order, - vector[libcudf_types.null_order] null_precedence) except + + vector[libcudf_types.null_order] null_precedence + ) except +libcudf_exception_handler cdef unique_ptr[table] stable_sort( table_view source_table, vector[libcudf_types.order] column_order, - vector[libcudf_types.null_order] null_precedence) except + + vector[libcudf_types.null_order] null_precedence + ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/stream_compaction.pxd b/python/pylibcudf/pylibcudf/libcudf/stream_compaction.pxd index 7830c9478c2..78b9bcb299b 100644 --- a/python/pylibcudf/pylibcudf/libcudf/stream_compaction.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/stream_compaction.pxd @@ -1,8 +1,8 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. - from libcpp cimport bool from libcpp.memory cimport unique_ptr from libcpp.vector cimport vector +from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.table.table cimport table @@ -23,25 +23,29 @@ cdef extern from "cudf/stream_compaction.hpp" namespace "cudf" nogil: KEEP_LAST KEEP_NONE - cdef unique_ptr[table] drop_nulls(table_view source_table, - vector[size_type] keys, - size_type keep_threshold) except + + cdef unique_ptr[table] drop_nulls( + table_view source_table, + vector[size_type] keys, + size_type keep_threshold + ) except +libcudf_exception_handler - cdef unique_ptr[table] drop_nans(table_view source_table, - vector[size_type] keys, - size_type keep_threshold) except + + cdef unique_ptr[table] drop_nans( + table_view source_table, + vector[size_type] keys, + size_type keep_threshold + ) except +libcudf_exception_handler cdef unique_ptr[table] apply_boolean_mask( table_view source_table, column_view boolean_mask - ) except + + ) except +libcudf_exception_handler cdef unique_ptr[table] unique( table_view input, vector[size_type] keys, duplicate_keep_option keep, null_equality nulls_equal, - ) except + + ) except +libcudf_exception_handler cdef unique_ptr[table] distinct( table_view input, @@ -49,14 +53,14 @@ cdef extern from "cudf/stream_compaction.hpp" namespace "cudf" nogil: duplicate_keep_option keep, null_equality nulls_equal, nan_equality nans_equals, - ) except + + ) except +libcudf_exception_handler cdef unique_ptr[column] distinct_indices( table_view input, duplicate_keep_option keep, null_equality nulls_equal, nan_equality nans_equal, - ) except + + ) except +libcudf_exception_handler cdef unique_ptr[table] stable_distinct( table_view input, @@ -64,22 +68,22 @@ cdef extern from "cudf/stream_compaction.hpp" namespace "cudf" nogil: duplicate_keep_option keep, null_equality nulls_equal, nan_equality nans_equal, - ) except + + ) except +libcudf_exception_handler cdef size_type unique_count( column_view column, null_policy null_handling, - nan_policy nan_handling) except + + nan_policy nan_handling) 
except +libcudf_exception_handler cdef size_type unique_count( table_view source_table, - null_policy null_handling) except + + null_policy null_handling) except +libcudf_exception_handler cdef size_type distinct_count( column_view column, null_policy null_handling, - nan_policy nan_handling) except + + nan_policy nan_handling) except +libcudf_exception_handler cdef size_type distinct_count( table_view source_table, - null_policy null_handling) except + + null_policy null_handling) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/CMakeLists.txt b/python/pylibcudf/pylibcudf/libcudf/strings/CMakeLists.txt index abf4357f862..f5f2113332a 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/libcudf/strings/CMakeLists.txt @@ -12,7 +12,7 @@ # the License. # ============================================================================= -set(cython_sources char_types.pyx regex_flags.pyx side_type.pyx) +set(cython_sources char_types.pyx combine.pyx regex_flags.pyx side_type.pyx translate.pyx) set(linked_libraries cudf::cudf) diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/attributes.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/attributes.pxd index 5e510339834..1cf3c912f95 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/attributes.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/attributes.pxd @@ -1,6 +1,6 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. - from libcpp.memory cimport unique_ptr +from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view @@ -8,10 +8,10 @@ from pylibcudf.libcudf.column.column_view cimport column_view cdef extern from "cudf/strings/attributes.hpp" namespace "cudf::strings" nogil: cdef unique_ptr[column] count_characters( - column_view source_strings) except + + column_view source_strings) except +libcudf_exception_handler cdef unique_ptr[column] count_bytes( - column_view source_strings) except + + column_view source_strings) except +libcudf_exception_handler cdef unique_ptr[column] code_points( - column_view source_strings) except + + column_view source_strings) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/capitalize.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/capitalize.pxd index 77e3f46d7ee..a3815757d2d 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/capitalize.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/capitalize.pxd @@ -1,5 +1,6 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. 
from libcpp.memory cimport unique_ptr +from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.scalar.scalar cimport string_scalar @@ -10,12 +11,12 @@ cdef extern from "cudf/strings/capitalize.hpp" namespace "cudf::strings" nogil: cdef unique_ptr[column] capitalize( const column_view & strings, const string_scalar & delimiters - ) except + + ) except +libcudf_exception_handler cdef unique_ptr[column] title( const column_view & strings, string_character_types sequence_type - ) except + + ) except +libcudf_exception_handler cdef unique_ptr[column] is_title( - const column_view & strings) except + + const column_view & strings) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/case.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/case.pxd index 7869e90f387..7e60476b87b 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/case.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/case.pxd @@ -1,21 +1,22 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr +from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view cdef extern from "cudf/strings/case.hpp" namespace "cudf::strings" nogil: cdef unique_ptr[column] capitalize( - const column_view & input) except + + const column_view & input) except +libcudf_exception_handler cdef unique_ptr[column] is_title( - const column_view & input) except + + const column_view & input) except +libcudf_exception_handler cdef unique_ptr[column] to_lower( - const column_view & strings) except + + const column_view & strings) except +libcudf_exception_handler cdef unique_ptr[column] to_upper( - const column_view & strings) except + + const column_view & strings) except +libcudf_exception_handler cdef unique_ptr[column] swapcase( - const column_view & strings) except + + const column_view & strings) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/char_types.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/char_types.pxd index 5d54c1c3593..6a0ae06c08a 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/char_types.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/char_types.pxd @@ -1,7 +1,7 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION. 
- from libc.stdint cimport uint32_t from libcpp.memory cimport unique_ptr +from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.scalar.scalar cimport string_scalar @@ -22,16 +22,13 @@ cdef extern from "cudf/strings/char_types/char_types.hpp" \ CASE_TYPES ALL_TYPES -cdef extern from "cudf/strings/char_types/char_types.hpp" \ - namespace "cudf::strings" nogil: - cdef unique_ptr[column] all_characters_of_type( column_view source_strings, string_character_types types, - string_character_types verify_types) except + + string_character_types verify_types) except +libcudf_exception_handler cdef unique_ptr[column] filter_characters_of_type( column_view source_strings, string_character_types types_to_remove, string_scalar replacement, - string_character_types types_to_keep) except + + string_character_types types_to_keep) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/combine.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/combine.pxd index e4c9fa5817a..90be281429b 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/combine.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/combine.pxd @@ -1,6 +1,8 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. +from libcpp cimport int from libcpp.memory cimport unique_ptr +from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.scalar.scalar cimport string_scalar @@ -9,23 +11,31 @@ from pylibcudf.libcudf.table.table_view cimport table_view cdef extern from "cudf/strings/combine.hpp" namespace "cudf::strings" nogil: - ctypedef enum separator_on_nulls: - YES 'cudf::strings::separator_on_nulls::YES' - NO 'cudf::strings::separator_on_nulls::NO' + cpdef enum class separator_on_nulls(int): + YES + NO - ctypedef enum output_if_empty_list: - EMPTY_STRING 'cudf::strings::output_if_empty_list::EMPTY_STRING' - NULL_ELEMENT 'cudf::strings::output_if_empty_list::NULL_ELEMENT' + cpdef enum class output_if_empty_list(int): + EMPTY_STRING + NULL_ELEMENT cdef unique_ptr[column] concatenate( - table_view source_strings, + table_view strings_columns, string_scalar separator, - string_scalar narep) except + + string_scalar narep, + separator_on_nulls separate_nulls) except +libcudf_exception_handler + + cdef unique_ptr[column] concatenate( + table_view strings_columns, + column_view separators, + string_scalar separator_narep, + string_scalar col_narep, + separator_on_nulls separate_nulls) except +libcudf_exception_handler cdef unique_ptr[column] join_strings( - column_view source_strings, + column_view input, string_scalar separator, - string_scalar narep) except + + string_scalar narep) except +libcudf_exception_handler cdef unique_ptr[column] join_list_elements( column_view lists_strings_column, @@ -33,11 +43,11 @@ cdef extern from "cudf/strings/combine.hpp" namespace "cudf::strings" nogil: string_scalar separator_narep, string_scalar string_narep, separator_on_nulls separate_nulls, - output_if_empty_list empty_list_policy) except + + output_if_empty_list empty_list_policy) except +libcudf_exception_handler cdef unique_ptr[column] join_list_elements( column_view lists_strings_column, string_scalar separator, string_scalar narep, separator_on_nulls separate_nulls, - output_if_empty_list empty_list_policy) except + + output_if_empty_list 
empty_list_policy) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/combine.pyx b/python/pylibcudf/pylibcudf/libcudf/strings/combine.pyx new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/contains.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/contains.pxd index eac0f748257..8eb287c6b06 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/contains.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/contains.pxd @@ -1,6 +1,6 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. - from libcpp.memory cimport unique_ptr +from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.scalar.scalar cimport string_scalar @@ -11,22 +11,22 @@ cdef extern from "cudf/strings/contains.hpp" namespace "cudf::strings" nogil: cdef unique_ptr[column] contains_re( column_view source_strings, - regex_program) except + + regex_program) except +libcudf_exception_handler cdef unique_ptr[column] count_re( column_view source_strings, - regex_program) except + + regex_program) except +libcudf_exception_handler cdef unique_ptr[column] matches_re( column_view source_strings, - regex_program) except + + regex_program) except +libcudf_exception_handler cdef unique_ptr[column] like( column_view source_strings, string_scalar pattern, - string_scalar escape_character) except + + string_scalar escape_character) except +libcudf_exception_handler cdef unique_ptr[column] like( column_view source_strings, column_view patterns, - string_scalar escape_character) except + + string_scalar escape_character) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_booleans.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_booleans.pxd index 83a9573baad..37f39b098b3 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_booleans.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_booleans.pxd @@ -1,5 +1,6 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr +from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.scalar.scalar cimport string_scalar @@ -8,10 +9,10 @@ from pylibcudf.libcudf.scalar.scalar cimport string_scalar cdef extern from "cudf/strings/convert/convert_booleans.hpp" namespace \ "cudf::strings" nogil: cdef unique_ptr[column] to_booleans( - column_view input_col, - string_scalar true_string) except + + column_view input, + string_scalar true_string) except +libcudf_exception_handler cdef unique_ptr[column] from_booleans( - column_view input_col, + column_view booleans, string_scalar true_string, - string_scalar false_string) except + + string_scalar false_string) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_datetime.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_datetime.pxd index fa8975c4df9..c316b7891a3 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_datetime.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_datetime.pxd @@ -1,7 +1,7 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. 
- from libcpp.memory cimport unique_ptr from libcpp.string cimport string +from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.types cimport data_type @@ -10,15 +10,15 @@ from pylibcudf.libcudf.types cimport data_type cdef extern from "cudf/strings/convert/convert_datetime.hpp" namespace \ "cudf::strings" nogil: cdef unique_ptr[column] to_timestamps( - column_view input_col, + column_view input, data_type timestamp_type, - string format) except + + string format) except +libcudf_exception_handler cdef unique_ptr[column] from_timestamps( - column_view input_col, + column_view timestamps, string format, - column_view input_strings_names) except + + column_view names) except +libcudf_exception_handler cdef unique_ptr[column] is_timestamp( column_view input_col, - string format) except + + string format) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_durations.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_durations.pxd index ebe10574353..75374208172 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_durations.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_durations.pxd @@ -1,7 +1,7 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. - from libcpp.memory cimport unique_ptr from libcpp.string cimport string +from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.types cimport data_type @@ -10,10 +10,10 @@ from pylibcudf.libcudf.types cimport data_type cdef extern from "cudf/strings/convert/convert_durations.hpp" namespace \ "cudf::strings" nogil: cdef unique_ptr[column] to_durations( - const column_view & strings_col, + const column_view & input, data_type duration_type, - const string & format) except + + const string & format) except +libcudf_exception_handler cdef unique_ptr[column] from_durations( const column_view & durations, - const string & format) except + + const string & format) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_fixed_point.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_fixed_point.pxd index 6f820f3c9a4..71c866ad211 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_fixed_point.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_fixed_point.pxd @@ -1,6 +1,6 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION. 
- from libcpp.memory cimport unique_ptr +from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.types cimport data_type @@ -9,13 +9,13 @@ from pylibcudf.libcudf.types cimport data_type cdef extern from "cudf/strings/convert/convert_fixed_point.hpp" namespace \ "cudf::strings" nogil: cdef unique_ptr[column] to_fixed_point( - column_view input_col, - data_type output_type) except + + column_view input, + data_type output_type) except +libcudf_exception_handler cdef unique_ptr[column] from_fixed_point( - column_view input_col) except + + column_view input) except +libcudf_exception_handler cdef unique_ptr[column] is_fixed_point( - column_view source_strings, - data_type output_type - ) except + + column_view input, + data_type decimal_type + ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_floats.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_floats.pxd index f4fc4674506..7df6b914458 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_floats.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_floats.pxd @@ -1,6 +1,6 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION. - from libcpp.memory cimport unique_ptr +from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.types cimport data_type @@ -9,12 +9,12 @@ from pylibcudf.libcudf.types cimport data_type cdef extern from "cudf/strings/convert/convert_floats.hpp" namespace \ "cudf::strings" nogil: cdef unique_ptr[column] to_floats( - column_view input_col, - data_type output_type) except + + column_view strings, + data_type output_type) except +libcudf_exception_handler cdef unique_ptr[column] from_floats( - column_view input_col) except + + column_view floats) except +libcudf_exception_handler cdef unique_ptr[column] is_float( - column_view source_strings - ) except + + column_view input + ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_integers.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_integers.pxd index f12aab0a2e4..4033ef51480 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_integers.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_integers.pxd @@ -1,6 +1,6 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION. 
- from libcpp.memory cimport unique_ptr +from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.types cimport data_type @@ -9,23 +9,28 @@ from pylibcudf.libcudf.types cimport data_type cdef extern from "cudf/strings/convert/convert_integers.hpp" namespace \ "cudf::strings" nogil: cdef unique_ptr[column] to_integers( - column_view input_col, - data_type output_type) except + + column_view input, + data_type output_type) except +libcudf_exception_handler cdef unique_ptr[column] from_integers( - column_view input_col) except + + column_view integers) except +libcudf_exception_handler + + cdef unique_ptr[column] is_integer( + column_view input + ) except +libcudf_exception_handler cdef unique_ptr[column] is_integer( - column_view source_strings - ) except + + column_view input, + data_type int_type + ) except +libcudf_exception_handler cdef unique_ptr[column] hex_to_integers( - column_view input_col, - data_type output_type) except + + column_view input, + data_type output_type) except +libcudf_exception_handler cdef unique_ptr[column] is_hex( - column_view source_strings - ) except + + column_view input + ) except +libcudf_exception_handler cdef unique_ptr[column] integers_to_hex( - column_view input_col) except + + column_view input) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_ipv4.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_ipv4.pxd index fe571cfced6..33f9c798ae6 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_ipv4.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_ipv4.pxd @@ -1,6 +1,6 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. - from libcpp.memory cimport unique_ptr +from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view @@ -8,11 +8,11 @@ from pylibcudf.libcudf.column.column_view cimport column_view cdef extern from "cudf/strings/convert/convert_ipv4.hpp" namespace \ "cudf::strings" nogil: cdef unique_ptr[column] ipv4_to_integers( - column_view input_col) except + + column_view input) except +libcudf_exception_handler cdef unique_ptr[column] integers_to_ipv4( - column_view input_col) except + + column_view integers) except +libcudf_exception_handler cdef unique_ptr[column] is_ipv4( - column_view source_strings - ) except + + column_view input + ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_lists.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_lists.pxd index 109111568d8..3d0a677424e 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_lists.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_lists.pxd @@ -1,5 +1,6 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION.
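Alongside the handler change, `convert_integers.pxd` gains a second `is_integer` overload that also checks whether each parsed value fits a given integer `data_type`. A wrapper can dispatch between the two overloads on an optional argument; the sketch below assumes pylibcudf's usual `Column`/`DataType` conventions and is illustrative rather than a copy of the actual wrapper:

# Hypothetical wrapper dispatching between the two is_integer overloads.
from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move
from pylibcudf.column cimport Column
from pylibcudf.types cimport DataType
from pylibcudf.libcudf.column.column cimport column
cimport pylibcudf.libcudf.strings.convert.convert_integers as cpp_integers

cpdef Column is_integer(Column input, DataType int_type=None):
    cdef unique_ptr[column] c_result
    if int_type is None:
        with nogil:
            # One-argument overload: syntactic integer check only.
            c_result = cpp_integers.is_integer(input.view())
    else:
        with nogil:
            # Two-argument overload: also checks each value fits int_type.
            c_result = cpp_integers.is_integer(input.view(), int_type.c_obj)
    return Column.from_libcudf(move(c_result))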
from libcpp.memory cimport unique_ptr +from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.scalar.scalar cimport string_scalar @@ -9,6 +10,6 @@ cdef extern from "cudf/strings/convert/convert_lists.hpp" namespace \ "cudf::strings" nogil: cdef unique_ptr[column] format_list_column( - column_view input_col, + column_view input, string_scalar na_rep, - column_view separators) except + + column_view separators) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_urls.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_urls.pxd index 5c07b698454..03a14e215e0 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_urls.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_urls.pxd @@ -1,6 +1,6 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. - from libcpp.memory cimport unique_ptr +from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view @@ -8,7 +8,7 @@ from pylibcudf.libcudf.column.column_view cimport column_view cdef extern from "cudf/strings/convert/convert_urls.hpp" namespace \ "cudf::strings" nogil: cdef unique_ptr[column] url_encode( - column_view input_col) except + + column_view input) except +libcudf_exception_handler cdef unique_ptr[column] url_decode( - column_view input_col) except + + column_view input) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/extract.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/extract.pxd index b7166167cfd..18608554921 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/extract.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/extract.pxd @@ -1,6 +1,6 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. - from libcpp.memory cimport unique_ptr +from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.strings.regex_program cimport regex_program @@ -11,8 +11,8 @@ cdef extern from "cudf/strings/extract.hpp" namespace "cudf::strings" nogil: cdef unique_ptr[table] extract( column_view input, - regex_program prog) except + + regex_program prog) except +libcudf_exception_handler cdef unique_ptr[column] extract_all_record( column_view input, - regex_program prog) except + + regex_program prog) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/find.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/find.pxd index 1d1df1b8b8e..4082145c5b8 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/find.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/find.pxd @@ -1,7 +1,7 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. 
- from libcpp.memory cimport unique_ptr from libcpp.string cimport string +from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.scalar.scalar cimport string_scalar @@ -12,41 +12,41 @@ cdef extern from "cudf/strings/find.hpp" namespace "cudf::strings" nogil: cdef unique_ptr[column] contains( column_view source_strings, - string_scalar target) except + + string_scalar target) except +libcudf_exception_handler cdef unique_ptr[column] contains( column_view source_strings, - column_view target_strings) except + + column_view target_strings) except +libcudf_exception_handler cdef unique_ptr[column] ends_with( column_view source_strings, - string_scalar target) except + + string_scalar target) except +libcudf_exception_handler cdef unique_ptr[column] ends_with( column_view source_strings, - column_view target_strings) except + + column_view target_strings) except +libcudf_exception_handler cdef unique_ptr[column] starts_with( column_view source_strings, - string_scalar target) except + + string_scalar target) except +libcudf_exception_handler cdef unique_ptr[column] starts_with( column_view source_strings, - column_view target_strings) except + + column_view target_strings) except +libcudf_exception_handler cdef unique_ptr[column] find( column_view source_strings, string_scalar target, size_type start, - size_type stop) except + + size_type stop) except +libcudf_exception_handler cdef unique_ptr[column] find( column_view source_strings, column_view target, - size_type start) except + + size_type start) except +libcudf_exception_handler cdef unique_ptr[column] rfind( column_view source_strings, string_scalar target, size_type start, - size_type stop) except + + size_type stop) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/find_multiple.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/find_multiple.pxd index 0491644a10a..b03044db4f4 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/find_multiple.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/find_multiple.pxd @@ -1,6 +1,6 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. - from libcpp.memory cimport unique_ptr +from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view @@ -9,5 +9,5 @@ cdef extern from "cudf/strings/find_multiple.hpp" namespace "cudf::strings" \ nogil: cdef unique_ptr[column] find_multiple( - column_view source_strings, - column_view targets) except + + column_view input, + column_view targets) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/findall.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/findall.pxd index e0a8b776465..eda68c35e58 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/findall.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/findall.pxd @@ -1,6 +1,6 @@ # Copyright (c) 2019-2024, NVIDIA CORPORATION. 
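The paired overloads in `find.pxd` (scalar target vs. column target) surface in Python as one function that accepts either: with a scalar, every row is searched for the same needle; with a column, row i of the input is searched for row i of the targets. A hedged usage sketch, assuming a CUDA-capable environment and pyarrow interop:

import pyarrow as pa
import pylibcudf as plc

strings = plc.interop.from_arrow(pa.array(["quick", "brown", "fox"]))

# Scalar target: one needle for every row.
needle = plc.interop.from_arrow(pa.scalar("o"))
has_o = plc.strings.find.contains(strings, needle)        # BOOL8 column

# Column target: a per-row needle.
needles = plc.interop.from_arrow(pa.array(["q", "x", "f"]))
per_row = plc.strings.find.contains(strings, needles)     # BOOL8 column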
- from libcpp.memory cimport unique_ptr +from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.strings.regex_program cimport regex_program @@ -10,4 +10,8 @@ cdef extern from "cudf/strings/findall.hpp" namespace "cudf::strings" nogil: cdef unique_ptr[column] findall( column_view input, - regex_program prog) except + + regex_program prog) except +libcudf_exception_handler + + cdef unique_ptr[column] find_re( + column_view input, + regex_program prog) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/json.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/json.pxd deleted file mode 100644 index 571ba7be7af..00000000000 --- a/python/pylibcudf/pylibcudf/libcudf/strings/json.pxd +++ /dev/null @@ -1,27 +0,0 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. - -from libcpp cimport bool -from libcpp.memory cimport unique_ptr -from libcpp.string cimport string -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.scalar.scalar cimport scalar, string_scalar - - -cdef extern from "cudf/json/json.hpp" namespace "cudf" nogil: - cdef cppclass get_json_object_options: - get_json_object_options() except + - # getters - bool get_allow_single_quotes() except + - bool get_strip_quotes_from_single_strings() except + - bool get_missing_fields_as_nulls() except + - # setters - void set_allow_single_quotes(bool val) except + - void set_strip_quotes_from_single_strings(bool val) except + - void set_missing_fields_as_nulls(bool val) except + - - cdef unique_ptr[column] get_json_object( - column_view col, - string_scalar json_path, - get_json_object_options options, - ) except + diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/padding.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/padding.pxd index 657fe61eb14..d82f76f98b6 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/padding.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/padding.pxd @@ -2,6 +2,7 @@ from libc.stdint cimport int32_t from libcpp.memory cimport unique_ptr from libcpp.string cimport string +from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.scalar.scalar cimport string_scalar @@ -12,11 +13,11 @@ from pylibcudf.libcudf.types cimport size_type cdef extern from "cudf/strings/padding.hpp" namespace "cudf::strings" nogil: cdef unique_ptr[column] pad( - column_view source_strings, + column_view input, size_type width, side_type side, - string fill_char) except + + string fill_char) except +libcudf_exception_handler cdef unique_ptr[column] zfill( - column_view source_strings, - size_type width) except + + column_view input, + size_type width) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/regex_flags.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/regex_flags.pxd index 41617f157b7..1aa22107183 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/regex_flags.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/regex_flags.pxd @@ -1,6 +1,6 @@ # Copyright (c) 2022-2024, NVIDIA CORPORATION. 
- from libc.stdint cimport int32_t +from pylibcudf.exception_handler cimport libcudf_exception_handler cdef extern from "cudf/strings/regex/flags.hpp" \ diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/regex_program.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/regex_program.pxd index 5d1d9e583d5..21f52f3de24 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/regex_program.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/regex_program.pxd @@ -1,7 +1,7 @@ # Copyright (c) 2022-2024, NVIDIA CORPORATION. - from libcpp.memory cimport unique_ptr from libcpp.string cimport string +from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.strings.regex_flags cimport regex_flags @@ -14,4 +14,4 @@ cdef extern from "cudf/strings/regex/regex_program.hpp" \ unique_ptr[regex_program] create( string pattern, regex_flags flags - ) except + + ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/repeat.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/repeat.pxd index 59262820411..de65b554eba 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/repeat.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/repeat.pxd @@ -1,6 +1,6 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION. - from libcpp.memory cimport unique_ptr +from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.types cimport size_type @@ -11,8 +11,8 @@ cdef extern from "cudf/strings/repeat_strings.hpp" namespace "cudf::strings" \ cdef unique_ptr[column] repeat_strings( column_view input, - size_type repeat_times) except + + size_type repeat_times) except +libcudf_exception_handler cdef unique_ptr[column] repeat_strings( column_view input, - column_view repeat_times) except + + column_view repeat_times) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/replace.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/replace.pxd index fd5f4fc4751..68743d85712 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/replace.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/replace.pxd @@ -1,8 +1,8 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. 
- from libc.stdint cimport int32_t from libcpp.memory cimport unique_ptr from libcpp.string cimport string +from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.scalar.scalar cimport string_scalar @@ -14,15 +14,15 @@ cdef extern from "cudf/strings/replace.hpp" namespace "cudf::strings" nogil: column_view source_strings, string_scalar repl, size_type start, - size_type stop) except + + size_type stop) except +libcudf_exception_handler cdef unique_ptr[column] replace( column_view source_strings, string_scalar target, string_scalar repl, - int32_t maxrepl) except + + int32_t maxrepl) except +libcudf_exception_handler cdef unique_ptr[column] replace_multiple( column_view source_strings, column_view target_strings, - column_view repl_strings) except + + column_view repl_strings) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/replace_re.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/replace_re.pxd index 40f0e2fa50c..2a7d50346be 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/replace_re.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/replace_re.pxd @@ -1,11 +1,12 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. - from libcpp.memory cimport unique_ptr from libcpp.string cimport string from libcpp.vector cimport vector +from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.scalar.scalar cimport string_scalar +from pylibcudf.libcudf.strings.regex_flags cimport regex_flags from pylibcudf.libcudf.strings.regex_program cimport regex_program from pylibcudf.libcudf.table.table cimport table from pylibcudf.libcudf.types cimport size_type @@ -14,17 +15,18 @@ from pylibcudf.libcudf.types cimport size_type cdef extern from "cudf/strings/replace_re.hpp" namespace "cudf::strings" nogil: cdef unique_ptr[column] replace_re( - column_view source_strings, - regex_program, - string_scalar repl, - size_type maxrepl) except + - - cdef unique_ptr[column] replace_with_backrefs( - column_view source_strings, - regex_program, - string repl) except + + column_view input, + regex_program prog, + string_scalar replacement, + size_type max_replace_count) except +libcudf_exception_handler cdef unique_ptr[column] replace_re( - column_view source_strings, + column_view input, vector[string] patterns, - column_view repls) except + + column_view replacements, + regex_flags flags) except +libcudf_exception_handler + + cdef unique_ptr[column] replace_with_backrefs( + column_view input, + regex_program prog, + string replacement) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/side_type.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/side_type.pxd index 019ff3f17ba..9626763d5af 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/side_type.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/side_type.pxd @@ -1,12 +1,11 @@ # Copyright (c) 2022-2024, NVIDIA CORPORATION. 
-from libc.stdint cimport int32_t +from libcpp cimport int +from pylibcudf.exception_handler cimport libcudf_exception_handler cdef extern from "cudf/strings/side_type.hpp" namespace "cudf::strings" nogil: - cpdef enum class side_type(int32_t): - LEFT 'cudf::strings::side_type::LEFT' - RIGHT 'cudf::strings::side_type::RIGHT' - BOTH 'cudf::strings::side_type::BOTH' - -ctypedef int32_t underlying_type_t_side_type + cpdef enum class side_type(int): + LEFT + RIGHT + BOTH diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/split/partition.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/split/partition.pxd index 4162e886a7d..d1a2ddbaef4 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/split/partition.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/split/partition.pxd @@ -1,7 +1,7 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. - from libcpp.memory cimport unique_ptr from libcpp.string cimport string +from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.scalar.scalar cimport string_scalar @@ -12,9 +12,9 @@ cdef extern from "cudf/strings/split/partition.hpp" namespace \ "cudf::strings" nogil: cdef unique_ptr[table] partition( - column_view source_strings, - string_scalar delimiter) except + + column_view input, + string_scalar delimiter) except +libcudf_exception_handler cdef unique_ptr[table] rpartition( - column_view source_strings, - string_scalar delimiter) except + + column_view input, + string_scalar delimiter) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/split/split.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/split/split.pxd index 3046149aebb..34fb72a3b33 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/split/split.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/split/split.pxd @@ -1,7 +1,7 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. 
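The `side_type` rewrite is more than cosmetic: a `cpdef enum class` makes Cython generate a real Python enum from the C++ one, which removes both the hand-spelled `'cudf::strings::side_type::LEFT'` name strings and the `underlying_type_t_side_type` ctypedef that wrappers previously needed for casting. A hedged sketch of what this enables (the `SideType` alias and module path are assumed from pylibcudf's layout):

# With a cpdef enum class, the C++ enum is importable directly from Python.
from pylibcudf.strings.side_type import SideType

side = SideType.BOTH
print(side.name, side == SideType.BOTH)   # "BOTH" True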
- from libcpp.memory cimport unique_ptr from libcpp.string cimport string +from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.scalar.scalar cimport string_scalar @@ -14,45 +14,45 @@ cdef extern from "cudf/strings/split/split.hpp" namespace \ "cudf::strings" nogil: cdef unique_ptr[table] split( - column_view source_strings, + column_view strings_column, string_scalar delimiter, - size_type maxsplit) except + + size_type maxsplit) except +libcudf_exception_handler cdef unique_ptr[table] rsplit( - column_view source_strings, + column_view strings_column, string_scalar delimiter, - size_type maxsplit) except + + size_type maxsplit) except +libcudf_exception_handler cdef unique_ptr[column] split_record( - column_view source_strings, + column_view strings, string_scalar delimiter, - size_type maxsplit) except + + size_type maxsplit) except +libcudf_exception_handler cdef unique_ptr[column] rsplit_record( - column_view source_strings, + column_view strings, string_scalar delimiter, - size_type maxsplit) except + + size_type maxsplit) except +libcudf_exception_handler cdef extern from "cudf/strings/split/split_re.hpp" namespace \ "cudf::strings" nogil: cdef unique_ptr[table] split_re( - const column_view& source_strings, - regex_program, - size_type maxsplit) except + + const column_view& input, + regex_program prog, + size_type maxsplit) except +libcudf_exception_handler cdef unique_ptr[table] rsplit_re( - const column_view& source_strings, - regex_program, - size_type maxsplit) except + + const column_view& input, + regex_program prog, + size_type maxsplit) except +libcudf_exception_handler cdef unique_ptr[column] split_record_re( - const column_view& source_strings, - regex_program, - size_type maxsplit) except + + const column_view& input, + regex_program prog, + size_type maxsplit) except +libcudf_exception_handler cdef unique_ptr[column] rsplit_record_re( - const column_view& source_strings, - regex_program, - size_type maxsplit) except + + const column_view& input, + regex_program prog, + size_type maxsplit) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/strip.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/strip.pxd index b0ca771762d..41751ddff3c 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/strip.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/strip.pxd @@ -1,6 +1,6 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. 
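These `split.pxd` declarations distinguish two result shapes: `split`/`rsplit` return a table whose i-th column holds the i-th piece of each row, while the `_record` variants return a single column of lists. A hedged usage sketch (pyarrow interop assumed; `maxsplit=-1` means no limit):

import pyarrow as pa
import pylibcudf as plc

s = plc.interop.from_arrow(pa.array(["a,b,c", "d,e", "f"]))
comma = plc.interop.from_arrow(pa.scalar(","))

parts = plc.strings.split.split(s, comma, -1)            # Table of STRING columns
records = plc.strings.split.split_record(s, comma, -1)   # one LIST<STRING> column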
- from libcpp.memory cimport unique_ptr +from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.scalar.scalar cimport string_scalar @@ -10,6 +10,6 @@ from pylibcudf.libcudf.strings.side_type cimport side_type cdef extern from "cudf/strings/strip.hpp" namespace "cudf::strings" nogil: cdef unique_ptr[column] strip( - column_view source_strings, - side_type stype, - string_scalar to_strip) except + + column_view input, + side_type side, + string_scalar to_strip) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/substring.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/substring.pxd index 576dae9387f..f573870583d 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/substring.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/substring.pxd @@ -1,6 +1,6 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. - from libcpp.memory cimport unique_ptr +from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.scalar.scalar cimport numeric_scalar @@ -12,9 +12,9 @@ cdef extern from "cudf/strings/slice.hpp" namespace "cudf::strings" nogil: column_view source_strings, numeric_scalar[size_type] start, numeric_scalar[size_type] end, - numeric_scalar[size_type] step) except + + numeric_scalar[size_type] step) except +libcudf_exception_handler cdef unique_ptr[column] slice_strings( column_view source_strings, column_view starts, - column_view stops) except + + column_view stops) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/translate.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/translate.pxd index 85fa719128a..11b63d0ed30 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/translate.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/translate.pxd @@ -1,9 +1,9 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. 
- from libcpp cimport bool from libcpp.memory cimport unique_ptr from libcpp.pair cimport pair from libcpp.vector cimport vector +from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.scalar.scalar cimport string_scalar @@ -13,15 +13,16 @@ from pylibcudf.libcudf.types cimport char_utf8 cdef extern from "cudf/strings/translate.hpp" namespace "cudf::strings" nogil: cdef unique_ptr[column] translate( - column_view source_strings, - vector[pair[char_utf8, char_utf8]] chars_table) except + + column_view input, + vector[pair[char_utf8, char_utf8]] chars_table + ) except +libcudf_exception_handler - ctypedef enum filter_type: - KEEP 'cudf::strings::filter_type::KEEP', - REMOVE 'cudf::strings::filter_type::REMOVE' + cpdef enum class filter_type(bool): + KEEP + REMOVE cdef unique_ptr[column] filter_characters( - column_view source_strings, - vector[pair[char_utf8, char_utf8]] chars_table, - filter_type keep, - string_scalar replacement) except + + column_view input, + vector[pair[char_utf8, char_utf8]] characters_to_filter, + filter_type keep_characters, + string_scalar replacement) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/translate.pyx b/python/pylibcudf/pylibcudf/libcudf/strings/translate.pyx new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/wrap.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/wrap.pxd index c0053391328..2fb49c2a830 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/wrap.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/wrap.pxd @@ -1,6 +1,6 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. - from libcpp.memory cimport unique_ptr +from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.types cimport size_type @@ -9,5 +9,5 @@ from pylibcudf.libcudf.types cimport size_type cdef extern from "cudf/strings/wrap.hpp" namespace "cudf::strings" nogil: cdef unique_ptr[column] wrap( - column_view source_strings, - size_type width) except + + column_view input, + size_type width) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/strings_udf.pxd b/python/pylibcudf/pylibcudf/libcudf/strings_udf.pxd index 0c8fe1060ac..a2654eaab16 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings_udf.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings_udf.pxd @@ -1,14 +1,14 @@ # Copyright (c) 2022-2024, NVIDIA CORPORATION. 
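Two details in the `translate` change go together: `filter_type` turns from a `ctypedef enum` into a `cpdef enum class`, and an empty `translate.pyx` appears next to the `.pxd`. A `cpdef` enum in a declaration file makes Cython emit a Python-visible enum, and the generated code needs a compiled module to live in, which is what the empty implementation file provides. A hedged sketch of the resulting Python-side usage (alias and module path assumed):

# The generated enum replaces raw integers/booleans at Python call sites.
from pylibcudf.strings.translate import FilterType

keep = FilterType.KEEP
remove = FilterType.REMOVE
print(keep != remove)   # True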
- from libc.stdint cimport uint8_t, uint16_t from libcpp.memory cimport unique_ptr from libcpp.string cimport string from libcpp.vector cimport vector +from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.types cimport size_type -from rmm._lib.device_buffer cimport DeviceBuffer, device_buffer +from rmm.librmm.device_buffer cimport device_buffer cdef extern from "cudf/strings/udf/udf_string.hpp" namespace \ @@ -17,17 +17,19 @@ cdef extern from "cudf/strings/udf/udf_string.hpp" namespace \ cdef extern from "cudf/strings/udf/udf_apis.hpp" namespace \ "cudf::strings::udf" nogil: - cdef int get_cuda_build_version() except + - cdef unique_ptr[device_buffer] to_string_view_array(column_view) except + + cdef int get_cuda_build_version() except +libcudf_exception_handler + cdef unique_ptr[device_buffer] to_string_view_array( + column_view + ) except +libcudf_exception_handler cdef unique_ptr[column] column_from_udf_string_array( udf_string* strings, size_type size, - ) except + + ) except +libcudf_exception_handler cdef void free_udf_string_array( udf_string* strings, size_type size - ) except + + ) except +libcudf_exception_handler cdef extern from "cudf/strings/detail/char_tables.hpp" namespace \ "cudf::strings::detail" nogil: - cdef const uint8_t* get_character_flags_table() except + - cdef const uint16_t* get_character_cases_table() except + - cdef const void* get_special_case_mapping_table() except + + cdef const uint8_t* get_character_flags_table() except +libcudf_exception_handler + cdef const uint16_t* get_character_cases_table() except +libcudf_exception_handler + cdef const void* get_special_case_mapping_table() except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/table/table.pxd b/python/pylibcudf/pylibcudf/libcudf/table/table.pxd index 654c29b083a..b65644dd131 100644 --- a/python/pylibcudf/pylibcudf/libcudf/table/table.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/table/table.pxd @@ -1,7 +1,7 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. 
- from libcpp.memory cimport unique_ptr from libcpp.vector cimport vector +from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.table.table_view cimport mutable_table_view, table_view from pylibcudf.libcudf.types cimport size_type @@ -9,10 +9,10 @@ from pylibcudf.libcudf.types cimport size_type cdef extern from "cudf/table/table.hpp" namespace "cudf" nogil: cdef cppclass table: - table(const table&) except + - table(table_view) except + - size_type num_columns() except + - size_type num_rows() except + - table_view view() except + - mutable_table_view mutable_view() except + - vector[unique_ptr[column]] release() except + + table(const table&) except +libcudf_exception_handler + table(table_view) except +libcudf_exception_handler + size_type num_columns() except +libcudf_exception_handler + size_type num_rows() except +libcudf_exception_handler + table_view view() except +libcudf_exception_handler + mutable_table_view mutable_view() except +libcudf_exception_handler + vector[unique_ptr[column]] release() except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/table/table_view.pxd b/python/pylibcudf/pylibcudf/libcudf/table/table_view.pxd index 3af2f6a6c2c..eacfa0420ef 100644 --- a/python/pylibcudf/pylibcudf/libcudf/table/table_view.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/table/table_view.pxd @@ -1,6 +1,6 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. - from libcpp.vector cimport vector +from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column_view cimport ( column_view, mutable_column_view, @@ -10,16 +10,22 @@ from pylibcudf.libcudf.types cimport size_type cdef extern from "cudf/table/table_view.hpp" namespace "cudf" nogil: cdef cppclass table_view: - table_view() except + - table_view(const vector[column_view]) except + - column_view column(size_type column_index) except + - size_type num_columns() except + - size_type num_rows() except + - table_view select(vector[size_type] column_indices) except + + table_view() except +libcudf_exception_handler + table_view(const vector[column_view]) except +libcudf_exception_handler + column_view column(size_type column_index) except +libcudf_exception_handler + size_type num_columns() except +libcudf_exception_handler + size_type num_rows() except +libcudf_exception_handler + table_view select( + vector[size_type] column_indices + ) except +libcudf_exception_handler cdef cppclass mutable_table_view: - mutable_table_view() except + - mutable_table_view(const vector[mutable_column_view]) except + - mutable_column_view column(size_type column_index) except + - size_type num_columns() except + - size_type num_rows() except + + mutable_table_view() except +libcudf_exception_handler + mutable_table_view( + const vector[mutable_column_view] + ) except +libcudf_exception_handler + mutable_column_view column( + size_type column_index + ) except +libcudf_exception_handler + size_type num_columns() except +libcudf_exception_handler + size_type num_rows() except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/transform.pxd b/python/pylibcudf/pylibcudf/libcudf/transform.pxd index 38298a7c1f1..78ee7b4b0e5 100644 --- a/python/pylibcudf/pylibcudf/libcudf/transform.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/transform.pxd @@ -1,9 +1,9 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. 
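`table` and `table_view` are the owning and non-owning halves behind pylibcudf's `Table`: `view()` yields a lightweight `table_view` over an owning `table`, and `release()` hands the columns back as `unique_ptr`s. The Python-level `Table` exposes the same accessors; a hedged sketch (pyarrow interop assumed):

import pyarrow as pa
import pylibcudf as plc

tbl = plc.interop.from_arrow(pa.table({"a": [1, 2, 3], "b": [4, 5, 6]}))
print(tbl.num_columns(), tbl.num_rows())   # 2 3
cols = tbl.columns()                       # list of plc.Column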
- from libcpp cimport bool from libcpp.memory cimport unique_ptr from libcpp.pair cimport pair from libcpp.string cimport string +from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.expressions cimport expression @@ -11,39 +11,39 @@ from pylibcudf.libcudf.table.table cimport table from pylibcudf.libcudf.table.table_view cimport table_view from pylibcudf.libcudf.types cimport bitmask_type, data_type, size_type -from rmm._lib.device_buffer cimport device_buffer +from rmm.librmm.device_buffer cimport device_buffer cdef extern from "cudf/transform.hpp" namespace "cudf" nogil: cdef pair[unique_ptr[device_buffer], size_type] bools_to_mask ( column_view input - ) except + + ) except +libcudf_exception_handler cdef unique_ptr[column] mask_to_bools ( bitmask_type* bitmask, size_type begin_bit, size_type end_bit - ) except + + ) except +libcudf_exception_handler cdef pair[unique_ptr[device_buffer], size_type] nans_to_nulls( column_view input - ) except + + ) except +libcudf_exception_handler cdef unique_ptr[column] transform( column_view input, string unary_udf, data_type output_type, bool is_ptx - ) except + + ) except +libcudf_exception_handler cdef pair[unique_ptr[table], unique_ptr[column]] encode( table_view input - ) except + + ) except +libcudf_exception_handler cdef pair[unique_ptr[column], table_view] one_hot_encode( column_view input_column, column_view categories - ) + ) except +libcudf_exception_handler cdef unique_ptr[column] compute_column( const table_view table, const expression& expr - ) except + + ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/transpose.pxd b/python/pylibcudf/pylibcudf/libcudf/transpose.pxd index 9c0e3c073b0..fde49afd99c 100644 --- a/python/pylibcudf/pylibcudf/libcudf/transpose.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/transpose.pxd @@ -1,7 +1,7 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. - from libcpp.memory cimport unique_ptr from libcpp.pair cimport pair +from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.table.table_view cimport table_view @@ -12,4 +12,4 @@ cdef extern from "cudf/transpose.hpp" namespace "cudf" nogil: table_view ] transpose( table_view input_table - ) except + + ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/types.pxd b/python/pylibcudf/pylibcudf/libcudf/types.pxd index eabae68bc90..3281c230aa0 100644 --- a/python/pylibcudf/pylibcudf/libcudf/types.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/types.pxd @@ -1,7 +1,7 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION.
- from libc.stdint cimport int32_t, uint32_t from libcpp cimport bool +from pylibcudf.exception_handler cimport libcudf_exception_handler cdef extern from "cudf/types.hpp" namespace "cudf" nogil: @@ -70,24 +70,25 @@ cdef extern from "cudf/types.hpp" namespace "cudf" nogil: TIMESTAMP_MILLISECONDS TIMESTAMP_MICROSECONDS TIMESTAMP_NANOSECONDS - DICTIONARY32 - STRING - LIST - STRUCT - NUM_TYPE_IDS + DURATION_DAYS DURATION_SECONDS DURATION_MILLISECONDS DURATION_MICROSECONDS DURATION_NANOSECONDS + DICTIONARY32 + STRING + LIST DECIMAL32 DECIMAL64 DECIMAL128 + STRUCT + NUM_TYPE_IDS cdef cppclass data_type: - data_type() except + - data_type(const data_type&) except + - data_type(type_id id) except + - data_type(type_id id, int32_t scale) except + + data_type() except +libcudf_exception_handler + data_type(const data_type&) except +libcudf_exception_handler + data_type(type_id id) except +libcudf_exception_handler + data_type(type_id id, int32_t scale) except +libcudf_exception_handler type_id id() noexcept int32_t scale() noexcept bool operator==(const data_type&, const data_type&) noexcept @@ -99,4 +100,4 @@ cdef extern from "cudf/types.hpp" namespace "cudf" nogil: MIDPOINT NEAREST - cdef size_type size_of(data_type t) except + + cdef size_type size_of(data_type t) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/unary.pxd b/python/pylibcudf/pylibcudf/libcudf/unary.pxd index 887f8c7fca4..4666012623e 100644 --- a/python/pylibcudf/pylibcudf/libcudf/unary.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/unary.pxd @@ -1,8 +1,8 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. - from libc.stdint cimport int32_t from libcpp cimport bool from libcpp.memory cimport unique_ptr +from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.types cimport data_type @@ -36,13 +36,21 @@ cdef extern from "cudf/unary.hpp" namespace "cudf" nogil: cdef extern unique_ptr[column] unary_operation( column_view input, - unary_operator op) except + + unary_operator op) except +libcudf_exception_handler - cdef extern unique_ptr[column] is_null(column_view input) except + - cdef extern unique_ptr[column] is_valid(column_view input) except + + cdef extern unique_ptr[column] is_null( + column_view input + ) except +libcudf_exception_handler + cdef extern unique_ptr[column] is_valid( + column_view input + ) except +libcudf_exception_handler cdef extern unique_ptr[column] cast( column_view input, - data_type out_type) except + + data_type out_type) except +libcudf_exception_handler cdef extern bool is_supported_cast(data_type from_, data_type to) noexcept - cdef extern unique_ptr[column] is_nan(column_view input) except + - cdef extern unique_ptr[column] is_not_nan(column_view input) except + + cdef extern unique_ptr[column] is_nan( + column_view input + ) except +libcudf_exception_handler + cdef extern unique_ptr[column] is_not_nan( + column_view input + ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/utilities/host_span.pxd b/python/pylibcudf/pylibcudf/libcudf/utilities/host_span.pxd deleted file mode 100644 index 7e591e96373..00000000000 --- a/python/pylibcudf/pylibcudf/libcudf/utilities/host_span.pxd +++ /dev/null @@ -1,9 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. 
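The `type_id` reshuffle only reorders members to match the declaration order in `cudf/types.hpp`; since Cython resolves enum members by name against the C++ definition, it is a readability fix rather than a behavioral one. `data_type` pairs a `type_id` with a scale that only the decimal types use. A hedged Python-level sketch of the same pairing:

from pylibcudf.types import DataType, TypeId, size_of

d64 = DataType(TypeId.DECIMAL64, -2)   # negative scale: two fractional digits
i32 = DataType(TypeId.INT32)           # scale defaults to 0
print(d64.id(), d64.scale())           # TypeId.DECIMAL64 -2
print(size_of(i32))                    # 4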
- -from libcpp.vector cimport vector - - -cdef extern from "cudf/utilities/span.hpp" namespace "cudf" nogil: - cdef cppclass host_span[T]: - host_span() except + - host_span(vector[T]) except + diff --git a/python/pylibcudf/pylibcudf/libcudf/utilities/span.pxd b/python/pylibcudf/pylibcudf/libcudf/utilities/span.pxd new file mode 100644 index 00000000000..8024ce146ae --- /dev/null +++ b/python/pylibcudf/pylibcudf/libcudf/utilities/span.pxd @@ -0,0 +1,9 @@ +# Copyright (c) 2021-2024, NVIDIA CORPORATION. +from libcpp.vector cimport vector +from pylibcudf.exception_handler cimport libcudf_exception_handler + + +cdef extern from "cudf/utilities/span.hpp" namespace "cudf" nogil: + cdef cppclass host_span[T]: + host_span() except +libcudf_exception_handler + host_span(vector[T]) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/utilities/traits.pxd b/python/pylibcudf/pylibcudf/libcudf/utilities/traits.pxd index 69765e44274..93f13a7e11f 100644 --- a/python/pylibcudf/pylibcudf/libcudf/utilities/traits.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/utilities/traits.pxd @@ -1,7 +1,7 @@ # Copyright (c) 2024, NVIDIA CORPORATION. - from libcpp cimport bool from libcpp.vector cimport vector +from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.types cimport data_type @@ -9,6 +9,7 @@ cdef extern from "cudf/utilities/traits.hpp" namespace "cudf" nogil: cdef bool is_relationally_comparable(data_type) cdef bool is_equality_comparable(data_type) cdef bool is_numeric(data_type) + cdef bool is_numeric_not_bool(data_type) cdef bool is_index_type(data_type) cdef bool is_unsigned(data_type) cdef bool is_integral(data_type) diff --git a/python/pylibcudf/pylibcudf/libcudf/utilities/type_dispatcher.pxd b/python/pylibcudf/pylibcudf/libcudf/utilities/type_dispatcher.pxd index fbeb6e9db90..c3e3232b5cc 100644 --- a/python/pylibcudf/pylibcudf/libcudf/utilities/type_dispatcher.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/utilities/type_dispatcher.pxd @@ -1,5 +1,5 @@ # Copyright (c) 2024, NVIDIA CORPORATION. - +from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.types cimport type_id diff --git a/python/pylibcudf/pylibcudf/libcudf/wrappers/decimals.pxd b/python/pylibcudf/pylibcudf/libcudf/wrappers/decimals.pxd deleted file mode 100644 index 558299501d6..00000000000 --- a/python/pylibcudf/pylibcudf/libcudf/wrappers/decimals.pxd +++ /dev/null @@ -1,16 +0,0 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. - -from libc.stdint cimport int32_t, int64_t -from pylibcudf.libcudf.types cimport int128 - - -cdef extern from "cudf/fixed_point/fixed_point.hpp" namespace "numeric" nogil: - # cython type stub to help resolve to numeric::decimal64 - ctypedef int64_t decimal64 - # cython type stub to help resolve to numeric::decimal32 - ctypedef int64_t decimal32 - # cython type stub to help resolve to numeric::decimal128 - ctypedef int128 decimal128 - - cdef cppclass scale_type: - scale_type(int32_t) diff --git a/python/pylibcudf/pylibcudf/libcudf/wrappers/durations.pxd b/python/pylibcudf/pylibcudf/libcudf/wrappers/durations.pxd index 7c648425eb5..c9c960d0a79 100644 --- a/python/pylibcudf/pylibcudf/libcudf/wrappers/durations.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/wrappers/durations.pxd @@ -1,9 +1,10 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
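Besides re-creating the deleted `host_span.pxd` as `span.pxd` with the new handler, this hunk adds an `is_numeric_not_bool` trait. libcudf classifies `BOOL8` as numeric, so callers wanting "numeric but not bool" previously had to combine two checks; the new trait does it in one. A hedged sketch, assuming the trait is re-exported under `pylibcudf.traits` like the existing ones:

from pylibcudf import traits
from pylibcudf.types import DataType, TypeId

b8 = DataType(TypeId.BOOL8)
print(traits.is_numeric(b8))           # True: bool counts as numeric in libcudf
print(traits.is_numeric_not_bool(b8))  # False: the new, stricter trait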
-from libc.stdint cimport int64_t +from libc.stdint cimport int32_t, int64_t cdef extern from "cudf/wrappers/durations.hpp" namespace "cudf" nogil: + ctypedef int32_t duration_D ctypedef int64_t duration_s ctypedef int64_t duration_ms ctypedef int64_t duration_us diff --git a/python/pylibcudf/pylibcudf/libcudf/wrappers/timestamps.pxd b/python/pylibcudf/pylibcudf/libcudf/wrappers/timestamps.pxd index 50d37fd0a68..5dcd144529d 100644 --- a/python/pylibcudf/pylibcudf/libcudf/wrappers/timestamps.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/wrappers/timestamps.pxd @@ -1,9 +1,10 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. -from libc.stdint cimport int64_t +from libc.stdint cimport int32_t, int64_t cdef extern from "cudf/wrappers/timestamps.hpp" namespace "cudf" nogil: + ctypedef int32_t timestamp_D ctypedef int64_t timestamp_s ctypedef int64_t timestamp_ms ctypedef int64_t timestamp_us diff --git a/python/pylibcudf/pylibcudf/lists.pxd b/python/pylibcudf/pylibcudf/lists.pxd index e7d006e6e2e..10c1c26e24e 100644 --- a/python/pylibcudf/pylibcudf/lists.pxd +++ b/python/pylibcudf/pylibcudf/lists.pxd @@ -1,7 +1,11 @@ # Copyright (c) 2024, NVIDIA CORPORATION. from libcpp cimport bool -from pylibcudf.libcudf.types cimport null_order, size_type +from pylibcudf.libcudf.types cimport ( + nan_equality, null_equality, null_order, order, size_type +) +from pylibcudf.libcudf.lists.combine cimport concatenate_null_policy +from pylibcudf.libcudf.lists.contains cimport duplicate_find_option from .column cimport Column from .scalar cimport Scalar @@ -19,13 +23,13 @@ cpdef Table explode_outer(Table, size_type explode_column_idx) cpdef Column concatenate_rows(Table) -cpdef Column concatenate_list_elements(Column, bool dropna) +cpdef Column concatenate_list_elements(Column, concatenate_null_policy null_policy) cpdef Column contains(Column, ColumnOrScalar) cpdef Column contains_nulls(Column) -cpdef Column index_of(Column, ColumnOrScalar, bool) +cpdef Column index_of(Column, ColumnOrScalar, duplicate_find_option) cpdef Column reverse(Column) @@ -37,16 +41,24 @@ cpdef Column count_elements(Column) cpdef Column sequences(Column, Column, Column steps = *) -cpdef Column sort_lists(Column, bool, null_order, bool stable = *) +cpdef Column sort_lists(Column, order, null_order, bool stable = *) -cpdef Column difference_distinct(Column, Column, bool nulls_equal=*, bool nans_equal=*) +cpdef Column difference_distinct( + Column, Column, null_equality nulls_equal=*, nan_equality nans_equal=* +) -cpdef Column have_overlap(Column, Column, bool nulls_equal=*, bool nans_equal=*) +cpdef Column have_overlap( + Column, Column, null_equality nulls_equal=*, nan_equality nans_equal=* +) -cpdef Column intersect_distinct(Column, Column, bool nulls_equal=*, bool nans_equal=*) +cpdef Column intersect_distinct( + Column, Column, null_equality nulls_equal=*, nan_equality nans_equal=* +) -cpdef Column union_distinct(Column, Column, bool nulls_equal=*, bool nans_equal=*) +cpdef Column union_distinct( + Column, Column, null_equality nulls_equal=*, nan_equality nans_equal=* +) cpdef Column apply_boolean_mask(Column, Column) -cpdef Column distinct(Column, bool, bool) +cpdef Column distinct(Column, null_equality, nan_equality) diff --git a/python/pylibcudf/pylibcudf/lists.pyi b/python/pylibcudf/pylibcudf/lists.pyi new file mode 100644 index 00000000000..dff6c400638 --- /dev/null +++ b/python/pylibcudf/pylibcudf/lists.pyi @@ -0,0 +1,70 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
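The `duration_D`/`timestamp_D` additions are Cython-side stubs, so only the storage width has to be right: day-resolution values are 32-bit while the second-through-nanosecond resolutions are 64-bit, which is why the imports now pull in `int32_t` as well. The widths can be confirmed from Python; a hedged sketch:

from pylibcudf.types import DataType, TypeId, size_of

print(size_of(DataType(TypeId.TIMESTAMP_DAYS)))     # 4: int32 day count
print(size_of(DataType(TypeId.TIMESTAMP_SECONDS)))  # 8: int64 tick count
print(size_of(DataType(TypeId.DURATION_DAYS)))      # 4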
+ +from enum import IntEnum + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar +from pylibcudf.table import Table +from pylibcudf.types import NanEquality, NullEquality, NullOrder, Order + +class ConcatenateNullPolicy(IntEnum): + IGNORE = ... + NULLIFY_OUTPUT_ROW = ... + +class DuplicateFindOption(IntEnum): + FIND_FIRST = ... + FIND_LAST = ... + +def explode_outer(input: Table, explode_column_idx: int) -> Table: ... +def concatenate_rows(input: Table) -> Column: ... +def concatenate_list_elements( + input: Column, null_policy: ConcatenateNullPolicy +) -> Column: ... +def contains(input: Column, search_key: Column | Scalar) -> Column: ... +def contains_nulls(input: Column) -> Column: ... +def index_of( + input: Column, + search_key: Column | Scalar, + find_option: DuplicateFindOption, +) -> Column: ... +def reverse(input: Column) -> Column: ... +def segmented_gather(input: Column, gather_map_list: Column) -> Column: ... +def extract_list_element(input: Column, index: Column | int) -> Column: ... +def count_elements(input: Column) -> Column: ... +def sequences( + starts: Column, sizes: Column, steps: Column | None = None +) -> Column: ... +def sort_lists( + input: Column, + sort_order: Order, + na_position: NullOrder, + stable: bool = False, +) -> Column: ... +def difference_distinct( + lhs: Column, + rhs: Column, + nulls_equal: NullEquality = NullEquality.EQUAL, + nans_equal: NanEquality = NanEquality.ALL_EQUAL, +) -> Column: ... +def have_overlap( + lhs: Column, + rhs: Column, + nulls_equal: NullEquality = NullEquality.EQUAL, + nans_equal: NanEquality = NanEquality.ALL_EQUAL, +) -> Column: ... +def intersect_distinct( + lhs: Column, + rhs: Column, + nulls_equal: NullEquality = NullEquality.EQUAL, + nans_equal: NanEquality = NanEquality.ALL_EQUAL, +) -> Column: ... +def union_distinct( + lhs: Column, + rhs: Column, + nulls_equal: NullEquality = NullEquality.EQUAL, + nans_equal: NanEquality = NanEquality.ALL_EQUAL, +) -> Column: ... +def apply_boolean_mask(input: Column, boolean_mask: Column) -> Column: ... +def distinct( + input: Column, nulls_equal: NullEquality, nans_equal: NanEquality +) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/lists.pyx b/python/pylibcudf/pylibcudf/lists.pyx index 6f82124d06e..ccc56eaa520 100644 --- a/python/pylibcudf/pylibcudf/lists.pyx +++ b/python/pylibcudf/pylibcudf/lists.pyx @@ -42,10 +42,35 @@ from pylibcudf.libcudf.types cimport ( ) from pylibcudf.lists cimport ColumnOrScalar, ColumnOrSizeType +from pylibcudf.libcudf.lists.combine import concatenate_null_policy as ConcatenateNullPolicy # no-cython-lint +from pylibcudf.libcudf.lists.contains import duplicate_find_option as DuplicateFindOption # no-cython-lint + from .column cimport Column, ListColumnView from .scalar cimport Scalar from .table cimport Table +__all__ = [ + "ConcatenateNullPolicy", + "DuplicateFindOption", + "apply_boolean_mask", + "concatenate_list_elements", + "concatenate_rows", + "contains", + "contains_nulls", + "count_elements", + "difference_distinct", + "distinct", + "explode_outer", + "extract_list_element", + "have_overlap", + "index_of", + "intersect_distinct", + "reverse", + "segmented_gather", + "sequences", + "sort_lists", + "union_distinct", +] cpdef Table explode_outer(Table input, size_type explode_column_idx): """Explode a column of lists into rows. 
@@ -69,7 +94,7 @@ cpdef Table explode_outer(Table input, size_type explode_column_idx): cdef unique_ptr[table] c_result with nogil: - c_result = move(cpp_explode.explode_outer(input.view(), explode_column_idx)) + c_result = cpp_explode.explode_outer(input.view(), explode_column_idx) return Table.from_libcudf(move(c_result)) @@ -92,12 +117,14 @@ cpdef Column concatenate_rows(Table input): cdef unique_ptr[column] c_result with nogil: - c_result = move(cpp_concatenate_rows(input.view())) + c_result = cpp_concatenate_rows(input.view()) return Column.from_libcudf(move(c_result)) -cpdef Column concatenate_list_elements(Column input, bool dropna): +cpdef Column concatenate_list_elements( + Column input, concatenate_null_policy null_policy +): """Concatenate multiple lists on the same row into a single list. For details, see :cpp:func:`concatenate_list_elements`. @@ -106,27 +133,18 @@ cpdef Column concatenate_list_elements(Column input, bool dropna): ---------- input : Column The input column - dropna : bool - If true, null list elements will be ignored - from concatenation. Otherwise any input null values will result in - the corresponding output row being set to null. + null_policy : ConcatenateNullPolicy + How to treat null list elements. Returns ------- Column A new Column of concatenated list elements """ - cdef concatenate_null_policy null_policy = ( - concatenate_null_policy.IGNORE if dropna - else concatenate_null_policy.NULLIFY_OUTPUT_ROW - ) cdef unique_ptr[column] c_result with nogil: - c_result = move(cpp_concatenate_list_elements( - input.view(), - null_policy, - )) + c_result = cpp_concatenate_list_elements(input.view(), null_policy) return Column.from_libcudf(move(c_result)) @@ -161,12 +179,12 @@ cpdef Column contains(Column input, ColumnOrScalar search_key): raise TypeError("Must pass a Column or Scalar") with nogil: - c_result = move(cpp_contains.contains( + c_result = cpp_contains.contains( list_view.view(), search_key.view() if ColumnOrScalar is Column else dereference( search_key.get() ), - )) + ) return Column.from_libcudf(move(c_result)) @@ -190,11 +208,13 @@ cpdef Column contains_nulls(Column input): cdef unique_ptr[column] c_result cdef ListColumnView list_view = input.list_view() with nogil: - c_result = move(cpp_contains.contains_nulls(list_view.view())) + c_result = cpp_contains.contains_nulls(list_view.view()) return Column.from_libcudf(move(c_result)) -cpdef Column index_of(Column input, ColumnOrScalar search_key, bool find_first_option): +cpdef Column index_of( + Column input, ColumnOrScalar search_key, duplicate_find_option find_option +): """Create a column of index values indicating the position of a search key row within the corresponding list row in the lists column. @@ -210,9 +230,8 @@ cpdef Column index_of(Column input, ColumnOrScalar search_key, bool find_first_o The input column. search_key : Union[Column, Scalar] The search key. - find_first_option : bool - If true, index_of returns the first match. - Otherwise the last match is returned. + find_option : DuplicateFindOption + Which match to return if there are duplicates. 
Returns ------- @@ -223,19 +242,14 @@ cpdef Column index_of(Column input, ColumnOrScalar search_key, bool find_first_o """ cdef unique_ptr[column] c_result cdef ListColumnView list_view = input.list_view() - cdef cpp_contains.duplicate_find_option find_option = ( - cpp_contains.duplicate_find_option.FIND_FIRST if find_first_option - else cpp_contains.duplicate_find_option.FIND_LAST - ) - with nogil: - c_result = move(cpp_contains.index_of( + c_result = cpp_contains.index_of( list_view.view(), search_key.view() if ColumnOrScalar is Column else dereference( search_key.get() ), find_option, - )) + ) return Column.from_libcudf(move(c_result)) @@ -258,9 +272,7 @@ cpdef Column reverse(Column input): cdef ListColumnView list_view = input.list_view() with nogil: - c_result = move(cpp_reverse.reverse( - list_view.view(), - )) + c_result = cpp_reverse.reverse(list_view.view()) return Column.from_libcudf(move(c_result)) @@ -288,10 +300,10 @@ cpdef Column segmented_gather(Column input, Column gather_map_list): cdef ListColumnView list_view2 = gather_map_list.list_view() with nogil: - c_result = move(cpp_gather.segmented_gather( + c_result = cpp_gather.segmented_gather( list_view1.view(), list_view2.view(), - )) + ) return Column.from_libcudf(move(c_result)) @@ -316,10 +328,10 @@ cpdef Column extract_list_element(Column input, ColumnOrSizeType index): cdef ListColumnView list_view = input.list_view() with nogil: - c_result = move(cpp_extract_list_element( + c_result = cpp_extract_list_element( list_view.view(), index.view() if ColumnOrSizeType is Column else index, - )) + ) return Column.from_libcudf(move(c_result)) @@ -344,7 +356,7 @@ cpdef Column count_elements(Column input): cdef unique_ptr[column] c_result with nogil: - c_result = move(cpp_count_elements(list_view.view())) + c_result = cpp_count_elements(list_view.view()) return Column.from_libcudf(move(c_result)) @@ -373,22 +385,19 @@ cpdef Column sequences(Column starts, Column sizes, Column steps = None): if steps is not None: with nogil: - c_result = move(cpp_filling.sequences( + c_result = cpp_filling.sequences( starts.view(), steps.view(), sizes.view(), - )) + ) else: with nogil: - c_result = move(cpp_filling.sequences( - starts.view(), - sizes.view(), - )) + c_result = cpp_filling.sequences(starts.view(), sizes.view()) return Column.from_libcudf(move(c_result)) cpdef Column sort_lists( Column input, - bool ascending, + order sort_order, null_order na_position, bool stable = False ): @@ -400,8 +409,8 @@ cpdef Column sort_lists( ---------- input : Column The input column. - ascending : bool - If true, the sort order is ascending. Otherwise, the sort order is descending. + ascending : Order + Sort order in the list. na_position : NullOrder If na_position equals NullOrder.FIRST, then the null values in the output column are placed first. Otherwise, they are be placed after. 
@@ -417,31 +426,27 @@ cpdef Column sort_lists( cdef unique_ptr[column] c_result cdef ListColumnView list_view = input.list_view() - cdef order c_sort_order = ( - order.ASCENDING if ascending else order.DESCENDING - ) - with nogil: if stable: - c_result = move(cpp_stable_sort_lists( + c_result = cpp_stable_sort_lists( list_view.view(), - c_sort_order, + sort_order, na_position, - )) + ) else: - c_result = move(cpp_sort_lists( + c_result = cpp_sort_lists( list_view.view(), - c_sort_order, + sort_order, na_position, - )) + ) return Column.from_libcudf(move(c_result)) cpdef Column difference_distinct( Column lhs, Column rhs, - bool nulls_equal=True, - bool nans_equal=True + null_equality nulls_equal=null_equality.EQUAL, + nan_equality nans_equal=nan_equality.ALL_EQUAL, ): """Create a lists column of the distinct elements in lhs that are not found in rhs. @@ -454,11 +459,10 @@ cpdef Column difference_distinct( The input lists column of elements that may be included. rhs : Column The input lists column of elements to exclude. - nulls_equal : bool, default True - If true, null elements are considered equal. Otherwise, unequal. - nans_equal : bool, default True - If true, libcudf will treat nan elements from {-nan, +nan} - as equal. Otherwise, unequal. Otherwise, unequal. + nulls_equal : NullEquality, default EQUAL + Whether nulls are considered equal. + nans_equal : NanEquality, default ALL_EQUAL + Whether NaNs are considered equal. Returns ------- @@ -469,28 +473,21 @@ cpdef Column difference_distinct( cdef ListColumnView lhs_view = lhs.list_view() cdef ListColumnView rhs_view = rhs.list_view() - cdef null_equality c_nulls_equal = ( - null_equality.EQUAL if nulls_equal else null_equality.UNEQUAL - ) - cdef nan_equality c_nans_equal = ( - nan_equality.ALL_EQUAL if nans_equal else nan_equality.UNEQUAL - ) - with nogil: - c_result = move(cpp_set_operations.difference_distinct( + c_result = cpp_set_operations.difference_distinct( lhs_view.view(), rhs_view.view(), - c_nulls_equal, - c_nans_equal, - )) + nulls_equal, + nans_equal, + ) return Column.from_libcudf(move(c_result)) cpdef Column have_overlap( Column lhs, Column rhs, - bool nulls_equal=True, - bool nans_equal=True + null_equality nulls_equal=null_equality.EQUAL, + nan_equality nans_equal=nan_equality.ALL_EQUAL, ): """Check if lists at each row of the given lists columns overlap. @@ -502,11 +499,10 @@ cpdef Column have_overlap( The input lists column for one side. rhs : Column The input lists column for the other side. - nulls_equal : bool, default True - If true, null elements are considered equal. Otherwise, unequal. - nans_equal : bool, default True - If true, libcudf will treat nan elements from {-nan, +nan} - as equal. Otherwise, unequal. Otherwise, unequal. + nulls_equal : NullEquality, default EQUAL + Whether nulls are considered equal. + nans_equal : NanEquality, default ALL_EQUAL + Whether NaNs are considered equal.
Returns ------- @@ -517,28 +513,21 @@ cpdef Column have_overlap( cdef ListColumnView lhs_view = lhs.list_view() cdef ListColumnView rhs_view = rhs.list_view() - cdef null_equality c_nulls_equal = ( - null_equality.EQUAL if nulls_equal else null_equality.UNEQUAL - ) - cdef nan_equality c_nans_equal = ( - nan_equality.ALL_EQUAL if nans_equal else nan_equality.UNEQUAL - ) - with nogil: - c_result = move(cpp_set_operations.have_overlap( + c_result = cpp_set_operations.have_overlap( lhs_view.view(), rhs_view.view(), - c_nulls_equal, - c_nans_equal, - )) + nulls_equal, + nans_equal, + ) return Column.from_libcudf(move(c_result)) cpdef Column intersect_distinct( Column lhs, Column rhs, - bool nulls_equal=True, - bool nans_equal=True + null_equality nulls_equal=null_equality.EQUAL, + nan_equality nans_equal=nan_equality.ALL_EQUAL, ): """Create a lists column of distinct elements common to two input lists columns. @@ -550,11 +539,10 @@ cpdef Column intersect_distinct( The first input lists column. rhs : Column The second input lists column. - nulls_equal : bool, default True - If true, null elements are considered equal. Otherwise, unequal. - nans_equal : bool, default True - If true, libcudf will treat nan elements from {-nan, +nan} - as equal. Otherwise, unequal. Otherwise, unequal. + nulls_equal : NullEquality, default EQUAL + Whether nulls are considered equal. + nans_equal : NanEquality, default ALL_EQUAL + Whether NaN values are considered equal. Returns ------- @@ -565,28 +553,21 @@ cpdef Column intersect_distinct( cdef ListColumnView lhs_view = lhs.list_view() cdef ListColumnView rhs_view = rhs.list_view() - cdef null_equality c_nulls_equal = ( - null_equality.EQUAL if nulls_equal else null_equality.UNEQUAL - ) - cdef nan_equality c_nans_equal = ( - nan_equality.ALL_EQUAL if nans_equal else nan_equality.UNEQUAL - ) - with nogil: - c_result = move(cpp_set_operations.intersect_distinct( + c_result = cpp_set_operations.intersect_distinct( lhs_view.view(), rhs_view.view(), - c_nulls_equal, - c_nans_equal, - )) + nulls_equal, + nans_equal, + ) return Column.from_libcudf(move(c_result)) cpdef Column union_distinct( Column lhs, Column rhs, - bool nulls_equal=True, - bool nans_equal=True + null_equality nulls_equal=null_equality.EQUAL, + nan_equality nans_equal=nan_equality.ALL_EQUAL, ): """Create a lists column of distinct elements found in either of two input lists columns. @@ -599,11 +580,10 @@ cpdef Column union_distinct( The first input lists column. rhs : Column The second input lists column. - nulls_equal : bool, default True - If true, null elements are considered equal. Otherwise, unequal. - nans_equal : bool, default True - If true, libcudf will treat nan elements from {-nan, +nan} - as equal. Otherwise, unequal. Otherwise, unequal. + nulls_equal : NullEquality, default EQUAL + Whether nulls are considered equal. + nans_equal : NanEquality, default ALL_EQUAL + Whether NaN values are considered equal.
Returns ------- @@ -614,20 +594,13 @@ cpdef Column union_distinct( cdef ListColumnView lhs_view = lhs.list_view() cdef ListColumnView rhs_view = rhs.list_view() - cdef null_equality c_nulls_equal = ( - null_equality.EQUAL if nulls_equal else null_equality.UNEQUAL - ) - cdef nan_equality c_nans_equal = ( - nan_equality.ALL_EQUAL if nans_equal else nan_equality.UNEQUAL - ) - with nogil: - c_result = move(cpp_set_operations.union_distinct( + c_result = cpp_set_operations.union_distinct( lhs_view.view(), rhs_view.view(), - c_nulls_equal, - c_nans_equal, - )) + nulls_equal, + nans_equal, + ) return Column.from_libcudf(move(c_result)) @@ -652,14 +625,14 @@ cpdef Column apply_boolean_mask(Column input, Column boolean_mask): cdef ListColumnView list_view = input.list_view() cdef ListColumnView mask_view = boolean_mask.list_view() with nogil: - c_result = move(cpp_apply_boolean_mask( + c_result = cpp_apply_boolean_mask( list_view.view(), mask_view.view(), - )) + ) return Column.from_libcudf(move(c_result)) -cpdef Column distinct(Column input, bool nulls_equal, bool nans_equal): +cpdef Column distinct(Column input, null_equality nulls_equal, nan_equality nans_equal): """Create a new list column without duplicate elements in each list. For details, see :cpp:func:`distinct`. @@ -668,11 +641,10 @@ cpdef Column distinct(Column input, bool nulls_equal, bool nans_equal): ---------- input : Column The input column. - nulls_equal : bool - If true, null elements are considered equal. Otherwise, unequal. - nans_equal : bool - If true, libcudf will treat nan elements from {-nan, +nan} - as equal. Otherwise, unequal. Otherwise, unequal. + nulls_equal : NullEquality + Whether nulls are considered equal. + nans_equal : NanEquality + Whether NaN values are considered equal. Returns ------- @@ -682,17 +654,10 @@ cdef unique_ptr[column] c_result cdef ListColumnView list_view = input.list_view() - cdef null_equality c_nulls_equal = ( - null_equality.EQUAL if nulls_equal else null_equality.UNEQUAL - ) - cdef nan_equality c_nans_equal = ( - nan_equality.ALL_EQUAL if nans_equal else nan_equality.UNEQUAL - ) - with nogil: - c_result = move(cpp_distinct( + c_result = cpp_distinct( list_view.view(), - c_nulls_equal, - c_nans_equal, - )) + nulls_equal, + nans_equal, + ) return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/merge.pyi b/python/pylibcudf/pylibcudf/merge.pyi new file mode 100644 index 00000000000..b18eb01f8a2 --- /dev/null +++ b/python/pylibcudf/pylibcudf/merge.pyi @@ -0,0 +1,11 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.table import Table +from pylibcudf.types import NullOrder, Order + +def merge( + tables_to_merge: list[Table], + key_cols: list[int], + column_order: list[Order], + null_precedence: list[NullOrder], +) -> Table: ...
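The new merge.pyi stub above spells out the enum-typed signature; a minimal sketch of a call it describes, assuming the plc alias and pyarrow interop via plc.interop.from_arrow (the pre-sorted data and key choice are illustrative):

    import pyarrow as pa
    import pylibcudf as plc

    left = plc.interop.from_arrow(pa.table({"k": [1, 3, 5]}))
    right = plc.interop.from_arrow(pa.table({"k": [2, 4, 6]}))

    # Merge two tables already sorted on column 0, ascending,
    # with nulls ordered before non-null values.
    merged = plc.merge.merge(
        [left, right],
        [0],
        [plc.types.Order.ASCENDING],
        [plc.types.NullOrder.BEFORE],
    )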
diff --git a/python/pylibcudf/pylibcudf/merge.pyx b/python/pylibcudf/pylibcudf/merge.pyx index 6d707b67449..c051cdc0c66 100644 --- a/python/pylibcudf/pylibcudf/merge.pyx +++ b/python/pylibcudf/pylibcudf/merge.pyx @@ -10,6 +10,7 @@ from pylibcudf.libcudf.types cimport null_order, order, size_type from .table cimport Table +__all__ = ["merge"] cpdef Table merge ( list tables_to_merge, @@ -47,12 +48,10 @@ cpdef Table merge ( cdef unique_ptr[table] c_result with nogil: - c_result = move( - cpp_merge.merge( - c_tables_to_merge, - c_key_cols, - c_column_order, - c_null_precedence, - ) + c_result = cpp_merge.merge( + c_tables_to_merge, + c_key_cols, + c_column_order, + c_null_precedence, ) return Table.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/null_mask.pxd b/python/pylibcudf/pylibcudf/null_mask.pxd index ab5c0080312..9bdfaee2842 100644 --- a/python/pylibcudf/pylibcudf/null_mask.pxd +++ b/python/pylibcudf/pylibcudf/null_mask.pxd @@ -2,7 +2,7 @@ from pylibcudf.libcudf.types cimport mask_state, size_type -from rmm._lib.device_buffer cimport DeviceBuffer +from rmm.pylibrmm.device_buffer cimport DeviceBuffer from .column cimport Column diff --git a/python/pylibcudf/pylibcudf/null_mask.pyi b/python/pylibcudf/pylibcudf/null_mask.pyi new file mode 100644 index 00000000000..1a6d96a0822 --- /dev/null +++ b/python/pylibcudf/pylibcudf/null_mask.pyi @@ -0,0 +1,14 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from rmm.pylibrmm.device_buffer import DeviceBuffer + +from pylibcudf.column import Column +from pylibcudf.types import MaskState + +def copy_bitmask(col: Column) -> DeviceBuffer: ... +def bitmask_allocation_size_bytes(number_of_bits: int) -> int: ... +def create_null_mask( + size: int, state: MaskState = MaskState.UNINITIALIZED +) -> DeviceBuffer: ... +def bitmask_and(columns: list[Column]) -> tuple[DeviceBuffer, int]: ... +def bitmask_or(columns: list[Column]) -> tuple[DeviceBuffer, int]: ... 
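The null_mask.pyi stub above documents the bitmask helpers that accompany the rmm.pylibrmm import move; a minimal sketch of their use (the plc alias and pyarrow interop are assumptions; data is illustrative):

    import pyarrow as pa
    import pylibcudf as plc

    col = plc.interop.from_arrow(pa.array([1, None, 3]))

    # Copy the column's validity bitmask into a standalone DeviceBuffer.
    mask = plc.null_mask.copy_bitmask(col)

    # Bitmask allocations are padded to fixed-size blocks; size is in bytes.
    nbytes = plc.null_mask.bitmask_allocation_size_bytes(256)

    # AND the bitmasks of several columns; the resulting null count comes back too.
    buf, null_count = plc.null_mask.bitmask_and([col, col])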
diff --git a/python/pylibcudf/pylibcudf/null_mask.pyx b/python/pylibcudf/pylibcudf/null_mask.pyx index 5bdde06f21f..adc264e9af6 100644 --- a/python/pylibcudf/pylibcudf/null_mask.pyx +++ b/python/pylibcudf/pylibcudf/null_mask.pyx @@ -6,13 +6,21 @@ from libcpp.utility cimport move from pylibcudf.libcudf cimport null_mask as cpp_null_mask from pylibcudf.libcudf.types cimport mask_state, size_type -from rmm._lib.device_buffer cimport DeviceBuffer, device_buffer +from rmm.librmm.device_buffer cimport device_buffer +from rmm.pylibrmm.device_buffer cimport DeviceBuffer from pylibcudf.libcudf.types import mask_state as MaskState # no-cython-lint from .column cimport Column from .table cimport Table +__all__ = [ + "bitmask_allocation_size_bytes", + "bitmask_and", + "bitmask_or", + "copy_bitmask", + "create_null_mask", +] cdef DeviceBuffer buffer_to_python(device_buffer buf): return DeviceBuffer.c_from_unique_ptr(make_unique[device_buffer](move(buf))) @@ -31,13 +39,13 @@ cpdef DeviceBuffer copy_bitmask(Column col): Returns ------- rmm.DeviceBuffer - A ``DeviceBuffer`` containing ``col``'s bitmask, or an empty ``DeviceBuffer`` - if ``col`` is not nullable + A ``DeviceBuffer`` containing ``col``'s bitmask, or an empty + ``DeviceBuffer`` if ``col`` is not nullable """ cdef device_buffer db with nogil: - db = move(cpp_null_mask.copy_bitmask(col.view())) + db = cpp_null_mask.copy_bitmask(col.view()) return buffer_to_python(move(db)) @@ -89,7 +97,7 @@ cpdef DeviceBuffer create_null_mask( cdef device_buffer db with nogil: - db = move(cpp_null_mask.create_null_mask(size, state)) + db = cpp_null_mask.create_null_mask(size, state) return buffer_to_python(move(db)) @@ -113,7 +121,7 @@ cpdef tuple bitmask_and(list columns): cdef pair[device_buffer, size_type] c_result with nogil: - c_result = move(cpp_null_mask.bitmask_and(c_table.view())) + c_result = cpp_null_mask.bitmask_and(c_table.view()) return buffer_to_python(move(c_result.first)), c_result.second @@ -137,6 +145,6 @@ cpdef tuple bitmask_or(list columns): cdef pair[device_buffer, size_type] c_result with nogil: - c_result = move(cpp_null_mask.bitmask_or(c_table.view())) + c_result = cpp_null_mask.bitmask_or(c_table.view()) return buffer_to_python(move(c_result.first)), c_result.second diff --git a/cpp/cmake/thirdparty/get_cufile.cmake b/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt similarity index 50% rename from cpp/cmake/thirdparty/get_cufile.cmake rename to python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt index bfdff3a99ff..93e3fb15259 100644 --- a/cpp/cmake/thirdparty/get_cufile.cmake +++ b/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -12,21 +12,14 @@ # the License. # ============================================================================= -# This function finds nvcomp and sets any additional necessary environment variables. 
-function(find_and_configure_cufile) +set(cython_sources + edit_distance.pyx generate_ngrams.pyx jaccard.pyx minhash.pyx ngrams_tokenize.pyx normalize.pyx + replace.pyx stemmer.pyx tokenize.pyx byte_pair_encode.pyx subword_tokenize.pyx +) - list(APPEND CMAKE_MODULE_PATH ${CUDF_SOURCE_DIR}/cmake/Modules) - rapids_find_package(cuFile) - - if(cuFile_FOUND AND NOT BUILD_SHARED_LIBS) - include("${rapids-cmake-dir}/export/find_package_file.cmake") - rapids_export_find_package_file( - BUILD "${CUDF_SOURCE_DIR}/cmake/Modules/FindcuFile.cmake" EXPORT_SET cudf-exports - ) - rapids_export_find_package_file( - INSTALL "${CUDF_SOURCE_DIR}/cmake/Modules/FindcuFile.cmake" EXPORT_SET cudf-exports - ) - endif() -endfunction() - -find_and_configure_cufile() +set(linked_libraries cudf::cudf) +rapids_cython_create_modules( + CXX + SOURCE_FILES "${cython_sources}" + LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_nvtext_ ASSOCIATED_TARGETS cudf +) diff --git a/python/pylibcudf/pylibcudf/nvtext/__init__.pxd b/python/pylibcudf/pylibcudf/nvtext/__init__.pxd new file mode 100644 index 00000000000..ef837167eb9 --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/__init__.pxd @@ -0,0 +1,29 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from . cimport ( + byte_pair_encode, + edit_distance, + generate_ngrams, + jaccard, + minhash, + ngrams_tokenize, + normalize, + replace, + stemmer, + subword_tokenize, + tokenize, +) + +__all__ = [ + "edit_distance", + "generate_ngrams", + "jaccard", + "minhash", + "byte_pair_encode", + "ngrams_tokenize", + "normalize", + "replace", + "stemmer", + "subword_tokenize", + "tokenize", +] diff --git a/python/pylibcudf/pylibcudf/nvtext/__init__.py b/python/pylibcudf/pylibcudf/nvtext/__init__.py new file mode 100644 index 00000000000..4f125d3a733 --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/__init__.py @@ -0,0 +1,29 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from . import ( + byte_pair_encode, + edit_distance, + generate_ngrams, + jaccard, + minhash, + ngrams_tokenize, + normalize, + replace, + stemmer, + subword_tokenize, + tokenize, +) + +__all__ = [ + "edit_distance", + "generate_ngrams", + "jaccard", + "minhash", + "byte_pair_encode", + "ngrams_tokenize", + "normalize", + "replace", + "stemmer", + "subword_tokenize", + "tokenize", +] diff --git a/python/pylibcudf/pylibcudf/nvtext/byte_pair_encode.pxd b/python/pylibcudf/pylibcudf/nvtext/byte_pair_encode.pxd new file mode 100644 index 00000000000..e4b93e96b9d --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/byte_pair_encode.pxd @@ -0,0 +1,16 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from pylibcudf.column cimport Column +from pylibcudf.libcudf.nvtext.byte_pair_encode cimport bpe_merge_pairs +from pylibcudf.scalar cimport Scalar + + +cdef class BPEMergePairs: + cdef unique_ptr[bpe_merge_pairs] c_obj + +cpdef Column byte_pair_encoding( + Column input, + BPEMergePairs merge_pairs, + Scalar separator=* +) diff --git a/python/pylibcudf/pylibcudf/nvtext/byte_pair_encode.pyi b/python/pylibcudf/pylibcudf/nvtext/byte_pair_encode.pyi new file mode 100644 index 00000000000..ca39aa16d7e --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/byte_pair_encode.pyi @@ -0,0 +1,11 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar + +class BPEMergePairs: + def __init__(self, merge_pairs: Column): ...
+ +def byte_pair_encoding( + input: Column, merge_pairs: BPEMergePairs, separator: Scalar | None = None +) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/nvtext/byte_pair_encode.pyx b/python/pylibcudf/pylibcudf/nvtext/byte_pair_encode.pyx new file mode 100644 index 00000000000..7565b21084f --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/byte_pair_encode.pyx @@ -0,0 +1,73 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from cython.operator cimport dereference +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.nvtext.byte_pair_encode cimport ( + byte_pair_encoding as cpp_byte_pair_encoding, + load_merge_pairs as cpp_load_merge_pairs, +) +from pylibcudf.libcudf.scalar.scalar cimport string_scalar +from pylibcudf.libcudf.scalar.scalar_factories cimport ( + make_string_scalar as cpp_make_string_scalar, +) +from pylibcudf.scalar cimport Scalar + +__all__ = ["BPEMergePairs", "byte_pair_encoding"] + +cdef class BPEMergePairs: + """The table of merge pairs for the BPE encoder. + + For details, see :cpp:class:`cudf::nvtext::bpe_merge_pairs`. + """ + def __cinit__(self, Column merge_pairs): + cdef column_view c_pairs = merge_pairs.view() + with nogil: + self.c_obj = move(cpp_load_merge_pairs(c_pairs)) + + __hash__ = None + +cpdef Column byte_pair_encoding( + Column input, + BPEMergePairs merge_pairs, + Scalar separator=None +): + """ + Byte pair encode the input strings. + + For details, see cpp:func:`cudf::nvtext::byte_pair_encoding` + + Parameters + ---------- + input : Column + Strings to encode. + merge_pairs : BPEMergePairs + Substrings to rebuild each string on. + separator : Scalar + String used to build the output after encoding. Default is a space. + + Returns + ------- + Column + An encoded column of strings. + """ + cdef unique_ptr[column] c_result + + if separator is None: + separator = Scalar.from_libcudf( + cpp_make_string_scalar(" ".encode()) + ) + + with nogil: + c_result = move( + cpp_byte_pair_encoding( + input.view(), + dereference(merge_pairs.c_obj.get()), + dereference(separator.c_obj.get()), + ) + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/nvtext/edit_distance.pxd b/python/pylibcudf/pylibcudf/nvtext/edit_distance.pxd new file mode 100644 index 00000000000..446b95afabb --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/edit_distance.pxd @@ -0,0 +1,8 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column + + +cpdef Column edit_distance(Column input, Column targets) + +cpdef Column edit_distance_matrix(Column input) diff --git a/python/pylibcudf/pylibcudf/nvtext/edit_distance.pyi b/python/pylibcudf/pylibcudf/nvtext/edit_distance.pyi new file mode 100644 index 00000000000..85bbbb880ee --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/edit_distance.pyi @@ -0,0 +1,6 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column + +def edit_distance(input: Column, targets: Column) -> Column: ... +def edit_distance_matrix(input: Column) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/nvtext/edit_distance.pyx b/python/pylibcudf/pylibcudf/nvtext/edit_distance.pyx new file mode 100644 index 00000000000..eceeaff24e3 --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/edit_distance.pyx @@ -0,0 +1,64 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.nvtext.edit_distance cimport ( + edit_distance as cpp_edit_distance, + edit_distance_matrix as cpp_edit_distance_matrix, +) + +__all__ = ["edit_distance", "edit_distance_matrix"] + +cpdef Column edit_distance(Column input, Column targets): + """ + Returns the edit distance between individual strings in two strings columns + + For details, see :cpp:func:`edit_distance` + + Parameters + ---------- + input : Column + Input strings + targets : Column + Strings to compute edit distance against + + Returns + ------- + Column + New column of edit distance values + """ + cdef column_view c_strings = input.view() + cdef column_view c_targets = targets.view() + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_edit_distance(c_strings, c_targets) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column edit_distance_matrix(Column input): + """ + Returns the edit distance between all strings in the input strings column + + For details, see :cpp:func:`edit_distance_matrix` + + Parameters + ---------- + input : Column + Input strings + + Returns + ------- + Column + New column of edit distance values + """ + cdef column_view c_strings = input.view() + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_edit_distance_matrix(c_strings) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pxd b/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pxd new file mode 100644 index 00000000000..f15eb1f25e9 --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pxd @@ -0,0 +1,12 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column +from pylibcudf.libcudf.types cimport size_type +from pylibcudf.scalar cimport Scalar + + +cpdef Column generate_ngrams(Column input, size_type ngrams, Scalar separator) + +cpdef Column generate_character_ngrams(Column input, size_type ngrams=*) + +cpdef Column hash_character_ngrams(Column input, size_type ngrams=*) diff --git a/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pyi b/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pyi new file mode 100644 index 00000000000..2757518379d --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pyi @@ -0,0 +1,10 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar + +def generate_ngrams( + input: Column, ngrams: int, separator: Scalar +) -> Column: ... +def generate_character_ngrams(input: Column, ngrams: int = 2) -> Column: ... +def hash_character_ngrams(input: Column, ngrams: int = 2) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pyx b/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pyx new file mode 100644 index 00000000000..521bc0ef4a4 --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pyx @@ -0,0 +1,110 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.nvtext.generate_ngrams cimport ( + generate_character_ngrams as cpp_generate_character_ngrams, + generate_ngrams as cpp_generate_ngrams, + hash_character_ngrams as cpp_hash_character_ngrams, +) +from pylibcudf.libcudf.scalar.scalar cimport string_scalar +from pylibcudf.libcudf.types cimport size_type +from pylibcudf.scalar cimport Scalar + +__all__ = [ + "generate_ngrams", + "generate_character_ngrams", + "hash_character_ngrams", +] + +cpdef Column generate_ngrams(Column input, size_type ngrams, Scalar separator): + """ + Returns a single column of strings by generating ngrams from a strings column. + + For details, see :cpp:func:`generate_ngrams` + + Parameters + ---------- + input : Column + Input strings + ngrams : size_type + The ngram number to generate + separator : Scalar + The string to use for separating ngram tokens + + Returns + ------- + Column + New strings column of tokens + """ + cdef column_view c_strings = input.view() + cdef const string_scalar* c_separator = separator.c_obj.get() + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_generate_ngrams( + c_strings, + ngrams, + c_separator[0] + ) + return Column.from_libcudf(move(c_result)) + + +cpdef Column generate_character_ngrams(Column input, size_type ngrams = 2): + """ + Returns a lists column of ngrams of characters within each string. + + For details, see :cpp:func:`generate_character_ngrams` + + Parameters + ---------- + input : Column + Input strings + ngrams : size_type + The ngram number to generate + + Returns + ------- + Column + Lists column of strings + """ + cdef column_view c_strings = input.view() + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_generate_character_ngrams( + c_strings, + ngrams, + ) + return Column.from_libcudf(move(c_result)) + +cpdef Column hash_character_ngrams(Column input, size_type ngrams = 2): + """ + Returns a lists column of hash values of the characters in each string + + For details, see :cpp:func:`hash_character_ngrams` + + Parameters + ---------- + input : Column + Input strings + ngrams : size_type + The ngram number to generate + + Returns + ------- + Column + Lists column of hash values + """ + cdef column_view c_strings = input.view() + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_hash_character_ngrams( + c_strings, + ngrams, + ) + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/nvtext/jaccard.pxd b/python/pylibcudf/pylibcudf/nvtext/jaccard.pxd new file mode 100644 index 00000000000..a4d4a15335b --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/jaccard.pxd @@ -0,0 +1,7 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column +from pylibcudf.libcudf.types cimport size_type + + +cpdef Column jaccard_index(Column input1, Column input2, size_type width) diff --git a/python/pylibcudf/pylibcudf/nvtext/jaccard.pyi b/python/pylibcudf/pylibcudf/nvtext/jaccard.pyi new file mode 100644 index 00000000000..18263c5c8fd --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/jaccard.pyi @@ -0,0 +1,5 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column + +def jaccard_index(input1: Column, input2: Column, width: int) -> Column: ...
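The generate_ngrams and jaccard stubs above pin down the public signatures; a minimal usage sketch (the plc alias and pyarrow interop are assumptions; the strings are illustrative):

    import pyarrow as pa
    import pylibcudf as plc

    s1 = plc.interop.from_arrow(pa.array(["hello world"]))
    s2 = plc.interop.from_arrow(pa.array(["hello there"]))

    # Jaccard similarity of the 5-character ngram sets of each row pair.
    sim = plc.nvtext.jaccard.jaccard_index(s1, s2, 5)

    # Character trigrams of each string, returned as a lists column.
    tri = plc.nvtext.generate_ngrams.generate_character_ngrams(s1, 3)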
diff --git a/python/pylibcudf/pylibcudf/nvtext/jaccard.pyx b/python/pylibcudf/pylibcudf/nvtext/jaccard.pyx new file mode 100644 index 00000000000..90cace088f7 --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/jaccard.pyx @@ -0,0 +1,46 @@ +# Copyright (c) 2023-2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.nvtext.jaccard cimport ( + jaccard_index as cpp_jaccard_index, +) +from pylibcudf.libcudf.types cimport size_type + +__all__ = ["jaccard_index"] + +cpdef Column jaccard_index(Column input1, Column input2, size_type width): + """ + Returns the Jaccard similarity between individual rows in two strings columns. + + For details, see :cpp:func:`jaccard_index` + + Parameters + ---------- + input1 : Column + Input strings column + input2 : Column + Input strings column + width : size_type + The ngram number to generate + + Returns + ------- + Column + Index calculation values + """ + cdef column_view c_input1 = input1.view() + cdef column_view c_input2 = input2.view() + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_jaccard_index( + c_input1, + c_input2, + width + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/nvtext/minhash.pxd b/python/pylibcudf/pylibcudf/nvtext/minhash.pxd new file mode 100644 index 00000000000..6b544282f44 --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/minhash.pxd @@ -0,0 +1,34 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libc.stdint cimport uint32_t, uint64_t +from pylibcudf.column cimport Column +from pylibcudf.libcudf.types cimport size_type +from pylibcudf.scalar cimport Scalar + +ctypedef fused ColumnOrScalar: + Column + Scalar + +cpdef Column minhash(Column input, ColumnOrScalar seeds, size_type width=*) + +cpdef Column minhash_permuted( + Column input, + uint32_t seed, + Column a, + Column b, + size_type width +) + +cpdef Column minhash64(Column input, ColumnOrScalar seeds, size_type width=*) + +cpdef Column minhash64_permuted( + Column input, + uint64_t seed, + Column a, + Column b, + size_type width +) + +cpdef Column word_minhash(Column input, Column seeds) + +cpdef Column word_minhash64(Column input, Column seeds) diff --git a/python/pylibcudf/pylibcudf/nvtext/minhash.pyi b/python/pylibcudf/pylibcudf/nvtext/minhash.pyi new file mode 100644 index 00000000000..a2d9b6364f7 --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/minhash.pyi @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar + +def minhash( + input: Column, seeds: Column | Scalar, width: int = 4 +) -> Column: ... +def minhash64( + input: Column, seeds: Column | Scalar, width: int = 4 +) -> Column: ... +def word_minhash(input: Column, seeds: Column) -> Column: ... +def word_minhash64(input: Column, seeds: Column) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/nvtext/minhash.pyx b/python/pylibcudf/pylibcudf/nvtext/minhash.pyx new file mode 100644 index 00000000000..5448cc6de9b --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/minhash.pyx @@ -0,0 +1,261 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +from libc.stdint cimport uint32_t, uint64_t +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.nvtext.minhash cimport ( + minhash as cpp_minhash, + minhash64 as cpp_minhash64, + minhash64_permuted as cpp_minhash64_permuted, + minhash_permuted as cpp_minhash_permuted, + word_minhash as cpp_word_minhash, + word_minhash64 as cpp_word_minhash64, +) +from pylibcudf.libcudf.scalar.scalar cimport numeric_scalar +from pylibcudf.libcudf.types cimport size_type +from pylibcudf.scalar cimport Scalar + +from cython.operator import dereference +import warnings + +__all__ = [ + "minhash", + "minhash64", + "minhash64_permuted", + "minhash_permuted", + "word_minhash", + "word_minhash64", +] + +cpdef Column minhash(Column input, ColumnOrScalar seeds, size_type width=4): + """ + Returns the minhash values for each string per seed. + This function uses MurmurHash3_x86_32 for the hash algorithm. + + For details, see :cpp:func:`minhash`. + + Parameters + ---------- + input : Column + Strings column to compute minhash + seeds : Column or Scalar + Seed value(s) used for the hash algorithm. + width : size_type + Character width used to apply substrings; + Default is 4 characters. + + Returns + ------- + Column + List column of minhash values for each string per seed + """ + warnings.warn( + "Starting in version 25.02, the signature of this function will " + "be changed to match pylibcudf.nvtext.minhash_permuted.", + FutureWarning + ) + + cdef unique_ptr[column] c_result + + if not isinstance(seeds, (Column, Scalar)): + raise TypeError("Must pass a Column or Scalar") + + with nogil: + c_result = cpp_minhash( + input.view(), + seeds.view() if ColumnOrScalar is Column else + dereference(seeds.c_obj.get()), + width + ) + + return Column.from_libcudf(move(c_result)) + +cpdef Column minhash_permuted( + Column input, + uint32_t seed, + Column a, + Column b, + size_type width +): + """ + Returns the minhash values for each string. + This function uses MurmurHash3_x86_32 for the hash algorithm. + + For details, see :cpp:func:`minhash_permuted`. + + Parameters + ---------- + input : Column + Strings column to compute minhash + seed : uint32_t + Seed used for the hash function + a : Column + 1st parameter value used for the minhash algorithm. + b : Column + 2nd parameter value used for the minhash algorithm. + width : size_type + Character width used to apply substrings; + + Returns + ------- + Column + List column of minhash values for each string + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_minhash_permuted( + input.view(), + seed, + a.view(), + b.view(), + width + ) + + return Column.from_libcudf(move(c_result)) + +cpdef Column minhash64(Column input, ColumnOrScalar seeds, size_type width=4): + """ + Returns the minhash values for each string per seed. + This function uses MurmurHash3_x64_128 for the hash algorithm. + + For details, see :cpp:func:`minhash64`. + + Parameters + ---------- + input : Column + Strings column to compute minhash + seeds : Column or Scalar + Seed value(s) used for the hash algorithm. + width : size_type + Character width used to apply substrings; + Default is 4 characters.
+ + Returns + ------- + Column + List column of minhash values for each string per seed + """ + warnings.warn( + "Starting in version 25.02, the signature of this function will " + "be changed to match pylibcudf.nvtext.minhash64_permuted.", + FutureWarning + ) + + cdef unique_ptr[column] c_result + + if not isinstance(seeds, (Column, Scalar)): + raise TypeError("Must pass a Column or Scalar") + + with nogil: + c_result = cpp_minhash64( + input.view(), + seeds.view() if ColumnOrScalar is Column else + dereference(seeds.c_obj.get()), + width + ) + + return Column.from_libcudf(move(c_result)) + +cpdef Column minhash64_permuted( + Column input, + uint64_t seed, + Column a, + Column b, + size_type width +): + """ + Returns the minhash values for each string. + This function uses MurmurHash3_x64_128 for the hash algorithm. + + For details, see :cpp:func:`minhash64_permuted`. + + Parameters + ---------- + input : Column + Strings column to compute minhash + seed : uint64_t + Seed used for the hash function + a : Column + 1st parameter value used for the minhash algorithm. + b : Column + 2nd parameter value used for the minhash algorithm. + width : size_type + Character width used to apply substrings; + + Returns + ------- + Column + List column of minhash values for each string + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_minhash64_permuted( + input.view(), + seed, + a.view(), + b.view(), + width + ) + + return Column.from_libcudf(move(c_result)) + +cpdef Column word_minhash(Column input, Column seeds): + """ + Returns the minhash values for each row of strings per seed. + This function uses MurmurHash3_x86_32 for the hash algorithm. + + For details, see :cpp:func:`word_minhash`. + + Parameters + ---------- + input : Column + Lists column of strings to compute minhash + seeds : Column + Seed values used for the hash algorithm. + + Returns + ------- + Column + List column of minhash values for each string per seed + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_word_minhash( + input.view(), + seeds.view() + ) + + return Column.from_libcudf(move(c_result)) + +cpdef Column word_minhash64(Column input, Column seeds): + """ + Returns the minhash values for each row of strings per seed. + This function uses MurmurHash3_x64_128 for the hash algorithm though + only the first 64 bits of the hash are used in computing the output. + + For details, see :cpp:func:`word_minhash64`. + + Parameters + ---------- + input : Column + Lists column of strings to compute minhash + seeds : Column + Seed values used for the hash algorithm. + + Returns + ------- + Column + List column of minhash values for each string per seed + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_word_minhash64( + input.view(), + seeds.view() + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/nvtext/ngrams_tokenize.pxd b/python/pylibcudf/pylibcudf/nvtext/ngrams_tokenize.pxd new file mode 100644 index 00000000000..4f791ba1ee9 --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/ngrams_tokenize.pxd @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION.
+ +from pylibcudf.column cimport Column +from pylibcudf.libcudf.types cimport size_type +from pylibcudf.scalar cimport Scalar + + +cpdef Column ngrams_tokenize( + Column input, + size_type ngrams, + Scalar delimiter, + Scalar separator +) diff --git a/python/pylibcudf/pylibcudf/nvtext/ngrams_tokenize.pyi b/python/pylibcudf/pylibcudf/nvtext/ngrams_tokenize.pyi new file mode 100644 index 00000000000..224640ed44d --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/ngrams_tokenize.pyi @@ -0,0 +1,8 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar + +def ngrams_tokenize( + input: Column, ngrams: int, delimiter: Scalar, separator: Scalar +) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/nvtext/ngrams_tokenize.pyx b/python/pylibcudf/pylibcudf/nvtext/ngrams_tokenize.pyx new file mode 100644 index 00000000000..771c7c019fc --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/ngrams_tokenize.pyx @@ -0,0 +1,55 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from cython.operator cimport dereference +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.nvtext.ngrams_tokenize cimport ( + ngrams_tokenize as cpp_ngrams_tokenize, +) +from pylibcudf.libcudf.scalar.scalar cimport string_scalar +from pylibcudf.libcudf.types cimport size_type +from pylibcudf.scalar cimport Scalar + +__all__ = ["ngrams_tokenize"] + +cpdef Column ngrams_tokenize( + Column input, + size_type ngrams, + Scalar delimiter, + Scalar separator +): + """ + Returns a single column of strings by tokenizing the input strings column + and then producing ngrams of each string. + + For details, see :cpp:func:`ngrams_tokenize` + + Parameters + ---------- + input : Column + Input strings + ngrams : size_type + The ngram number to generate + delimiter : Scalar + UTF-8 characters used to separate each string into tokens. + An empty string will separate tokens using whitespace. + separator : Scalar + The string to use for separating ngram tokens + + Returns + ------- + Column + New strings columns of tokens + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_ngrams_tokenize( + input.view(), + ngrams, + dereference(delimiter.get()), + dereference(separator.get()), + ) + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/nvtext/normalize.pxd b/python/pylibcudf/pylibcudf/nvtext/normalize.pxd new file mode 100644 index 00000000000..90676145afa --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/normalize.pxd @@ -0,0 +1,9 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp cimport bool +from pylibcudf.column cimport Column + + +cpdef Column normalize_spaces(Column input) + +cpdef Column normalize_characters(Column input, bool do_lower_case) diff --git a/python/pylibcudf/pylibcudf/nvtext/normalize.pyi b/python/pylibcudf/pylibcudf/nvtext/normalize.pyi new file mode 100644 index 00000000000..1d90a5a8960 --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/normalize.pyi @@ -0,0 +1,6 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column + +def normalize_spaces(input: Column) -> Column: ... +def normalize_characters(input: Column, do_lower_case: bool) -> Column: ... 
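The normalize stubs above expose the two normalizers; a minimal sketch of the pair in use (the plc alias and pyarrow interop are assumptions; the string is illustrative):

    import pyarrow as pa
    import pylibcudf as plc

    col = plc.interop.from_arrow(pa.array(["  Héllo \t World  "]))

    # Collapse runs of whitespace between tokens.
    spaced = plc.nvtext.normalize.normalize_spaces(col)

    # Lower-case and strip accents ahead of tokenizing.
    normed = plc.nvtext.normalize.normalize_characters(col, True)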
diff --git a/python/pylibcudf/pylibcudf/nvtext/normalize.pyx b/python/pylibcudf/pylibcudf/nvtext/normalize.pyx new file mode 100644 index 00000000000..b259ccaefa6 --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/normalize.pyx @@ -0,0 +1,65 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp cimport bool +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.nvtext.normalize cimport ( + normalize_characters as cpp_normalize_characters, + normalize_spaces as cpp_normalize_spaces, +) + +__all__ = ["normalize_characters", "normalize_spaces"] + +cpdef Column normalize_spaces(Column input): + """ + Returns a new strings column by normalizing the whitespace in + each string in the input column. + + For details, see :cpp:func:`normalize_spaces` + + Parameters + ---------- + input : Column + Input strings + + Returns + ------- + Column + New strings columns of normalized strings. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_normalize_spaces(input.view()) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column normalize_characters(Column input, bool do_lower_case): + """ + Normalizes strings characters for tokenizing. + + For details, see :cpp:func:`normalize_characters` + + Parameters + ---------- + input : Column + Input strings + do_lower_case : bool + If true, upper-case characters are converted to lower-case + and accents are stripped from those characters. If false, + accented and upper-case characters are not transformed. + + Returns + ------- + Column + Normalized strings column + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_normalize_characters(input.view(), do_lower_case) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/nvtext/replace.pxd b/python/pylibcudf/pylibcudf/nvtext/replace.pxd new file mode 100644 index 00000000000..624f90e7486 --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/replace.pxd @@ -0,0 +1,20 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column +from pylibcudf.libcudf.types cimport size_type +from pylibcudf.scalar cimport Scalar + + +cpdef Column replace_tokens( + Column input, + Column targets, + Column replacements, + Scalar delimiter=*, +) + +cpdef Column filter_tokens( + Column input, + size_type min_token_length, + Scalar replacement=*, + Scalar delimiter=* +) diff --git a/python/pylibcudf/pylibcudf/nvtext/replace.pyi b/python/pylibcudf/pylibcudf/nvtext/replace.pyi new file mode 100644 index 00000000000..1f1ac72ce7c --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/replace.pyi @@ -0,0 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar + +def replace_tokens( + input: Column, + targets: Column, + replacements: Column, + delimiter: Scalar | None = None, +) -> Column: ... +def filter_tokens( + input: Column, + min_token_length: int, + replacement: Scalar | None = None, + delimiter: Scalar | None = None, +) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/nvtext/replace.pyx b/python/pylibcudf/pylibcudf/nvtext/replace.pyx new file mode 100644 index 00000000000..a27592fb434 --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/replace.pyx @@ -0,0 +1,110 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +from cython.operator cimport dereference +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.nvtext.replace cimport ( + filter_tokens as cpp_filter_tokens, + replace_tokens as cpp_replace_tokens, +) +from pylibcudf.libcudf.scalar.scalar cimport string_scalar +from pylibcudf.libcudf.scalar.scalar_factories cimport ( + make_string_scalar as cpp_make_string_scalar, +) +from pylibcudf.libcudf.types cimport size_type +from pylibcudf.scalar cimport Scalar + +__all__ = ["filter_tokens", "replace_tokens"] + +cpdef Column replace_tokens( + Column input, + Column targets, + Column replacements, + Scalar delimiter=None, +): + """ + Replaces specified tokens with corresponding replacement strings. + + For details, see :cpp:func:`replace_tokens` + + Parameters + ---------- + input : Column + Strings column to replace + targets : Column + Strings to compare against tokens found in ``input`` + replacements : Column + Replacement strings for each string in ``targets`` + delimiter : Scalar, optional + Characters used to separate each string into tokens. + The default of empty string will identify tokens using whitespace. + + Returns + ------- + Column + New strings column with replaced strings + """ + cdef unique_ptr[column] c_result + if delimiter is None: + delimiter = Scalar.from_libcudf( + cpp_make_string_scalar("".encode()) + ) + with nogil: + c_result = cpp_replace_tokens( + input.view(), + targets.view(), + replacements.view(), + dereference(delimiter.get()), + ) + return Column.from_libcudf(move(c_result)) + + +cpdef Column filter_tokens( + Column input, + size_type min_token_length, + Scalar replacement=None, + Scalar delimiter=None +): + """ + Removes tokens whose lengths are less than a specified number of characters. + + For details, see :cpp:func:`filter_tokens` + + Parameters + ---------- + input : Column + Strings column to replace + min_token_length : size_type + The minimum number of characters to retain a + token in the output string + replacement : Scalar, optional + Optional replacement string to be used in place of removed tokens + delimiter : Scalar, optional + Characters used to separate each string into tokens. + The default of empty string will identify tokens using whitespace. + Returns + ------- + Column + New strings column of filtered strings + """ + cdef unique_ptr[column] c_result + if delimiter is None: + delimiter = Scalar.from_libcudf( + cpp_make_string_scalar("".encode()) + ) + if replacement is None: + replacement = Scalar.from_libcudf( + cpp_make_string_scalar("".encode()) + ) + + with nogil: + c_result = cpp_filter_tokens( + input.view(), + min_token_length, + dereference(replacement.get()), + dereference(delimiter.get()), + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/nvtext/stemmer.pxd b/python/pylibcudf/pylibcudf/nvtext/stemmer.pxd new file mode 100644 index 00000000000..48762efc01f --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/stemmer.pxd @@ -0,0 +1,14 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +from libcpp cimport bool +from pylibcudf.column cimport Column +from pylibcudf.libcudf.nvtext.stemmer cimport letter_type +from pylibcudf.libcudf.types cimport size_type + +ctypedef fused ColumnOrSize: + Column + size_type + +cpdef Column is_letter(Column input, bool check_vowels, ColumnOrSize indices) + +cpdef Column porter_stemmer_measure(Column input) diff --git a/python/pylibcudf/pylibcudf/nvtext/stemmer.pyi b/python/pylibcudf/pylibcudf/nvtext/stemmer.pyi new file mode 100644 index 00000000000..d6ba1d189bd --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/stemmer.pyi @@ -0,0 +1,8 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column + +def is_letter( + input: Column, check_vowels: bool, indices: Column | int +) -> Column: ... +def porter_stemmer_measure(input: Column) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/nvtext/stemmer.pyx b/python/pylibcudf/pylibcudf/nvtext/stemmer.pyx new file mode 100644 index 00000000000..c9e4f1274e4 --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/stemmer.pyx @@ -0,0 +1,77 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp cimport bool +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.nvtext.stemmer cimport ( + is_letter as cpp_is_letter, + letter_type, + porter_stemmer_measure as cpp_porter_stemmer_measure, +) +from pylibcudf.libcudf.types cimport size_type + +__all__ = ["is_letter", "porter_stemmer_measure"] + +cpdef Column is_letter( + Column input, + bool check_vowels, + ColumnOrSize indices +): + """ + Returns boolean column indicating if the character + or characters at the provided character index or + indices (respectively) are consonants or vowels + + For details, see :cpp:func:`is_letter` + + Parameters + ---------- + input : Column + Input strings + check_vowels : bool + If true, the check is for vowels. Otherwise the check is + for consonants. + indices : Union[Column, size_type] + The character position(s) to check in each string + + Returns + ------- + Column + New boolean column. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_is_letter( + input.view(), + letter_type.VOWEL if check_vowels else letter_type.CONSONANT, + indices if ColumnOrSize is size_type else indices.view() + ) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column porter_stemmer_measure(Column input): + """ + Returns the Porter Stemmer measurements of a strings column. + + For details, see :cpp:func:`porter_stemmer_measure` + + Parameters + ---------- + input : Column + Strings column of words to measure + + Returns + ------- + Column + New column of measure values + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_porter_stemmer_measure(input.view()) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/nvtext/subword_tokenize.pxd b/python/pylibcudf/pylibcudf/nvtext/subword_tokenize.pxd new file mode 100644 index 00000000000..091c7b897ac --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/subword_tokenize.pxd @@ -0,0 +1,20 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +from libc.stdint cimport uint32_t +from libcpp cimport bool +from libcpp.memory cimport unique_ptr +from pylibcudf.column cimport Column +from pylibcudf.libcudf.nvtext.subword_tokenize cimport hashed_vocabulary + + +cdef class HashedVocabulary: + cdef unique_ptr[hashed_vocabulary] c_obj + +cpdef tuple[Column, Column, Column] subword_tokenize( + Column input, + HashedVocabulary vocabulary_table, + uint32_t max_sequence_length, + uint32_t stride, + bool do_lower_case, + bool do_truncate, +) diff --git a/python/pylibcudf/pylibcudf/nvtext/subword_tokenize.pyi b/python/pylibcudf/pylibcudf/nvtext/subword_tokenize.pyi new file mode 100644 index 00000000000..f6618e296b1 --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/subword_tokenize.pyi @@ -0,0 +1,15 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column + +class HashedVocabulary: + def __init__(self, hash_file: str): ... + +def subword_tokenize( + input: Column, + vocabulary_table: HashedVocabulary, + max_sequence_length: int, + stride: int, + do_lower_case: bool, + do_truncate: bool, +) -> tuple[Column, Column, Column]: ... diff --git a/python/pylibcudf/pylibcudf/nvtext/subword_tokenize.pyx b/python/pylibcudf/pylibcudf/nvtext/subword_tokenize.pyx new file mode 100644 index 00000000000..14fb6f5fe1e --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/subword_tokenize.pyx @@ -0,0 +1,87 @@ +# Copyright (c) 2020-2024, NVIDIA CORPORATION. + +from cython.operator cimport dereference +from libc.stdint cimport uint32_t +from libcpp cimport bool +from libcpp.string cimport string +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.nvtext.subword_tokenize cimport ( + load_vocabulary_file as cpp_load_vocabulary_file, + move as tr_move, + subword_tokenize as cpp_subword_tokenize, + tokenizer_result as cpp_tokenizer_result, +) + +__all__ = ["HashedVocabulary", "subword_tokenize"] + +cdef class HashedVocabulary: + """The vocabulary data for use with the subword_tokenize function. + + For details, see :cpp:class:`cudf::nvtext::hashed_vocabulary`. + """ + def __cinit__(self, hash_file): + cdef string c_hash_file = str(hash_file).encode() + with nogil: + self.c_obj = move(cpp_load_vocabulary_file(c_hash_file)) + + __hash__ = None + +cpdef tuple[Column, Column, Column] subword_tokenize( + Column input, + HashedVocabulary vocabulary_table, + uint32_t max_sequence_length, + uint32_t stride, + bool do_lower_case, + bool do_truncate, +): + """ + Creates a tokenizer that cleans the text, splits it into + tokens and returns token-ids from an input vocabulary. + + For details, see cpp:func:`subword_tokenize` + + Parameters + ---------- + input : Column + The input strings to tokenize. + vocabulary_table : HashedVocabulary + The vocabulary table pre-loaded into this object. + max_sequence_length : uint32_t + Limit of the number of token-ids per row in final tensor for each string. + stride : uint32_t + Each row in the output token-ids will replicate + ``max_sequence_length`` - ``stride`` the token-ids + from the previous row, unless it is the first string. + do_lower_case : bool + If true, the tokenizer will convert uppercase characters in the + input stream to lower-case and strip accents from those characters. + If false, accented and uppercase characters are not transformed. + do_truncate : bool + If true, the tokenizer will discard all the token-ids after + ``max_sequence_length`` for each input string. 
If false, it + will use a new row in the output token-ids to continue + generating the output. + + Returns + ------- + tuple[Column, Column, Column] + A tuple of three columns containing the + tokens, masks, and metadata. + """ + cdef cpp_tokenizer_result c_result + with nogil: + c_result = tr_move( + cpp_subword_tokenize( + input.view(), + dereference(vocabulary_table.c_obj.get()), + max_sequence_length, + stride, + do_lower_case, + do_truncate, + ) + ) + cdef Column tokens = Column.from_libcudf(move(c_result.tensor_token_ids)) + cdef Column masks = Column.from_libcudf(move(c_result.tensor_attention_mask)) + cdef Column metadata = Column.from_libcudf(move(c_result.tensor_metadata)) + return tokens, masks, metadata diff --git a/python/pylibcudf/pylibcudf/nvtext/tokenize.pxd b/python/pylibcudf/pylibcudf/nvtext/tokenize.pxd new file mode 100644 index 00000000000..0aed9702d61 --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/tokenize.pxd @@ -0,0 +1,29 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from pylibcudf.column cimport Column +from pylibcudf.libcudf.nvtext.tokenize cimport tokenize_vocabulary +from pylibcudf.libcudf.types cimport size_type +from pylibcudf.scalar cimport Scalar + +cdef class TokenizeVocabulary: + cdef unique_ptr[tokenize_vocabulary] c_obj + +cpdef Column tokenize_scalar(Column input, Scalar delimiter=*) + +cpdef Column tokenize_column(Column input, Column delimiters) + +cpdef Column count_tokens_scalar(Column input, Scalar delimiter=*) + +cpdef Column count_tokens_column(Column input, Column delimiters) + +cpdef Column character_tokenize(Column input) + +cpdef Column detokenize(Column input, Column row_indices, Scalar separator=*) + +cpdef Column tokenize_with_vocabulary( + Column input, + TokenizeVocabulary vocabulary, + Scalar delimiter, + size_type default_id=* +) diff --git a/python/pylibcudf/pylibcudf/nvtext/tokenize.pyi b/python/pylibcudf/pylibcudf/nvtext/tokenize.pyi new file mode 100644 index 00000000000..b9aa2393514 --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/tokenize.pyi @@ -0,0 +1,26 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar + +class TokenizeVocabulary: + def __init__(self, vocab: Column): ... + +def tokenize_scalar( + input: Column, delimiter: Scalar | None = None +) -> Column: ... +def tokenize_column(input: Column, delimiters: Column) -> Column: ... +def count_tokens_scalar( + input: Column, delimiter: Scalar | None = None +) -> Column: ... +def count_tokens_column(input: Column, delimiters: Column) -> Column: ... +def character_tokenize(input: Column) -> Column: ... +def detokenize( + input: Column, row_indices: Column, separator: Scalar | None = None +) -> Column: ... +def tokenize_with_vocabulary( + input: Column, + vocabulary: TokenizeVocabulary, + delimiter: Scalar, + default_id: int = -1, +) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/nvtext/tokenize.pyx b/python/pylibcudf/pylibcudf/nvtext/tokenize.pyx new file mode 100644 index 00000000000..43d426489b4 --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/tokenize.pyx @@ -0,0 +1,274 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+
+from cython.operator cimport dereference
+from libcpp.memory cimport unique_ptr
+from libcpp.utility cimport move
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.nvtext.tokenize cimport (
+    character_tokenize as cpp_character_tokenize,
+    count_tokens as cpp_count_tokens,
+    detokenize as cpp_detokenize,
+    load_vocabulary as cpp_load_vocabulary,
+    tokenize as cpp_tokenize,
+    tokenize_with_vocabulary as cpp_tokenize_with_vocabulary,
+)
+from pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from pylibcudf.libcudf.scalar.scalar_factories cimport (
+    make_string_scalar as cpp_make_string_scalar,
+)
+from pylibcudf.libcudf.types cimport size_type
+
+__all__ = [
+    "TokenizeVocabulary",
+    "character_tokenize",
+    "count_tokens_column",
+    "count_tokens_scalar",
+    "detokenize",
+    "tokenize_column",
+    "tokenize_scalar",
+    "tokenize_with_vocabulary",
+]
+
+cdef class TokenizeVocabulary:
+    """The Vocabulary object to be used with ``tokenize_with_vocabulary``.
+
+    For details, see :cpp:class:`cudf::nvtext::tokenize_vocabulary`.
+    """
+    def __cinit__(self, Column vocab):
+        cdef column_view c_vocab = vocab.view()
+        with nogil:
+            self.c_obj = move(cpp_load_vocabulary(c_vocab))
+
+    __hash__ = None
+
+cpdef Column tokenize_scalar(Column input, Scalar delimiter=None):
+    """
+    Returns a single column of strings by tokenizing the input
+    strings column using the provided characters as delimiters.
+
+    For details, see :cpp:func:`cudf::nvtext::tokenize`
+
+    Parameters
+    ----------
+    input : Column
+        Strings column to tokenize
+    delimiter : Scalar
+        String scalar used to separate individual strings into tokens
+
+    Returns
+    -------
+    Column
+        New strings column of tokens
+    """
+    cdef unique_ptr[column] c_result
+
+    if delimiter is None:
+        delimiter = Scalar.from_libcudf(
+            cpp_make_string_scalar("".encode())
+        )
+
+    with nogil:
+        c_result = cpp_tokenize(
+            input.view(),
+            dereference(<const string_scalar*>delimiter.c_obj.get()),
+        )
+
+    return Column.from_libcudf(move(c_result))
+
+cpdef Column tokenize_column(Column input, Column delimiters):
+    """
+    Returns a single column of strings by tokenizing the input
+    strings column using multiple strings as delimiters.
+
+    For details, see :cpp:func:`cudf::nvtext::tokenize`
+
+    Parameters
+    ----------
+    input : Column
+        Strings column to tokenize
+    delimiters : Column
+        Strings column used to separate individual strings into tokens
+
+    Returns
+    -------
+    Column
+        New strings column of tokens
+    """
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = cpp_tokenize(
+            input.view(),
+            delimiters.view(),
+        )
+
+    return Column.from_libcudf(move(c_result))
+
+cpdef Column count_tokens_scalar(Column input, Scalar delimiter=None):
+    """
+    Returns the number of tokens in each string of a strings column
+    using the provided characters as delimiters.
+
+    For details, see :cpp:func:`cudf::nvtext::count_tokens`
+
+    Parameters
+    ----------
+    input : Column
+        Strings column to count tokens
+    delimiter : Scalar
+        String scalar used to separate each string into tokens
+
+    Returns
+    -------
+    Column
+        New column of token counts
+    """
+    cdef unique_ptr[column] c_result
+
+    if delimiter is None:
+        delimiter = Scalar.from_libcudf(
+            cpp_make_string_scalar("".encode())
+        )
+
+    with nogil:
+        c_result = cpp_count_tokens(
+            input.view(),
+            dereference(<const string_scalar*>delimiter.c_obj.get()),
+        )
+
+    return Column.from_libcudf(move(c_result))
+
+cpdef Column count_tokens_column(Column input, Column delimiters):
+    """
+    Returns the number of tokens in each string of a strings column
+    using multiple strings as delimiters.
+
+    For details, see :cpp:func:`cudf::nvtext::count_tokens`
+
+    Parameters
+    ----------
+    input : Column
+        Strings column to count tokens
+    delimiters : Column
+        Strings column used to separate each string into tokens
+
+    Returns
+    -------
+    Column
+        New column of token counts
+    """
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = cpp_count_tokens(
+            input.view(),
+            delimiters.view(),
+        )
+
+    return Column.from_libcudf(move(c_result))
+
+cpdef Column character_tokenize(Column input):
+    """
+    Returns a single column of strings by converting
+    each character to a string.
+
+    For details, see :cpp:func:`cudf::nvtext::character_tokenize`
+
+    Parameters
+    ----------
+    input : Column
+        Strings column to tokenize
+
+    Returns
+    -------
+    Column
+        New strings column of tokens
+    """
+    cdef unique_ptr[column] c_result
+    with nogil:
+        c_result = cpp_character_tokenize(input.view())
+
+    return Column.from_libcudf(move(c_result))
+
+cpdef Column detokenize(
+    Column input,
+    Column row_indices,
+    Scalar separator=None
+):
+    """
+    Creates a strings column from a strings column of tokens
+    and an associated column of row ids.
+
+    For details, see :cpp:func:`cudf::nvtext::detokenize`
+
+    Parameters
+    ----------
+    input : Column
+        Strings column to detokenize
+    row_indices : Column
+        The relative output row index assigned for each token in the input column
+    separator : Scalar
+        String to append after concatenating each token to the proper output row
+
+    Returns
+    -------
+    Column
+        New strings column of concatenated tokens
+    """
+    cdef unique_ptr[column] c_result
+
+    if separator is None:
+        separator = Scalar.from_libcudf(
+            cpp_make_string_scalar(" ".encode())
+        )
+
+    with nogil:
+        c_result = cpp_detokenize(
+            input.view(),
+            row_indices.view(),
+            dereference(<const string_scalar*>separator.c_obj.get())
+        )
+
+    return Column.from_libcudf(move(c_result))
+
+cpdef Column tokenize_with_vocabulary(
+    Column input,
+    TokenizeVocabulary vocabulary,
+    Scalar delimiter,
+    size_type default_id=-1
+):
+    """
+    Returns the token ids for the input strings by looking
+    up each delimited token in the given vocabulary.
+
+    For details, see :cpp:func:`cudf::nvtext::tokenize_with_vocabulary`
+
+    Parameters
+    ----------
+    input : Column
+        Strings column to tokenize
+    vocabulary : TokenizeVocabulary
+        Used to lookup tokens within ``input``
+    delimiter : Scalar
+        Used to identify tokens within ``input``
+    default_id : size_type
+        The token id to be used for tokens not found in the vocabulary.
+        Default is -1
+
+    Returns
+    -------
+    Column
+        Lists column of token ids
+    """
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = cpp_tokenize_with_vocabulary(
+            input.view(),
+            dereference(vocabulary.c_obj.get()),
+            dereference(<const string_scalar*>delimiter.c_obj.get()),
+            default_id
+        )
+
+    return Column.from_libcudf(move(c_result))
diff --git a/python/pylibcudf/pylibcudf/partitioning.pyi b/python/pylibcudf/pylibcudf/partitioning.pyi
new file mode 100644
index 00000000000..48a2ade23f1
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/partitioning.pyi
@@ -0,0 +1,14 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.column import Column
+from pylibcudf.table import Table
+
+def hash_partition(
+    input: Table, columns_to_hash: list[int], num_partitions: int
+) -> tuple[Table, list[int]]: ...
+def partition(
+    t: Table, partition_map: Column, num_partitions: int
+) -> tuple[Table, list[int]]: ...
+def round_robin_partition(
+    input: Table, num_partitions: int, start_partition: int = 0
+) -> tuple[Table, list[int]]: ...
diff --git a/python/pylibcudf/pylibcudf/partitioning.pyx b/python/pylibcudf/pylibcudf/partitioning.pyx
index 8fa70daab5a..1dacabceb06 100644
--- a/python/pylibcudf/pylibcudf/partitioning.pyx
+++ b/python/pylibcudf/pylibcudf/partitioning.pyx
@@ -11,6 +11,11 @@ from pylibcudf.libcudf.table.table cimport table
 from .column cimport Column
 from .table cimport Table
 
+__all__ = [
+    "hash_partition",
+    "partition",
+    "round_robin_partition",
+]
 
 cpdef tuple[Table, list] hash_partition(
     Table input,
@@ -41,10 +46,10 @@ cpdef tuple[Table, list] hash_partition(
     cdef int c_num_partitions = num_partitions
 
     with nogil:
-        c_result = move(
-            cpp_partitioning.hash_partition(
-                input.view(), c_columns_to_hash, c_num_partitions
-            )
+        c_result = cpp_partitioning.hash_partition(
+            input.view(),
+            c_columns_to_hash,
+            c_num_partitions
         )
 
     return Table.from_libcudf(move(c_result.first)), list(c_result.second)
@@ -74,8 +79,10 @@ cpdef tuple[Table, list] partition(Table t, Column partition_map, int num_partit
     cdef int c_num_partitions = num_partitions
 
     with nogil:
-        c_result = move(
-            cpp_partitioning.partition(t.view(), partition_map.view(), c_num_partitions)
+        c_result = cpp_partitioning.partition(
+            t.view(),
+            partition_map.view(),
+            c_num_partitions
         )
 
     return Table.from_libcudf(move(c_result.first)), list(c_result.second)
@@ -111,10 +118,8 @@ cpdef tuple[Table, list] round_robin_partition(
     cdef int c_start_partition = start_partition
 
     with nogil:
-        c_result = move(
-            cpp_partitioning.round_robin_partition(
-                input.view(), c_num_partitions, c_start_partition
-            )
+        c_result = cpp_partitioning.round_robin_partition(
+            input.view(), c_num_partitions, c_start_partition
         )
 
     return Table.from_libcudf(move(c_result.first)), list(c_result.second)
diff --git a/python/pylibcudf/pylibcudf/py.typed b/python/pylibcudf/pylibcudf/py.typed
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/python/pylibcudf/pylibcudf/quantiles.pyi b/python/pylibcudf/pylibcudf/quantiles.pyi
new file mode 100644
index 00000000000..dca6eed013a
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/quantiles.pyi
@@ -0,0 +1,23 @@
+# Copyright (c) 2024, NVIDIA
CORPORATION. + +from collections.abc import Sequence + +from pylibcudf.column import Column +from pylibcudf.table import Table +from pylibcudf.types import Interpolation, NullOrder, Order, Sorted + +def quantile( + input: Column, + q: Sequence[float], + interp: Interpolation = Interpolation.LINEAR, + ordered_indices: Column | None = None, + exact: bool = True, +) -> Column: ... +def quantiles( + input: Table, + q: Sequence[float], + interp: Interpolation = Interpolation.NEAREST, + is_input_sorted: Sorted = Sorted.NO, + column_order: list[Order] | None = None, + null_precedence: list[NullOrder] | None = None, +) -> Table: ... diff --git a/python/pylibcudf/pylibcudf/quantiles.pyx b/python/pylibcudf/pylibcudf/quantiles.pyx index 3a771fbe7ef..634218586ac 100644 --- a/python/pylibcudf/pylibcudf/quantiles.pyx +++ b/python/pylibcudf/pylibcudf/quantiles.pyx @@ -17,6 +17,7 @@ from .column cimport Column from .table cimport Table from .types cimport interpolation +__all__ = ["quantile", "quantiles"] cpdef Column quantile( Column input, @@ -66,14 +67,12 @@ cpdef Column quantile( ordered_indices_view = ordered_indices.view() with nogil: - c_result = move( - cpp_quantile( - input.view(), - q, - interp, - ordered_indices_view, - exact, - ) + c_result = cpp_quantile( + input.view(), + q, + interp, + ordered_indices_view, + exact, ) return Column.from_libcudf(move(c_result)) @@ -141,15 +140,13 @@ cpdef Table quantiles( null_precedence_vec = null_precedence with nogil: - c_result = move( - cpp_quantiles( - input.view(), - q, - interp, - is_input_sorted, - column_order_vec, - null_precedence_vec, - ) + c_result = cpp_quantiles( + input.view(), + q, + interp, + is_input_sorted, + column_order_vec, + null_precedence_vec, ) return Table.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/reduce.pyi b/python/pylibcudf/pylibcudf/reduce.pyi new file mode 100644 index 00000000000..a09949b7b30 --- /dev/null +++ b/python/pylibcudf/pylibcudf/reduce.pyi @@ -0,0 +1,16 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from enum import IntEnum + +from pylibcudf.aggregation import Aggregation +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar +from pylibcudf.types import DataType + +class ScanType(IntEnum): + INCLUSIVE = ... + EXCLUSIVE = ... + +def reduce(col: Column, agg: Aggregation, data_type: DataType) -> Scalar: ... +def scan(col: Column, agg: Aggregation, inclusive: ScanType) -> Column: ... +def minmax(col: Column) -> tuple[Scalar, Scalar]: ... 
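A minimal usage sketch of the reduce stubs above (illustrative only, not part of the patch; assumes pyarrow for host data and the existing pylibcudf.interop helpers):

import pyarrow as pa
import pylibcudf as plc

col = plc.interop.from_arrow(pa.array([3, 1, 4, 1, 5], type=pa.int64()))

# reduce() takes an aggregation plus the desired output DataType.
total = plc.reduce.reduce(
    col,
    plc.aggregation.sum(),
    plc.types.DataType(plc.types.TypeId.INT64),
)

# minmax() computes both extrema in a single pass over the column.
lo, hi = plc.reduce.minmax(col)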
diff --git a/python/pylibcudf/pylibcudf/reduce.pyx b/python/pylibcudf/pylibcudf/reduce.pyx index b0212a5b9c1..1d6ffd9de10 100644 --- a/python/pylibcudf/pylibcudf/reduce.pyx +++ b/python/pylibcudf/pylibcudf/reduce.pyx @@ -16,6 +16,7 @@ from .types cimport DataType from pylibcudf.libcudf.reduce import scan_type as ScanType # no-cython-lint +__all__ = ["ScanType", "minmax", "reduce", "scan"] cpdef Scalar reduce(Column col, Aggregation agg, DataType data_type): """Perform a reduction on a column @@ -39,12 +40,10 @@ cpdef Scalar reduce(Column col, Aggregation agg, DataType data_type): cdef unique_ptr[scalar] result cdef const reduce_aggregation *c_agg = agg.view_underlying_as_reduce() with nogil: - result = move( - cpp_reduce.cpp_reduce( - col.view(), - dereference(c_agg), - data_type.c_obj - ) + result = cpp_reduce.cpp_reduce( + col.view(), + dereference(c_agg), + data_type.c_obj ) return Scalar.from_libcudf(move(result)) @@ -71,12 +70,10 @@ cpdef Column scan(Column col, Aggregation agg, scan_type inclusive): cdef unique_ptr[column] result cdef const scan_aggregation *c_agg = agg.view_underlying_as_scan() with nogil: - result = move( - cpp_reduce.cpp_scan( - col.view(), - dereference(c_agg), - inclusive, - ) + result = cpp_reduce.cpp_scan( + col.view(), + dereference(c_agg), + inclusive, ) return Column.from_libcudf(move(result)) @@ -99,7 +96,7 @@ cpdef tuple minmax(Column col): """ cdef pair[unique_ptr[scalar], unique_ptr[scalar]] result with nogil: - result = move(cpp_reduce.cpp_minmax(col.view())) + result = cpp_reduce.cpp_minmax(col.view()) return ( Scalar.from_libcudf(move(result.first)), diff --git a/python/pylibcudf/pylibcudf/replace.pyi b/python/pylibcudf/pylibcudf/replace.pyi new file mode 100644 index 00000000000..eed7a2a6c52 --- /dev/null +++ b/python/pylibcudf/pylibcudf/replace.pyi @@ -0,0 +1,29 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from enum import IntEnum + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar + +class ReplacePolicy(IntEnum): + PRECEDING = ... + FOLLOWING = ... + +def replace_nulls( + source_column: Column, replacement: Column | Scalar | ReplacePolicy +) -> Column: ... +def find_and_replace_all( + source_column: Column, + values_to_replace: Column, + replacement_values: Column, +) -> Column: ... +def clamp( + source_column: Column, + lo: Scalar, + hi: Scalar, + lo_replace: Scalar | None = None, + hi_replace: Scalar | None = None, +) -> Column: ... +def normalize_nans_and_zeros( + source_column: Column, inplace: bool = False +) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/replace.pyx b/python/pylibcudf/pylibcudf/replace.pyx index 115dee132fd..51be2b29277 100644 --- a/python/pylibcudf/pylibcudf/replace.pyx +++ b/python/pylibcudf/pylibcudf/replace.pyx @@ -15,6 +15,14 @@ from pylibcudf.libcudf.replace import \ from .column cimport Column from .scalar cimport Scalar +__all__ = [ + "ReplacePolicy", + "clamp", + "find_and_replace_all", + "normalize_nans_and_zeros", + "replace_nulls", +] + cpdef Column replace_nulls(Column source_column, ReplacementType replacement): """Replace nulls in source_column. 
@@ -56,28 +64,23 @@ cpdef Column replace_nulls(Column source_column, ReplacementType replacement): if isinstance(replacement, ReplacePolicy): policy = replacement with nogil: - c_result = move( - cpp_replace.replace_nulls(source_column.view(), policy) - ) + c_result = cpp_replace.replace_nulls(source_column.view(), policy) return Column.from_libcudf(move(c_result)) else: raise TypeError("replacement must be a Column, Scalar, or replace_policy") with nogil: if ReplacementType is Column: - c_result = move( - cpp_replace.replace_nulls(source_column.view(), replacement.view()) + c_result = cpp_replace.replace_nulls( + source_column.view(), + replacement.view() ) elif ReplacementType is Scalar: - c_result = move( - cpp_replace.replace_nulls( - source_column.view(), dereference(replacement.c_obj) - ) + c_result = cpp_replace.replace_nulls( + source_column.view(), dereference(replacement.c_obj) ) elif ReplacementType is replace_policy: - c_result = move( - cpp_replace.replace_nulls(source_column.view(), replacement) - ) + c_result = cpp_replace.replace_nulls(source_column.view(), replacement) else: assert False, "Internal error. Please contact pylibcudf developers" return Column.from_libcudf(move(c_result)) @@ -109,12 +112,10 @@ cpdef Column find_and_replace_all( """ cdef unique_ptr[column] c_result with nogil: - c_result = move( - cpp_replace.find_and_replace_all( - source_column.view(), - values_to_replace.view(), - replacement_values.view(), - ) + c_result = cpp_replace.find_and_replace_all( + source_column.view(), + values_to_replace.view(), + replacement_values.view(), ) return Column.from_libcudf(move(c_result)) @@ -156,22 +157,18 @@ cpdef Column clamp( cdef unique_ptr[column] c_result with nogil: if lo_replace is None: - c_result = move( - cpp_replace.clamp( - source_column.view(), - dereference(lo.c_obj), - dereference(hi.c_obj), - ) + c_result = cpp_replace.clamp( + source_column.view(), + dereference(lo.c_obj), + dereference(hi.c_obj), ) else: - c_result = move( - cpp_replace.clamp( - source_column.view(), - dereference(lo.c_obj), - dereference(hi.c_obj), - dereference(lo_replace.c_obj), - dereference(hi_replace.c_obj), - ) + c_result = cpp_replace.clamp( + source_column.view(), + dereference(lo.c_obj), + dereference(hi.c_obj), + dereference(lo_replace.c_obj), + dereference(hi_replace.c_obj), ) return Column.from_libcudf(move(c_result)) @@ -199,9 +196,7 @@ cpdef Column normalize_nans_and_zeros(Column source_column, bool inplace=False): if inplace: cpp_replace.normalize_nans_and_zeros(source_column.mutable_view()) else: - c_result = move( - cpp_replace.normalize_nans_and_zeros(source_column.view()) - ) + c_result = cpp_replace.normalize_nans_and_zeros(source_column.view()) if not inplace: return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/reshape.pyi b/python/pylibcudf/pylibcudf/reshape.pyi new file mode 100644 index 00000000000..d8d0ffcc3e0 --- /dev/null +++ b/python/pylibcudf/pylibcudf/reshape.pyi @@ -0,0 +1,7 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.table import Table + +def interleave_columns(source_table: Table) -> Column: ... +def tile(source_table: Table, count: int) -> Table: ... 
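A short sketch of the two reshape entry points typed above (illustrative; assumes pyarrow for host data):

import pyarrow as pa
import pylibcudf as plc

tbl = plc.interop.from_arrow(pa.table({"a": [1, 2], "b": [3, 4]}))

# interleave_columns emits each row's values column-by-column: [1, 3, 2, 4].
flat = plc.reshape.interleave_columns(tbl)

# tile repeats the whole table `count` times, giving 4 rows here.
doubled = plc.reshape.tile(tbl, 2)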
diff --git a/python/pylibcudf/pylibcudf/reshape.pyx b/python/pylibcudf/pylibcudf/reshape.pyx index eb1499ebbea..bdc212a1985 100644 --- a/python/pylibcudf/pylibcudf/reshape.pyx +++ b/python/pylibcudf/pylibcudf/reshape.pyx @@ -13,6 +13,7 @@ from pylibcudf.libcudf.types cimport size_type from .column cimport Column from .table cimport Table +__all__ = ["interleave_columns", "tile"] cpdef Column interleave_columns(Table source_table): """Interleave columns of a table into a single column. @@ -38,7 +39,7 @@ cpdef Column interleave_columns(Table source_table): cdef unique_ptr[column] c_result with nogil: - c_result = move(cpp_interleave_columns(source_table.view())) + c_result = cpp_interleave_columns(source_table.view()) return Column.from_libcudf(move(c_result)) @@ -63,6 +64,6 @@ cpdef Table tile(Table source_table, size_type count): cdef unique_ptr[table] c_result with nogil: - c_result = move(cpp_tile(source_table.view(), count)) + c_result = cpp_tile(source_table.view(), count) return Table.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/rolling.pyi b/python/pylibcudf/pylibcudf/rolling.pyi new file mode 100644 index 00000000000..ca0111e01ec --- /dev/null +++ b/python/pylibcudf/pylibcudf/rolling.pyi @@ -0,0 +1,12 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.aggregation import Aggregation +from pylibcudf.column import Column + +def rolling_window[WindowType: (Column, int)]( + source: Column, + preceding_window: WindowType, + following_window: WindowType, + min_periods: int, + agg: Aggregation, +) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/rolling.pyx b/python/pylibcudf/pylibcudf/rolling.pyx index a46540d7ffa..11acf57ccf4 100644 --- a/python/pylibcudf/pylibcudf/rolling.pyx +++ b/python/pylibcudf/pylibcudf/rolling.pyx @@ -11,6 +11,7 @@ from pylibcudf.libcudf.types cimport size_type from .aggregation cimport Aggregation from .column cimport Column +__all__ = ["rolling_window"] cpdef Column rolling_window( Column source, @@ -49,24 +50,21 @@ cpdef Column rolling_window( cdef const rolling_aggregation *c_agg = agg.view_underlying_as_rolling() if WindowType is Column: with nogil: - result = move( - cpp_rolling.rolling_window( - source.view(), - preceding_window.view(), - following_window.view(), - min_periods, - dereference(c_agg), - ) + result = cpp_rolling.rolling_window( + source.view(), + preceding_window.view(), + following_window.view(), + min_periods, + dereference(c_agg), ) else: with nogil: - result = move( - cpp_rolling.rolling_window( - source.view(), - preceding_window, - following_window, - min_periods, - dereference(c_agg), - ) + result = cpp_rolling.rolling_window( + source.view(), + preceding_window, + following_window, + min_periods, + dereference(c_agg), ) + return Column.from_libcudf(move(result)) diff --git a/python/pylibcudf/pylibcudf/round.pyi b/python/pylibcudf/pylibcudf/round.pyi new file mode 100644 index 00000000000..410cf5de586 --- /dev/null +++ b/python/pylibcudf/pylibcudf/round.pyi @@ -0,0 +1,15 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from enum import IntEnum + +from pylibcudf.column import Column + +class RoundingMethod(IntEnum): + HALF_UP = ... + HALF_EVEN = ... + +def round( + source: Column, + decimal_places: int = 0, + round_method: RoundingMethod = RoundingMethod.HALF_UP, +) -> Column: ... 
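A sketch of the rounding modes declared above (illustrative): HALF_UP rounds ties away from zero, HALF_EVEN to the nearest even digit.

import pyarrow as pa
import pylibcudf as plc

col = plc.interop.from_arrow(pa.array([2.5, 3.5, -2.5]))

# Banker's rounding: 2.5 -> 2.0, 3.5 -> 4.0, -2.5 -> -2.0.
rounded = plc.round.round(
    col,
    decimal_places=0,
    round_method=plc.round.RoundingMethod.HALF_EVEN,
)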
diff --git a/python/pylibcudf/pylibcudf/round.pyx b/python/pylibcudf/pylibcudf/round.pyx index dc60d53b07e..09e5a9cc3bc 100644 --- a/python/pylibcudf/pylibcudf/round.pyx +++ b/python/pylibcudf/pylibcudf/round.pyx @@ -11,6 +11,7 @@ from pylibcudf.libcudf.column.column cimport column from .column cimport Column +__all__ = ["RoundingMethod", "round"] cpdef Column round( Column source, @@ -39,12 +40,10 @@ cpdef Column round( """ cdef unique_ptr[column] c_result with nogil: - c_result = move( - cpp_round( - source.view(), - decimal_places, - round_method - ) + c_result = cpp_round( + source.view(), + decimal_places, + round_method ) return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/scalar.pxd b/python/pylibcudf/pylibcudf/scalar.pxd index 8664dfa4b7e..a273647c98d 100644 --- a/python/pylibcudf/pylibcudf/scalar.pxd +++ b/python/pylibcudf/pylibcudf/scalar.pxd @@ -4,7 +4,7 @@ from libcpp cimport bool from libcpp.memory cimport unique_ptr from pylibcudf.libcudf.scalar.scalar cimport scalar -from rmm._lib.memory_resource cimport DeviceMemoryResource +from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource from .column cimport Column from .types cimport DataType diff --git a/python/pylibcudf/pylibcudf/scalar.pyi b/python/pylibcudf/pylibcudf/scalar.pyi new file mode 100644 index 00000000000..0b72b10ef86 --- /dev/null +++ b/python/pylibcudf/pylibcudf/scalar.pyi @@ -0,0 +1,10 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.types import DataType + +class Scalar: + def type(self) -> DataType: ... + def is_valid(self) -> bool: ... + @staticmethod + def empty_like(column: Column) -> Scalar: ... diff --git a/python/pylibcudf/pylibcudf/scalar.pyx b/python/pylibcudf/pylibcudf/scalar.pyx index 3e20938af0c..1ac014e891e 100644 --- a/python/pylibcudf/pylibcudf/scalar.pyx +++ b/python/pylibcudf/pylibcudf/scalar.pyx @@ -6,11 +6,13 @@ from libcpp.utility cimport move from pylibcudf.libcudf.scalar.scalar cimport scalar from pylibcudf.libcudf.scalar.scalar_factories cimport make_empty_scalar_like -from rmm._lib.memory_resource cimport get_current_device_resource +from rmm.pylibrmm.memory_resource cimport get_current_device_resource from .column cimport Column from .types cimport DataType +__all__ = ["Scalar"] + # The DeviceMemoryResource attribute could be released prematurely # by the gc if the Scalar is in a reference cycle. Removing the tp_clear @@ -37,6 +39,8 @@ cdef class Scalar: # DeviceScalar. raise ValueError("Scalar should be constructed with a factory") + __hash__ = None + cdef const scalar* get(self) noexcept nogil: return self.c_obj.get() diff --git a/python/pylibcudf/pylibcudf/search.pyi b/python/pylibcudf/pylibcudf/search.pyi new file mode 100644 index 00000000000..7f292b129b2 --- /dev/null +++ b/python/pylibcudf/pylibcudf/search.pyi @@ -0,0 +1,19 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.table import Table +from pylibcudf.types import NullOrder, Order + +def lower_bound( + haystack: Table, + needles: Table, + column_order: list[Order], + null_precedence: list[NullOrder], +) -> Column: ... +def upper_bound( + haystack: Table, + needles: Table, + column_order: list[Order], + null_precedence: list[NullOrder], +) -> Column: ... +def contains(haystack: Column, needles: Column) -> Column: ... 
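A sketch of the search stubs above (illustrative). The haystack must already be sorted consistently with column_order and null_precedence:

import pyarrow as pa
import pylibcudf as plc

haystack = plc.interop.from_arrow(pa.table({"k": [10, 20, 30]}))
needles = plc.interop.from_arrow(pa.table({"k": [20, 25]}))

# First valid insertion position per needle: [1, 2].
idx = plc.search.lower_bound(
    haystack,
    needles,
    [plc.types.Order.ASCENDING],
    [plc.types.NullOrder.BEFORE],
)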
diff --git a/python/pylibcudf/pylibcudf/search.pyx b/python/pylibcudf/pylibcudf/search.pyx index 814bc6553d8..50353fcd0cc 100644 --- a/python/pylibcudf/pylibcudf/search.pyx +++ b/python/pylibcudf/pylibcudf/search.pyx @@ -10,6 +10,7 @@ from pylibcudf.libcudf.types cimport null_order, order from .column cimport Column from .table cimport Table +__all__ = ["contains", "lower_bound", "upper_bound"] cpdef Column lower_bound( Table haystack, @@ -41,13 +42,11 @@ cpdef Column lower_bound( cdef vector[order] c_orders = column_order cdef vector[null_order] c_null_precedence = null_precedence with nogil: - c_result = move( - cpp_search.lower_bound( - haystack.view(), - needles.view(), - c_orders, - c_null_precedence, - ) + c_result = cpp_search.lower_bound( + haystack.view(), + needles.view(), + c_orders, + c_null_precedence, ) return Column.from_libcudf(move(c_result)) @@ -82,13 +81,11 @@ cpdef Column upper_bound( cdef vector[order] c_orders = column_order cdef vector[null_order] c_null_precedence = null_precedence with nogil: - c_result = move( - cpp_search.upper_bound( - haystack.view(), - needles.view(), - c_orders, - c_null_precedence, - ) + c_result = cpp_search.upper_bound( + haystack.view(), + needles.view(), + c_orders, + c_null_precedence, ) return Column.from_libcudf(move(c_result)) @@ -112,10 +109,8 @@ cpdef Column contains(Column haystack, Column needles): """ cdef unique_ptr[column] c_result with nogil: - c_result = move( - cpp_search.contains( - haystack.view(), - needles.view(), - ) + c_result = cpp_search.contains( + haystack.view(), + needles.view(), ) return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/sorting.pyi b/python/pylibcudf/pylibcudf/sorting.pyi new file mode 100644 index 00000000000..5255d869a4d --- /dev/null +++ b/python/pylibcudf/pylibcudf/sorting.pyi @@ -0,0 +1,64 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.aggregation import RankMethod +from pylibcudf.column import Column +from pylibcudf.table import Table +from pylibcudf.types import NullOrder, NullPolicy, Order + +def sorted_order( + source_table: Table, + column_order: list[Order], + null_precedence: list[NullOrder], +) -> Column: ... +def stable_sorted_order( + source_table: Table, + column_order: list[Order], + null_precedence: list[NullOrder], +) -> Column: ... +def rank( + input_view: Column, + method: RankMethod, + column_order: Order, + null_handling: NullPolicy, + null_precedence: NullOrder, + percentage: bool, +) -> Column: ... +def is_sorted( + tbl: Table, column_order: list[Order], null_precedence: list[NullOrder] +) -> bool: ... +def segmented_sort_by_key( + values: Table, + keys: Table, + segment_offsets: Column, + column_order: list[Order], + null_precedence: list[NullOrder], +) -> Table: ... +def stable_segmented_sort_by_key( + values: Table, + keys: Table, + segment_offsets: Column, + column_order: list[Order], + null_precedence: list[NullOrder], +) -> Table: ... +def sort_by_key( + values: Table, + keys: Table, + column_order: list[Order], + null_precedence: list[NullOrder], +) -> Table: ... +def stable_sort_by_key( + values: Table, + keys: Table, + column_order: list[Order], + null_precedence: list[NullOrder], +) -> Table: ... +def sort( + source_table: Table, + column_order: list[Order], + null_precedence: list[NullOrder], +) -> Table: ... +def stable_sort( + source_table: Table, + column_order: list[Order], + null_precedence: list[NullOrder], +) -> Table: ... 
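A sketch relating the two main sorting entry points typed above (illustrative):

import pyarrow as pa
import pylibcudf as plc

tbl = plc.interop.from_arrow(pa.table({"x": [3, None, 1]}))
orders = [plc.types.Order.ASCENDING]
null_prec = [plc.types.NullOrder.BEFORE]

# sorted_order returns a gather map ([1, 2, 0] here: null, then 1, then 3);
# sort materializes the reordered table directly.
gather_map = plc.sorting.sorted_order(tbl, orders, null_prec)
sorted_tbl = plc.sorting.sort(tbl, orders, null_prec)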
diff --git a/python/pylibcudf/pylibcudf/sorting.pyx b/python/pylibcudf/pylibcudf/sorting.pyx index 42289d54bca..fb29ef8c571 100644 --- a/python/pylibcudf/pylibcudf/sorting.pyx +++ b/python/pylibcudf/pylibcudf/sorting.pyx @@ -12,6 +12,18 @@ from pylibcudf.libcudf.types cimport null_order, null_policy, order from .column cimport Column from .table cimport Table +__all__ = [ + "is_sorted", + "rank", + "segmented_sort_by_key", + "sort", + "sort_by_key", + "sorted_order", + "stable_segmented_sort_by_key", + "stable_sort", + "stable_sort_by_key", + "stable_sorted_order", +] cpdef Column sorted_order(Table source_table, list column_order, list null_precedence): """Computes the row indices required to sort the table. @@ -36,12 +48,10 @@ cpdef Column sorted_order(Table source_table, list column_order, list null_prece cdef vector[order] c_orders = column_order cdef vector[null_order] c_null_precedence = null_precedence with nogil: - c_result = move( - cpp_sorting.sorted_order( - source_table.view(), - c_orders, - c_null_precedence, - ) + c_result = cpp_sorting.sorted_order( + source_table.view(), + c_orders, + c_null_precedence, ) return Column.from_libcudf(move(c_result)) @@ -74,12 +84,10 @@ cpdef Column stable_sorted_order( cdef vector[order] c_orders = column_order cdef vector[null_order] c_null_precedence = null_precedence with nogil: - c_result = move( - cpp_sorting.stable_sorted_order( - source_table.view(), - c_orders, - c_null_precedence, - ) + c_result = cpp_sorting.stable_sorted_order( + source_table.view(), + c_orders, + c_null_precedence, ) return Column.from_libcudf(move(c_result)) @@ -118,15 +126,13 @@ cpdef Column rank( """ cdef unique_ptr[column] c_result with nogil: - c_result = move( - cpp_sorting.rank( - input_view.view(), - method, - column_order, - null_handling, - null_precedence, - percentage, - ) + c_result = cpp_sorting.rank( + input_view.view(), + method, + column_order, + null_handling, + null_precedence, + percentage, ) return Column.from_libcudf(move(c_result)) @@ -154,12 +160,10 @@ cpdef bool is_sorted(Table tbl, list column_order, list null_precedence): cdef vector[order] c_orders = column_order cdef vector[null_order] c_null_precedence = null_precedence with nogil: - c_result = move( - cpp_sorting.is_sorted( - tbl.view(), - c_orders, - c_null_precedence, - ) + c_result = cpp_sorting.is_sorted( + tbl.view(), + c_orders, + c_null_precedence, ) return c_result @@ -197,14 +201,12 @@ cpdef Table segmented_sort_by_key( cdef vector[order] c_orders = column_order cdef vector[null_order] c_null_precedence = null_precedence with nogil: - c_result = move( - cpp_sorting.segmented_sort_by_key( - values.view(), - keys.view(), - segment_offsets.view(), - c_orders, - c_null_precedence, - ) + c_result = cpp_sorting.segmented_sort_by_key( + values.view(), + keys.view(), + segment_offsets.view(), + c_orders, + c_null_precedence, ) return Table.from_libcudf(move(c_result)) @@ -243,14 +245,12 @@ cpdef Table stable_segmented_sort_by_key( cdef vector[order] c_orders = column_order cdef vector[null_order] c_null_precedence = null_precedence with nogil: - c_result = move( - cpp_sorting.stable_segmented_sort_by_key( - values.view(), - keys.view(), - segment_offsets.view(), - c_orders, - c_null_precedence, - ) + c_result = cpp_sorting.stable_segmented_sort_by_key( + values.view(), + keys.view(), + segment_offsets.view(), + c_orders, + c_null_precedence, ) return Table.from_libcudf(move(c_result)) @@ -285,13 +285,11 @@ cpdef Table sort_by_key( cdef vector[order] c_orders = column_order cdef 
vector[null_order] c_null_precedence = null_precedence with nogil: - c_result = move( - cpp_sorting.sort_by_key( - values.view(), - keys.view(), - c_orders, - c_null_precedence, - ) + c_result = cpp_sorting.sort_by_key( + values.view(), + keys.view(), + c_orders, + c_null_precedence, ) return Table.from_libcudf(move(c_result)) @@ -326,13 +324,11 @@ cpdef Table stable_sort_by_key( cdef vector[order] c_orders = column_order cdef vector[null_order] c_null_precedence = null_precedence with nogil: - c_result = move( - cpp_sorting.stable_sort_by_key( - values.view(), - keys.view(), - c_orders, - c_null_precedence, - ) + c_result = cpp_sorting.stable_sort_by_key( + values.view(), + keys.view(), + c_orders, + c_null_precedence, ) return Table.from_libcudf(move(c_result)) @@ -360,12 +356,10 @@ cpdef Table sort(Table source_table, list column_order, list null_precedence): cdef vector[order] c_orders = column_order cdef vector[null_order] c_null_precedence = null_precedence with nogil: - c_result = move( - cpp_sorting.sort( - source_table.view(), - c_orders, - c_null_precedence, - ) + c_result = cpp_sorting.sort( + source_table.view(), + c_orders, + c_null_precedence, ) return Table.from_libcudf(move(c_result)) @@ -393,11 +387,9 @@ cpdef Table stable_sort(Table source_table, list column_order, list null_precede cdef vector[order] c_orders = column_order cdef vector[null_order] c_null_precedence = null_precedence with nogil: - c_result = move( - cpp_sorting.stable_sort( - source_table.view(), - c_orders, - c_null_precedence, - ) + c_result = cpp_sorting.stable_sort( + source_table.view(), + c_orders, + c_null_precedence, ) return Table.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/stream_compaction.pxd b/python/pylibcudf/pylibcudf/stream_compaction.pxd index a4f39792f0c..a20a23e2e58 100644 --- a/python/pylibcudf/pylibcudf/stream_compaction.pxd +++ b/python/pylibcudf/pylibcudf/stream_compaction.pxd @@ -17,6 +17,8 @@ cpdef Table drop_nulls(Table source_table, list keys, size_type keep_threshold) cpdef Table drop_nans(Table source_table, list keys, size_type keep_threshold) +cpdef Table apply_boolean_mask(Table source_table, Column boolean_mask) + cpdef Table unique( Table input, list keys, diff --git a/python/pylibcudf/pylibcudf/stream_compaction.pyi b/python/pylibcudf/pylibcudf/stream_compaction.pyi new file mode 100644 index 00000000000..99cade48309 --- /dev/null +++ b/python/pylibcudf/pylibcudf/stream_compaction.pyi @@ -0,0 +1,53 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from enum import IntEnum + +from pylibcudf.column import Column +from pylibcudf.table import Table +from pylibcudf.types import NanEquality, NanPolicy, NullEquality, NullPolicy + +class DuplicateKeepOption(IntEnum): + KEEP_ANY = ... + KEEP_FIRST = ... + KEEP_LAST = ... + KEEP_NONE = ... + +def drop_nulls( + source_table: Table, keys: list[int], keep_threshold: int +) -> Table: ... +def drop_nans( + source_table: Table, keys: list[int], keep_threshold: int +) -> Table: ... +def apply_boolean_mask(source_table: Table, boolean_mask: Column) -> Table: ... +def unique( + input: Table, + keys: list[int], + keep: DuplicateKeepOption, + nulls_equal: NullEquality, +) -> Table: ... +def distinct( + input: Table, + keys: list[int], + keep: DuplicateKeepOption, + nulls_equal: NullEquality, + nans_equal: NanEquality, +) -> Table: ... +def distinct_indices( + input: Table, + keep: DuplicateKeepOption, + nulls_equal: NullEquality, + nans_equal: NanEquality, +) -> Column: ... 
+def stable_distinct( + input: Table, + keys: list[int], + keep: DuplicateKeepOption, + nulls_equal: NullEquality, + nans_equal: NanEquality, +) -> Table: ... +def unique_count( + source: Column, null_handling: NullPolicy, nan_handling: NanPolicy +) -> int: ... +def distinct_count( + source: Column, null_handling: NullPolicy, nan_handling: NanPolicy +) -> int: ... diff --git a/python/pylibcudf/pylibcudf/stream_compaction.pyx b/python/pylibcudf/pylibcudf/stream_compaction.pyx index d5475ea79d5..6e403ca1b07 100644 --- a/python/pylibcudf/pylibcudf/stream_compaction.pyx +++ b/python/pylibcudf/pylibcudf/stream_compaction.pyx @@ -21,6 +21,18 @@ from pylibcudf.libcudf.stream_compaction import \ from .column cimport Column from .table cimport Table +__all__ = [ + "DuplicateKeepOption", + "apply_boolean_mask", + "distinct", + "distinct_count", + "distinct_indices", + "drop_nans", + "drop_nulls", + "stable_distinct", + "unique", + "unique_count", +] cpdef Table drop_nulls(Table source_table, list keys, size_type keep_threshold): """Filters out rows from the input table based on the presence of nulls. @@ -44,10 +56,8 @@ cpdef Table drop_nulls(Table source_table, list keys, size_type keep_threshold): cdef unique_ptr[table] c_result cdef vector[size_type] c_keys = keys with nogil: - c_result = move( - cpp_stream_compaction.drop_nulls( - source_table.view(), c_keys, keep_threshold - ) + c_result = cpp_stream_compaction.drop_nulls( + source_table.view(), c_keys, keep_threshold ) return Table.from_libcudf(move(c_result)) @@ -74,10 +84,8 @@ cpdef Table drop_nans(Table source_table, list keys, size_type keep_threshold): cdef unique_ptr[table] c_result cdef vector[size_type] c_keys = keys with nogil: - c_result = move( - cpp_stream_compaction.drop_nulls( - source_table.view(), c_keys, keep_threshold - ) + c_result = cpp_stream_compaction.drop_nulls( + source_table.view(), c_keys, keep_threshold ) return Table.from_libcudf(move(c_result)) @@ -101,10 +109,8 @@ cpdef Table apply_boolean_mask(Table source_table, Column boolean_mask): """ cdef unique_ptr[table] c_result with nogil: - c_result = move( - cpp_stream_compaction.apply_boolean_mask( - source_table.view(), boolean_mask.view() - ) + c_result = cpp_stream_compaction.apply_boolean_mask( + source_table.view(), boolean_mask.view() ) return Table.from_libcudf(move(c_result)) @@ -144,10 +150,8 @@ cpdef Table unique( cdef unique_ptr[table] c_result cdef vector[size_type] c_keys = keys with nogil: - c_result = move( - cpp_stream_compaction.unique( - input.view(), c_keys, keep, nulls_equal - ) + c_result = cpp_stream_compaction.unique( + input.view(), c_keys, keep, nulls_equal ) return Table.from_libcudf(move(c_result)) @@ -185,10 +189,8 @@ cpdef Table distinct( cdef unique_ptr[table] c_result cdef vector[size_type] c_keys = keys with nogil: - c_result = move( - cpp_stream_compaction.distinct( - input.view(), c_keys, keep, nulls_equal, nans_equal - ) + c_result = cpp_stream_compaction.distinct( + input.view(), c_keys, keep, nulls_equal, nans_equal ) return Table.from_libcudf(move(c_result)) @@ -221,10 +223,8 @@ cpdef Column distinct_indices( """ cdef unique_ptr[column] c_result with nogil: - c_result = move( - cpp_stream_compaction.distinct_indices( - input.view(), keep, nulls_equal, nans_equal - ) + c_result = cpp_stream_compaction.distinct_indices( + input.view(), keep, nulls_equal, nans_equal ) return Column.from_libcudf(move(c_result)) @@ -262,10 +262,8 @@ cpdef Table stable_distinct( cdef unique_ptr[table] c_result cdef vector[size_type] c_keys = keys with 
nogil: - c_result = move( - cpp_stream_compaction.stable_distinct( - input.view(), c_keys, keep, nulls_equal, nans_equal - ) + c_result = cpp_stream_compaction.stable_distinct( + input.view(), c_keys, keep, nulls_equal, nans_equal ) return Table.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/CMakeLists.txt b/python/pylibcudf/pylibcudf/strings/CMakeLists.txt index 142bc124ca2..5d7fbd24b91 100644 --- a/python/pylibcudf/pylibcudf/strings/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/strings/CMakeLists.txt @@ -18,16 +18,22 @@ set(cython_sources case.pyx char_types.pyx contains.pyx + combine.pyx extract.pyx find.pyx + find_multiple.pyx findall.pyx + padding.pyx regex_flags.pyx regex_program.pyx repeat.pyx replace.pyx + replace_re.pyx side_type.pyx slice.pyx strip.pyx + translate.pyx + wrap.pyx ) set(linked_libraries cudf::cudf) @@ -38,3 +44,4 @@ rapids_cython_create_modules( ) add_subdirectory(convert) +add_subdirectory(split) diff --git a/python/pylibcudf/pylibcudf/strings/__init__.pxd b/python/pylibcudf/pylibcudf/strings/__init__.pxd index d8afccc7336..da1c1c576c0 100644 --- a/python/pylibcudf/pylibcudf/strings/__init__.pxd +++ b/python/pylibcudf/pylibcudf/strings/__init__.pxd @@ -5,16 +5,25 @@ from . cimport ( capitalize, case, char_types, + combine, contains, convert, extract, find, + find_multiple, findall, + padding, regex_flags, regex_program, + repeat, replace, + replace_re, + side_type, slice, + split, strip, + translate, + wrap, ) from .side_type cimport side_type @@ -27,11 +36,18 @@ __all__ = [ "convert", "extract", "find", + "find_multiple", "findall", + "padding", "regex_flags", "regex_program", + "repeat", "replace", + "replace_re", "slice", "strip", + "split", "side_type", + "translate", + "wrap", ] diff --git a/python/pylibcudf/pylibcudf/strings/__init__.py b/python/pylibcudf/pylibcudf/strings/__init__.py index 22452812e42..67054f0b447 100644 --- a/python/pylibcudf/pylibcudf/strings/__init__.py +++ b/python/pylibcudf/pylibcudf/strings/__init__.py @@ -5,34 +5,50 @@ capitalize, case, char_types, + combine, contains, convert, extract, find, + find_multiple, findall, + padding, regex_flags, regex_program, repeat, replace, + replace_re, + side_type, slice, + split, strip, + translate, + wrap, ) from .side_type import SideType __all__ = [ + "SideType", "attributes", "capitalize", "case", "char_types", + "combine", "contains", "convert", "extract", "find", + "find_multiple", "findall", + "padding", "regex_flags", "regex_program", + "repeat", "replace", + "replace_re", "slice", + "split", "strip", - "SideType", + "translate", + "wrap", ] diff --git a/python/pylibcudf/pylibcudf/strings/attributes.pyi b/python/pylibcudf/pylibcudf/strings/attributes.pyi new file mode 100644 index 00000000000..7fd5c9773d4 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/attributes.pyi @@ -0,0 +1,7 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column + +def count_characters(source_strings: Column) -> Column: ... +def count_bytes(source_strings: Column) -> Column: ... +def code_points(source_strings: Column) -> Column: ... 
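A sketch of the strings.attributes stubs above (illustrative); characters are counted in code points while bytes count the UTF-8 encoding:

import pyarrow as pa
import pylibcudf as plc

s = plc.interop.from_arrow(pa.array(["héllo", "", None]))

nchars = plc.strings.attributes.count_characters(s)  # [5, 0, null]
nbytes = plc.strings.attributes.count_bytes(s)       # [6, 0, null]; "é" is 2 bytes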
diff --git a/python/pylibcudf/pylibcudf/strings/attributes.pyx b/python/pylibcudf/pylibcudf/strings/attributes.pyx index 36bee7bd1d9..f1eb09b4965 100644 --- a/python/pylibcudf/pylibcudf/strings/attributes.pyx +++ b/python/pylibcudf/pylibcudf/strings/attributes.pyx @@ -6,6 +6,7 @@ from pylibcudf.column cimport Column from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.strings cimport attributes as cpp_attributes +__all__ = ["code_points", "count_bytes", "count_characters"] cpdef Column count_characters(Column source_strings): """ @@ -25,7 +26,7 @@ cpdef Column count_characters(Column source_strings): cdef unique_ptr[column] c_result with nogil: - c_result = move(cpp_attributes.count_characters(source_strings.view())) + c_result = cpp_attributes.count_characters(source_strings.view()) return Column.from_libcudf(move(c_result)) @@ -48,7 +49,7 @@ cpdef Column count_bytes(Column source_strings): cdef unique_ptr[column] c_result with nogil: - c_result = move(cpp_attributes.count_bytes(source_strings.view())) + c_result = cpp_attributes.count_bytes(source_strings.view()) return Column.from_libcudf(move(c_result)) @@ -71,6 +72,6 @@ cpdef Column code_points(Column source_strings): cdef unique_ptr[column] c_result with nogil: - c_result = move(cpp_attributes.code_points(source_strings.view())) + c_result = cpp_attributes.code_points(source_strings.view()) return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/capitalize.pyi b/python/pylibcudf/pylibcudf/strings/capitalize.pyi new file mode 100644 index 00000000000..5c6689418e2 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/capitalize.pyi @@ -0,0 +1,12 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar +from pylibcudf.strings.char_types import StringCharacterTypes + +def capitalize(input: Column, delimiters: Scalar | None = None) -> Column: ... +def title( + input: Column, + sequence_type: StringCharacterTypes = StringCharacterTypes.ALPHA, +) -> Column: ... +def is_title(input: Column) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/capitalize.pyx b/python/pylibcudf/pylibcudf/strings/capitalize.pyx index 06b991c3cf1..a54480b8e4a 100644 --- a/python/pylibcudf/pylibcudf/strings/capitalize.pyx +++ b/python/pylibcudf/pylibcudf/strings/capitalize.pyx @@ -14,6 +14,7 @@ from pylibcudf.strings.char_types cimport string_character_types from cython.operator import dereference +__all__ = ["capitalize", "is_title", "title"] cpdef Column capitalize( Column input, diff --git a/python/pylibcudf/pylibcudf/strings/case.pyi b/python/pylibcudf/pylibcudf/strings/case.pyi new file mode 100644 index 00000000000..4e50db4d1da --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/case.pyi @@ -0,0 +1,7 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column + +def to_lower(input: Column) -> Column: ... +def to_upper(input: Column) -> Column: ... +def swapcase(input: Column) -> Column: ... 
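A sketch of the case stubs above (illustrative):

import pyarrow as pa
import pylibcudf as plc

s = plc.interop.from_arrow(pa.array(["Hello", "WORLD"]))

lowered = plc.strings.case.to_lower(s)   # ["hello", "world"]
swapped = plc.strings.case.swapcase(s)   # ["hELLO", "world"]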
diff --git a/python/pylibcudf/pylibcudf/strings/case.pyx b/python/pylibcudf/pylibcudf/strings/case.pyx index 9e6cd7717d3..d0e054bef72 100644 --- a/python/pylibcudf/pylibcudf/strings/case.pyx +++ b/python/pylibcudf/pylibcudf/strings/case.pyx @@ -6,6 +6,7 @@ from pylibcudf.column cimport Column from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.strings cimport case as cpp_case +__all__ = ["swapcase", "to_lower", "to_upper"] cpdef Column to_lower(Column input): cdef unique_ptr[column] c_result diff --git a/python/pylibcudf/pylibcudf/strings/char_types.pxd b/python/pylibcudf/pylibcudf/strings/char_types.pxd index ad4e4cf61d8..f9f7d244212 100644 --- a/python/pylibcudf/pylibcudf/strings/char_types.pxd +++ b/python/pylibcudf/pylibcudf/strings/char_types.pxd @@ -1,3 +1,19 @@ # Copyright (c) 2024, NVIDIA CORPORATION. +from pylibcudf.column cimport Column from pylibcudf.libcudf.strings.char_types cimport string_character_types +from pylibcudf.scalar cimport Scalar + + +cpdef Column all_characters_of_type( + Column source_strings, + string_character_types types, + string_character_types verify_types +) + +cpdef Column filter_characters_of_type( + Column source_strings, + string_character_types types_to_remove, + Scalar replacement, + string_character_types types_to_keep +) diff --git a/python/pylibcudf/pylibcudf/strings/char_types.pyi b/python/pylibcudf/pylibcudf/strings/char_types.pyi new file mode 100644 index 00000000000..daa36cbb68d --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/char_types.pyi @@ -0,0 +1,30 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from enum import IntEnum + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar + +class StringCharacterTypes(IntEnum): + DECIMAL = ... + NUMERIC = ... + DIGIT = ... + ALPHA = ... + SPACE = ... + UPPER = ... + LOWER = ... + ALPHANUM = ... + CASE_TYPES = ... + ALL_TYPES = ... + +def all_characters_of_type( + source_strings: Column, + types: StringCharacterTypes, + verify_types: StringCharacterTypes, +) -> Column: ... +def filter_characters_of_type( + source_strings: Column, + types_to_remove: StringCharacterTypes, + replacement: Scalar, + types_to_keep: StringCharacterTypes, +) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/char_types.pyx b/python/pylibcudf/pylibcudf/strings/char_types.pyx index e7621fb4d84..0af4a1f9c37 100644 --- a/python/pylibcudf/pylibcudf/strings/char_types.pyx +++ b/python/pylibcudf/pylibcudf/strings/char_types.pyx @@ -1,4 +1,94 @@ # Copyright (c) 2024, NVIDIA CORPORATION. +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.scalar.scalar cimport string_scalar +from pylibcudf.libcudf.strings cimport char_types as cpp_char_types +from pylibcudf.scalar cimport Scalar + +from cython.operator import dereference from pylibcudf.libcudf.strings.char_types import \ string_character_types as StringCharacterTypes # no-cython-lint + +__all__ = [ + "StringCharacterTypes", + "all_characters_of_type", + "filter_characters_of_type", +] + +cpdef Column all_characters_of_type( + Column source_strings, + string_character_types types, + string_character_types verify_types +): + """ + Identifies strings where all characters match the specified type. 
+ + Parameters + ---------- + source_strings : Column + Strings instance for this operation + types : StringCharacterTypes + The character types to check in each string + verify_types : StringCharacterTypes + Only verify against these character types. + + Returns + ------- + Column + New column of boolean results for each string + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_char_types.all_characters_of_type( + source_strings.view(), + types, + verify_types, + ) + + return Column.from_libcudf(move(c_result)) + +cpdef Column filter_characters_of_type( + Column source_strings, + string_character_types types_to_remove, + Scalar replacement, + string_character_types types_to_keep +): + """ + Filter specific character types from a column of strings. + + Parameters + ---------- + source_strings : Column + Strings instance for this operation + types_to_remove : StringCharacterTypes + The character types to check in each string. + replacement : Scalar + The replacement character to use when removing characters + types_to_keep : StringCharacterTypes + Default `ALL_TYPES` means all characters of `types_to_remove` + will be filtered. + + Returns + ------- + Column + New column with the specified characters filtered out and + replaced with the specified replacement string. + """ + cdef const string_scalar* c_replacement = ( + replacement.c_obj.get() + ) + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_char_types.filter_characters_of_type( + source_strings.view(), + types_to_remove, + dereference(c_replacement), + types_to_keep, + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/combine.pxd b/python/pylibcudf/pylibcudf/strings/combine.pxd new file mode 100644 index 00000000000..ea22f626973 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/combine.pxd @@ -0,0 +1,33 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column +from pylibcudf.libcudf.strings.combine cimport ( + output_if_empty_list, + separator_on_nulls, +) +from pylibcudf.scalar cimport Scalar +from pylibcudf.table cimport Table + +ctypedef fused ColumnOrScalar: + Column + Scalar + +cpdef Column concatenate( + Table strings_columns, + ColumnOrScalar separator, + Scalar narep=*, + Scalar col_narep=*, + separator_on_nulls separate_nulls=*, +) + +cpdef Column join_strings(Column input, Scalar separator, Scalar narep) + + +cpdef Column join_list_elements( + Column source_strings, + ColumnOrScalar separator, + Scalar separator_narep, + Scalar string_narep, + separator_on_nulls separate_nulls, + output_if_empty_list empty_list_policy, +) diff --git a/python/pylibcudf/pylibcudf/strings/combine.pyi b/python/pylibcudf/pylibcudf/strings/combine.pyi new file mode 100644 index 00000000000..3094b20f141 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/combine.pyi @@ -0,0 +1,34 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from enum import IntEnum + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar +from pylibcudf.table import Table + +class SeparatorOnNulls(IntEnum): + YES = ... + NO = ... + +class OutputIfEmptyList(IntEnum): + EMPTY_STRING = ... + NULL_ELEMENT = ... + +def concatenate( + strings_columns: Table, + separator: Column | Scalar, + narep: Scalar | None = None, + col_narep: Scalar | None = None, + separate_nulls: SeparatorOnNulls = SeparatorOnNulls.YES, +) -> Column: ... +def join_strings( + input: Column, separator: Scalar, narep: Scalar +) -> Column: ... 
+def join_list_elements( + lists_strings_column: Column, + separator: Column | Scalar, + separator_narep: Scalar, + string_narep: Scalar, + separate_nulls: SeparatorOnNulls, + empty_list_policy: OutputIfEmptyList, +) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/combine.pyx b/python/pylibcudf/pylibcudf/strings/combine.pyx new file mode 100644 index 00000000000..dc1e72c799b --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/combine.pyx @@ -0,0 +1,230 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.scalar.scalar cimport string_scalar +from pylibcudf.libcudf.scalar.scalar_factories cimport ( + make_string_scalar as cpp_make_string_scalar, +) +from pylibcudf.libcudf.strings cimport combine as cpp_combine +from pylibcudf.scalar cimport Scalar +from pylibcudf.table cimport Table + +from cython.operator import dereference +from pylibcudf.libcudf.strings.combine import \ + output_if_empty_list as OutputIfEmptyList # no-cython-lint +from pylibcudf.libcudf.strings.combine import \ + separator_on_nulls as SeparatorOnNulls # no-cython-lint + +__all__ = [ + "OutputIfEmptyList", + "SeparatorOnNulls", + "concatenate", + "join_list_elements", + "join_strings", +] + +cpdef Column concatenate( + Table strings_columns, + ColumnOrScalar separator, + Scalar narep=None, + Scalar col_narep=None, + separator_on_nulls separate_nulls=separator_on_nulls.YES, +): + """ + Concatenate all columns in the table horizontally into one new string + delimited by an optional separator string. + + Parameters + ---------- + strings_columns : Table + Strings for this operation + + separator : Column or Scalar + Separator(s) for a given row + + narep : Scalar + String to replace a null separator for a given row. + + col_narep : Scalar + String that should be used in place of any null strings found in any column. + An exception is raised when separator is a Scalar. + + separate_nulls : SeparatorOnNulls + If YES, then the separator is included for null rows. + + Returns + ------- + Column + New column with concatenated results + """ + cdef unique_ptr[column] c_result + cdef const string_scalar* c_col_narep + cdef const string_scalar* c_separator + + if narep is None: + narep = Scalar.from_libcudf( + cpp_make_string_scalar("".encode()) + ) + cdef const string_scalar* c_narep = ( + narep.c_obj.get() + ) + + if ColumnOrScalar is Column: + if col_narep is None: + col_narep = Scalar.from_libcudf( + cpp_make_string_scalar("".encode()) + ) + c_col_narep = ( + col_narep.c_obj.get() + ) + with nogil: + c_result = move( + cpp_combine.concatenate( + strings_columns.view(), + separator.view(), + dereference(c_narep), + dereference(c_col_narep), + separate_nulls + ) + ) + elif ColumnOrScalar is Scalar: + if col_narep is not None: + raise ValueError( + "col_narep cannot be specified when separator is a Scalar" + ) + c_separator = (separator.c_obj.get()) + with nogil: + c_result = move( + cpp_combine.concatenate( + strings_columns.view(), + dereference(c_separator), + dereference(c_narep), + separate_nulls + ) + ) + else: + raise ValueError("separator must be a Column or a Scalar") + return Column.from_libcudf(move(c_result)) + + +cpdef Column join_strings(Column input, Scalar separator, Scalar narep): + """ + Concatenates all strings in the column into one new string delimited + by an optional separator string. 
+
+    Parameters
+    ----------
+    input : Column
+        Strings column to concatenate into a single string
+
+    separator : Scalar
+        String that should be inserted between each string
+
+    narep : Scalar
+        String to replace any null strings found.
+
+    Returns
+    -------
+    Column
+        New column containing one string
+    """
+    cdef unique_ptr[column] c_result
+    cdef const string_scalar* c_separator = <const string_scalar*>(
+        separator.c_obj.get()
+    )
+    cdef const string_scalar* c_narep = <const string_scalar*>(
+        narep.c_obj.get()
+    )
+    with nogil:
+        c_result = move(
+            cpp_combine.join_strings(
+                input.view(),
+                dereference(c_separator),
+                dereference(c_narep),
+            )
+        )
+
+    return Column.from_libcudf(move(c_result))
+
+
+cpdef Column join_list_elements(
+    Column lists_strings_column,
+    ColumnOrScalar separator,
+    Scalar separator_narep,
+    Scalar string_narep,
+    separator_on_nulls separate_nulls,
+    output_if_empty_list empty_list_policy,
+):
+    """
+    Given a lists column of strings (each row is a list of strings),
+    concatenates the strings within each row and returns a single strings
+    column result.
+
+    Parameters
+    ----------
+    lists_strings_column : Column
+        Column containing lists of strings to concatenate
+
+    separator : Column or Scalar
+        String(s) that should be inserted between each string from each row.
+
+    separator_narep : Scalar
+        String that should be used to replace a null separator.
+
+    string_narep : Scalar
+        String to replace null strings in any non-null list row.
+        Ignored if separator is a Scalar.
+
+    separate_nulls : SeparatorOnNulls
+        If YES, then the separator is included for null rows
+        if `narep` is valid
+
+    empty_list_policy : OutputIfEmptyList
+        If set to EMPTY_STRING, any input row that is an empty
+        list will result in an empty string. Otherwise, it will
+        result in a null.
+
+    Returns
+    -------
+    Column
+        New strings column with concatenated results
+    """
+    cdef unique_ptr[column] c_result
+    cdef const string_scalar* c_separator_narep = <const string_scalar*>(
+        separator_narep.c_obj.get()
+    )
+    cdef const string_scalar* c_string_narep = <const string_scalar*>(
+        string_narep.c_obj.get()
+    )
+    cdef const string_scalar* c_separator
+
+    if ColumnOrScalar is Column:
+        with nogil:
+            c_result = move(
+                cpp_combine.join_list_elements(
+                    lists_strings_column.view(),
+                    separator.view(),
+                    dereference(c_separator_narep),
+                    dereference(c_string_narep),
+                    separate_nulls,
+                    empty_list_policy,
+                )
+            )
+    elif ColumnOrScalar is Scalar:
+        c_separator = <const string_scalar*>(separator.c_obj.get())
+        with nogil:
+            c_result = move(
+                cpp_combine.join_list_elements(
+                    lists_strings_column.view(),
+                    dereference(c_separator),
+                    dereference(c_separator_narep),
+                    separate_nulls,
+                    empty_list_policy,
+                )
+            )
+    else:
+        raise ValueError("separator must be a Column or a Scalar")
+    return Column.from_libcudf(move(c_result))
diff --git a/python/pylibcudf/pylibcudf/strings/contains.pyi b/python/pylibcudf/pylibcudf/strings/contains.pyi
new file mode 100644
index 00000000000..1f0620383b3
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/contains.pyi
@@ -0,0 +1,14 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.column import Column
+from pylibcudf.scalar import Scalar
+from pylibcudf.strings.regex_program import RegexProgram
+
+def contains_re(input: Column, prog: RegexProgram) -> Column: ...
+def count_re(input: Column, prog: RegexProgram) -> Column: ...
+def matches_re(input: Column, prog: RegexProgram) -> Column: ...
+def like(
+    input: Column,
+    pattern: Column | Scalar,
+    escape_character: Scalar | None = None,
+) -> Column: ...
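A sketch of contains_re with a compiled RegexProgram (illustrative; RegexProgram.create and RegexFlags are the existing pylibcudf entry points):

import pyarrow as pa
import pylibcudf as plc

s = plc.interop.from_arrow(pa.array(["abc123", "no digits"]))

prog = plc.strings.regex_program.RegexProgram.create(
    r"\d+", plc.strings.regex_flags.RegexFlags.DEFAULT
)

# Boolean column: [True, False].
has_digits = plc.strings.contains.contains_re(s, prog)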
diff --git a/python/pylibcudf/pylibcudf/strings/contains.pyx b/python/pylibcudf/pylibcudf/strings/contains.pyx index 82bd1fbea32..7b4c53ed853 100644 --- a/python/pylibcudf/pylibcudf/strings/contains.pyx +++ b/python/pylibcudf/pylibcudf/strings/contains.pyx @@ -12,6 +12,7 @@ from pylibcudf.libcudf.scalar.scalar_factories cimport ( from pylibcudf.libcudf.strings cimport contains as cpp_contains from pylibcudf.strings.regex_program cimport RegexProgram +__all__ = ["contains_re", "count_re", "like", "matches_re"] cpdef Column contains_re( Column input, @@ -38,10 +39,10 @@ cpdef Column contains_re( cdef unique_ptr[column] result with nogil: - result = move(cpp_contains.contains_re( + result = cpp_contains.contains_re( input.view(), prog.c_obj.get()[0] - )) + ) return Column.from_libcudf(move(result)) @@ -71,10 +72,10 @@ cpdef Column count_re( cdef unique_ptr[column] result with nogil: - result = move(cpp_contains.count_re( + result = cpp_contains.count_re( input.view(), prog.c_obj.get()[0] - )) + ) return Column.from_libcudf(move(result)) @@ -105,10 +106,10 @@ cpdef Column matches_re( cdef unique_ptr[column] result with nogil: - result = move(cpp_contains.matches_re( + result = cpp_contains.matches_re( input.view(), prog.c_obj.get()[0] - )) + ) return Column.from_libcudf(move(result)) @@ -149,19 +150,19 @@ cpdef Column like(Column input, ColumnOrScalar pattern, Scalar escape_character= if ColumnOrScalar is Column: with nogil: - result = move(cpp_contains.like( + result = cpp_contains.like( input.view(), pattern.view(), dereference(c_escape_character) - )) + ) elif ColumnOrScalar is Scalar: c_pattern = (pattern.c_obj.get()) with nogil: - result = move(cpp_contains.like( + result = cpp_contains.like( input.view(), dereference(c_pattern), dereference(c_escape_character) - )) + ) else: raise ValueError("pattern must be a Column or a Scalar") diff --git a/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt b/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt index 175c9b3738e..8ba84ba7d50 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt @@ -12,7 +12,10 @@ # the License. # ============================================================================= -set(cython_sources convert_durations.pyx convert_datetime.pyx) +set(cython_sources + convert_booleans.pyx convert_datetime.pyx convert_durations.pyx convert_fixed_point.pyx + convert_floats.pyx convert_integers.pyx convert_ipv4.pyx convert_lists.pyx convert_urls.pyx +) set(linked_libraries cudf::cudf) rapids_cython_create_modules( diff --git a/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd b/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd index 05324cb49df..85300936e4d 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd +++ b/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd @@ -1,2 +1,12 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from . cimport convert_datetime, convert_durations +from . cimport ( + convert_booleans, + convert_datetime, + convert_durations, + convert_fixed_point, + convert_floats, + convert_integers, + convert_ipv4, + convert_lists, + convert_urls, +) diff --git a/python/pylibcudf/pylibcudf/strings/convert/__init__.py b/python/pylibcudf/pylibcudf/strings/convert/__init__.py index d803399d53c..08b5034456e 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/__init__.py +++ b/python/pylibcudf/pylibcudf/strings/convert/__init__.py @@ -1,2 +1,24 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from . 
import convert_datetime, convert_durations +from . import ( + convert_booleans, + convert_datetime, + convert_durations, + convert_fixed_point, + convert_floats, + convert_integers, + convert_ipv4, + convert_lists, + convert_urls, +) + +__all__ = [ + "convert_booleans", + "convert_datetime", + "convert_durations", + "convert_fixed_point", + "convert_floats", + "convert_integers", + "convert_ipv4", + "convert_lists", + "convert_urls", +] diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pxd b/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pxd new file mode 100644 index 00000000000..312ac3c0ca0 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pxd @@ -0,0 +1,9 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column +from pylibcudf.scalar cimport Scalar + + +cpdef Column to_booleans(Column input, Scalar true_string) + +cpdef Column from_booleans(Column booleans, Scalar true_string, Scalar false_string) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pyi b/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pyi new file mode 100644 index 00000000000..77c09242e9a --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pyi @@ -0,0 +1,9 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar + +def to_booleans(input: Column, true_string: Scalar) -> Column: ... +def from_booleans( + booleans: Column, true_string: Scalar, false_string: Scalar +) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pyx new file mode 100644 index 00000000000..1899a3b27cc --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pyx @@ -0,0 +1,88 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.scalar.scalar cimport string_scalar +from pylibcudf.libcudf.strings.convert cimport ( + convert_booleans as cpp_convert_booleans, +) +from pylibcudf.scalar cimport Scalar + +from cython.operator import dereference + +__all__ = ["from_booleans", "to_booleans"] + +cpdef Column to_booleans(Column input, Scalar true_string): + """ + Returns a new bool column by parsing boolean values from the strings + in the provided strings column. + + For details, see :cpp:func:`cudf::strings::to_booleans`. + + Parameters + ---------- + input : Column + Strings instance for this operation + + true_string : Scalar + String to expect for true. Non-matching strings are false + + Returns + ------- + Column + New bool column converted from strings. + """ + cdef unique_ptr[column] c_result + cdef const string_scalar* c_true_string = ( + true_string.c_obj.get() + ) + + with nogil: + c_result = cpp_convert_booleans.to_booleans( + input.view(), + dereference(c_true_string) + ) + + return Column.from_libcudf(move(c_result)) + +cpdef Column from_booleans(Column booleans, Scalar true_string, Scalar false_string): + """ + Returns a new strings column converting the boolean values from the + provided column into strings. + + For details, see :cpp:func:`cudf::strings::from_booleans`. + + Parameters + ---------- + booleans : Column + Boolean column to convert. + + true_string : Scalar + String to use for true in the output column. 
+ + false_string : Scalar + String to use for false in the output column. + + Returns + ------- + Column + New strings column. + """ + cdef unique_ptr[column] c_result + cdef const string_scalar* c_true_string = ( + true_string.c_obj.get() + ) + cdef const string_scalar* c_false_string = ( + false_string.c_obj.get() + ) + + with nogil: + c_result = cpp_convert_booleans.from_booleans( + booleans.view(), + dereference(c_true_string), + dereference(c_false_string), + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pxd b/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pxd index 07c84d263d6..80ec168644b 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pxd +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pxd @@ -8,11 +8,16 @@ from pylibcudf.types cimport DataType cpdef Column to_timestamps( Column input, DataType timestamp_type, - const string& format + str format ) cpdef Column from_timestamps( - Column input, - const string& format, + Column timestamps, + str format, Column input_strings_names ) + +cpdef Column is_timestamp( + Column input, + str format, +) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyi b/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyi new file mode 100644 index 00000000000..c6857169765 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyi @@ -0,0 +1,12 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.types import DataType + +def to_timestamps( + input: Column, timestamp_type: DataType, format: str +) -> Column: ... +def from_timestamps( + timestamps: Column, format: str, input_strings_names: Column +) -> Column: ... +def is_timestamp(input: Column, format: str) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyx index fcacb096f87..f1cd684166c 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyx +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyx @@ -11,32 +11,79 @@ from pylibcudf.libcudf.strings.convert cimport ( from pylibcudf.types import DataType +__all__ = ["from_timestamps", "is_timestamp", "to_timestamps"] cpdef Column to_timestamps( Column input, DataType timestamp_type, - const string& format + str format ): + """ + Returns a new timestamp column converting a strings column into + timestamps using the provided format pattern. + + For details, see cpp:`cudf::strings::to_timestamps`. + + Parameters + ---------- + input : Column + Strings instance for this operation. + + timestamp_type : DataType + The timestamp type used for creating the output column. + + format : str + String specifying the timestamp format in strings. + + Returns + ------- + Column + New datetime column + """ cdef unique_ptr[column] c_result + cdef string c_format = format.encode() with nogil: c_result = cpp_convert_datetime.to_timestamps( input.view(), timestamp_type.c_obj, - format + c_format ) return Column.from_libcudf(move(c_result)) cpdef Column from_timestamps( - Column input, - const string& format, + Column timestamps, + str format, Column input_strings_names ): + """ + Returns a new strings column converting a timestamp column into + strings using the provided format pattern. + + For details, see cpp:`cudf::strings::from_timestamps`. 
+ + Parameters + ---------- + timestamps : Column + Timestamp values to convert + + format : str + The string specifying output format. + + input_strings_names : Column + The string names to use for weekdays ("%a", "%A") and months ("%b", "%B"). + + Returns + ------- + Column + New strings column with formatted timestamps. + """ cdef unique_ptr[column] c_result + cdef string c_format = format.encode() with nogil: c_result = cpp_convert_datetime.from_timestamps( - input.view(), - format, + timestamps.view(), + c_format, input_strings_names.view() ) @@ -44,13 +91,33 @@ cpdef Column from_timestamps( cpdef Column is_timestamp( Column input, - const string& format + str format ): + """ + Verifies the given strings column can be parsed to timestamps + using the provided format pattern. + + For details, see cpp:`cudf::strings::is_timestamp`. + + Parameters + ---------- + input : Column + Strings instance for this operation. + + format : str + String specifying the timestamp format in strings. + + Returns + ------- + Column + New bool column. + """ cdef unique_ptr[column] c_result + cdef string c_format = format.encode() with nogil: c_result = cpp_convert_datetime.is_timestamp( input.view(), - format + c_format ) return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pxd b/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pxd index ac11b8959ed..eecdade4ef9 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pxd +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pxd @@ -8,10 +8,10 @@ from pylibcudf.types cimport DataType cpdef Column to_durations( Column input, DataType duration_type, - const string& format + str format ) cpdef Column from_durations( - Column input, - const string& format + Column durations, + str format=* ) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyi b/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyi new file mode 100644 index 00000000000..a5787a5fe49 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyi @@ -0,0 +1,9 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.types import DataType + +def to_durations( + input: Column, duration_type: DataType, format: str +) -> Column: ... +def from_durations(durations: Column, format: str | None = None) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyx index f3e0b7c9c8e..a9654afd00a 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyx +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyx @@ -11,31 +11,81 @@ from pylibcudf.libcudf.strings.convert cimport ( from pylibcudf.types import DataType +__all__ = ["from_durations", "to_durations"] cpdef Column to_durations( Column input, DataType duration_type, - const string& format + str format ): + """ + Returns a new duration column converting a strings column into + durations using the provided format pattern. + + For details, see cpp:func:`cudf::strings::to_durations` + + Parameters + ---------- + input : Column + Strings instance for this operation. + + duration_type : DataType + The duration type used for creating the output column. + + format : str + String specifying the duration format in strings. + + Returns + ------- + Column + New duration column. 
+ """ cdef unique_ptr[column] c_result + cdef string c_format = format.encode() + with nogil: c_result = cpp_convert_durations.to_durations( input.view(), duration_type.c_obj, - format + c_format ) return Column.from_libcudf(move(c_result)) cpdef Column from_durations( - Column input, - const string& format + Column durations, + str format=None ): + """ + Returns a new strings column converting a duration column into + strings using the provided format pattern. + + For details, see cpp:func:`cudf::strings::from_durations` + + Parameters + ---------- + durations : Column + Duration values to convert. + + format : str + The string specifying output format. + Default format is "%D days %H:%M:%S". + + Returns + ------- + Column + New strings column with formatted durations. + """ cdef unique_ptr[column] c_result + + if format is None: + format = "%D days %H:%M:%S" + cdef string c_format = format.encode() + with nogil: c_result = cpp_convert_durations.from_durations( - input.view(), - format + durations.view(), + c_format ) return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pxd b/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pxd new file mode 100644 index 00000000000..049b9b3fffe --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pxd @@ -0,0 +1,11 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column +from pylibcudf.types cimport DataType + + +cpdef Column to_fixed_point(Column input, DataType output_type) + +cpdef Column from_fixed_point(Column input) + +cpdef Column is_fixed_point(Column input, DataType decimal_type=*) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pyi b/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pyi new file mode 100644 index 00000000000..1192d3dfcd6 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pyi @@ -0,0 +1,10 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.types import DataType + +def to_fixed_point(input: Column, output_type: DataType) -> Column: ... +def from_fixed_point(input: Column) -> Column: ... +def is_fixed_point( + input: Column, decimal_type: DataType | None = None +) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pyx new file mode 100644 index 00000000000..00cbc822f36 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pyx @@ -0,0 +1,101 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.strings.convert cimport ( + convert_fixed_point as cpp_fixed_point, +) +from pylibcudf.types cimport DataType, type_id + +__all__ = ["from_fixed_point", "is_fixed_point", "to_fixed_point"] + + +cpdef Column to_fixed_point(Column input, DataType output_type): + """ + Returns a new fixed-point column parsing decimal values from the + provided strings column. + + For details, see :cpp:func:`cudf::strings::to_fixed_point` + + Parameters + ---------- + input : Column + Strings instance for this operation. + + output_type : DataType + Type of fixed-point column to return including the scale value. + + Returns + ------- + Column + New column of output_type. 
+ """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_fixed_point.to_fixed_point( + input.view(), + output_type.c_obj, + ) + + return Column.from_libcudf(move(c_result)) + +cpdef Column from_fixed_point(Column input): + """ + Returns a new strings column converting the fixed-point values + into a strings column. + + For details, see :cpp:func:`cudf::strings::from_fixed_point` + + Parameters + ---------- + input : Column + Fixed-point column to convert. + + Returns + ------- + Column + New strings column. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_fixed_point.from_fixed_point(input.view()) + + return Column.from_libcudf(move(c_result)) + +cpdef Column is_fixed_point(Column input, DataType decimal_type=None): + """ + Returns a boolean column identifying strings in which all + characters are valid for conversion to fixed-point. + + For details, see :cpp:func:`cudf::strings::is_fixed_point` + + Parameters + ---------- + input : Column + Strings instance for this operation. + + decimal_type : DataType + Fixed-point type (with scale) used only for checking overflow. + Defaults to Decimal64 + + Returns + ------- + Column + New column of boolean results for each string. + """ + cdef unique_ptr[column] c_result + + if decimal_type is None: + decimal_type = DataType(type_id.DECIMAL64) + + with nogil: + c_result = cpp_fixed_point.is_fixed_point( + input.view(), + decimal_type.c_obj, + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pxd b/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pxd new file mode 100644 index 00000000000..1284ff552aa --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pxd @@ -0,0 +1,11 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column +from pylibcudf.types cimport DataType + + +cpdef Column to_floats(Column strings, DataType output_type) + +cpdef Column from_floats(Column floats) + +cpdef Column is_float(Column input) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pyi b/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pyi new file mode 100644 index 00000000000..ddf4042e10d --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pyi @@ -0,0 +1,8 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.types import DataType + +def to_floats(strings: Column, output_type: DataType) -> Column: ... +def from_floats(floats: Column) -> Column: ... +def is_float(input: Column) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pyx new file mode 100644 index 00000000000..b5199aac577 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pyx @@ -0,0 +1,92 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.strings.convert cimport ( + convert_floats as cpp_convert_floats, +) +from pylibcudf.types cimport DataType + +__all__ = ["from_floats", "is_float", "to_floats"] + +cpdef Column to_floats(Column strings, DataType output_type): + """ + Returns a new numeric column by parsing float values from each string + in the provided strings column. 
+ + For details, see cpp:func:`cudf::strings::to_floats` + + Parameters + ---------- + strings : Column + Strings instance for this operation. + + output_type : DataType + Type of float numeric column to return. + + Returns + ------- + Column + New column with floats converted from strings. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_convert_floats.to_floats( + strings.view(), + output_type.c_obj, + ) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column from_floats(Column floats): + """ + Returns a new strings column converting the float values from the + provided column into strings. + + For details, see cpp:func:`cudf::strings::from_floats` + + Parameters + ---------- + floats : Column + Numeric column to convert. + + Returns + ------- + Column + New strings column with floats as strings. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_convert_floats.from_floats(floats.view()) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column is_float(Column input): + """ + Returns a boolean column identifying strings in which all + characters are valid for conversion to floats. + + For details, see cpp:func:`cudf::strings::is_float` + + Parameters + ---------- + input : Column + Strings instance for this operation. + + Returns + ------- + Column + New column of boolean results for each string. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_convert_floats.is_float(input.view()) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_integers.pxd b/python/pylibcudf/pylibcudf/strings/convert/convert_integers.pxd new file mode 100644 index 00000000000..eff2e080c27 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_integers.pxd @@ -0,0 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column +from pylibcudf.types cimport DataType + + +cpdef Column to_integers(Column input, DataType output_type) + +cpdef Column from_integers(Column integers) + +cpdef Column is_integer(Column input, DataType int_type=*) + +cpdef Column hex_to_integers(Column input, DataType output_type) + +cpdef Column is_hex(Column input) + +cpdef Column integers_to_hex(Column input) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_integers.pyi b/python/pylibcudf/pylibcudf/strings/convert/convert_integers.pyi new file mode 100644 index 00000000000..b96226fba90 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_integers.pyi @@ -0,0 +1,11 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.types import DataType + +def to_integers(input: Column, output_type: DataType) -> Column: ... +def from_integers(integers: Column) -> Column: ... +def is_integer(input: Column, int_type: DataType | None = None) -> Column: ... +def hex_to_integers(input: Column, output_type: DataType) -> Column: ... +def is_hex(input: Column) -> Column: ... +def integers_to_hex(input: Column) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_integers.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_integers.pyx new file mode 100644 index 00000000000..12984e15ce9 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_integers.pyx @@ -0,0 +1,214 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+
+from libcpp.memory cimport unique_ptr
+from libcpp.utility cimport move
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.strings.convert cimport (
+    convert_integers as cpp_convert_integers,
+)
+from pylibcudf.types cimport DataType
+
+__all__ = [
+    "from_integers",
+    "hex_to_integers",
+    "integers_to_hex",
+    "is_hex",
+    "is_integer",
+    "to_integers"
+]
+
+cpdef Column to_integers(Column input, DataType output_type):
+    """
+    Returns a new integer numeric column parsing integer values from the
+    provided strings column.
+
+    For details, see :cpp:func:`cudf::strings::to_integers`.
+
+    Parameters
+    ----------
+    input : Column
+        Strings instance for this operation.
+
+    output_type : DataType
+        Type of integer numeric column to return.
+
+    Returns
+    -------
+    Column
+        New column with integers converted from strings.
+    """
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = move(
+            cpp_convert_integers.to_integers(
+                input.view(),
+                output_type.c_obj
+            )
+        )
+
+    return Column.from_libcudf(move(c_result))
+
+
+cpdef Column from_integers(Column integers):
+    """
+    Returns a new strings column converting the integer values from the
+    provided column into strings.
+
+    For details, see :cpp:func:`cudf::strings::from_integers`.
+
+    Parameters
+    ----------
+    integers : Column
+        Integer column to convert.
+
+    Returns
+    -------
+    Column
+        New strings column with integers as strings.
+    """
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = move(
+            cpp_convert_integers.from_integers(
+                integers.view(),
+            )
+        )
+
+    return Column.from_libcudf(move(c_result))
+
+
+cpdef Column is_integer(Column input, DataType int_type=None):
+    """
+    Returns a boolean column identifying strings in which all
+    characters are valid for conversion to integers.
+
+    For details, see :cpp:func:`cudf::strings::is_integer`.
+
+    Parameters
+    ----------
+    input : Column
+        Strings instance for this operation.
+
+    int_type : DataType
+        Integer type used for checking underflow and overflow.
+        By default, does not check an integer type for underflow
+        or overflow.
+
+    Returns
+    -------
+    Column
+        New column of boolean results for each string.
+    """
+    cdef unique_ptr[column] c_result
+
+    if int_type is None:
+        with nogil:
+            c_result = move(
+                cpp_convert_integers.is_integer(
+                    input.view(),
+                )
+            )
+    else:
+        with nogil:
+            c_result = move(
+                cpp_convert_integers.is_integer(
+                    input.view(),
+                    int_type.c_obj
+                )
+            )
+
+    return Column.from_libcudf(move(c_result))
+
+
+cpdef Column hex_to_integers(Column input, DataType output_type):
+    """
+    Returns a new integer numeric column parsing hexadecimal values
+    from the provided strings column.
+
+    For details, see :cpp:func:`cudf::strings::hex_to_integers`.
+
+    Parameters
+    ----------
+    input : Column
+        Strings instance for this operation.
+
+    output_type : DataType
+        Type of integer numeric column to return.
+
+    Returns
+    -------
+    Column
+        New column with integers converted from strings.
+    """
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = move(
+            cpp_convert_integers.hex_to_integers(
+                input.view(),
+                output_type.c_obj
+            )
+        )
+
+    return Column.from_libcudf(move(c_result))
+
+
+cpdef Column is_hex(Column input):
+    """
+    Returns a boolean column identifying strings in which all
+    characters are valid for conversion to integers from hex.
+
+    For details, see :cpp:func:`cudf::strings::is_hex`.
+
+    Parameters
+    ----------
+    input : Column
+        Strings instance for this operation.
+ + Returns + ------- + Column + New column of boolean results for each string. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_convert_integers.is_hex( + input.view(), + ) + ) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column integers_to_hex(Column input): + """ + Returns a new strings column converting integer columns to hexadecimal + characters. + + For details, cpp:func:`cudf::strings::integers_to_hex`. + + Parameters + ---------- + input : Column + Integer column to convert to hex. + + Returns + ------- + Column + New strings column with hexadecimal characters. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_convert_integers.integers_to_hex( + input.view(), + ) + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pxd b/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pxd new file mode 100644 index 00000000000..c61f5c0bdca --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pxd @@ -0,0 +1,10 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column + + +cpdef Column ipv4_to_integers(Column input) + +cpdef Column integers_to_ipv4(Column integers) + +cpdef Column is_ipv4(Column input) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pyi b/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pyi new file mode 100644 index 00000000000..b017b32598c --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pyi @@ -0,0 +1,7 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column + +def ipv4_to_integers(input: Column) -> Column: ... +def integers_to_ipv4(integers: Column) -> Column: ... +def is_ipv4(input: Column) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pyx new file mode 100644 index 00000000000..e7c6aae4fa8 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pyx @@ -0,0 +1,81 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.strings.convert cimport convert_ipv4 as cpp_convert_ipv4 + +__all__ = ["integers_to_ipv4", "ipv4_to_integers", "is_ipv4"] + +cpdef Column ipv4_to_integers(Column input): + """ + Converts IPv4 addresses into integers. + + For details, see cpp:func:`cudf::strings::ipv4_to_integers` + + Parameters + ---------- + input : Column + Strings instance for this operation + + Returns + ------- + Column + New uint32 column converted from strings. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_convert_ipv4.ipv4_to_integers(input.view()) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column integers_to_ipv4(Column integers): + """ + Converts integers into IPv4 addresses as strings. + + For details, see cpp:func:`cudf::strings::integers_to_ipv4` + + Parameters + ---------- + integers : Column + Integer (uint32) column to convert. + + Returns + ------- + Column + New strings column. 
+ """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_convert_ipv4.integers_to_ipv4(integers.view()) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column is_ipv4(Column input): + """ + Returns a boolean column identifying strings in which all + characters are valid for conversion to integers from IPv4 format. + + For details, see cpp:func:`cudf::strings::is_ipv4` + + Parameters + ---------- + input : Column + Strings instance for this operation. + + Returns + ------- + Column + New column of boolean results for each string. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_convert_ipv4.is_ipv4(input.view()) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_lists.pxd b/python/pylibcudf/pylibcudf/strings/convert/convert_lists.pxd new file mode 100644 index 00000000000..1ba4272afa2 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_lists.pxd @@ -0,0 +1,11 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column +from pylibcudf.scalar cimport Scalar + + +cpdef Column format_list_column( + Column input, + Scalar na_rep=*, + Column separators=* +) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_lists.pyi b/python/pylibcudf/pylibcudf/strings/convert/convert_lists.pyi new file mode 100644 index 00000000000..6ab3a4183e9 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_lists.pyi @@ -0,0 +1,10 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar + +def format_list_column( + input: Column, + na_rep: Scalar | None = None, + separators: Column | None = None, +) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_lists.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_lists.pyx new file mode 100644 index 00000000000..518f72f6644 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_lists.pyx @@ -0,0 +1,71 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.column_factories cimport make_empty_column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.scalar.scalar cimport string_scalar +from pylibcudf.libcudf.scalar.scalar_factories cimport ( + make_string_scalar as cpp_make_string_scalar, +) +from pylibcudf.libcudf.strings.convert cimport ( + convert_lists as cpp_convert_lists, +) +from pylibcudf.scalar cimport Scalar +from pylibcudf.types cimport type_id + +from cython.operator import dereference + +__all__ = ["format_list_column"] + +cpdef Column format_list_column( + Column input, + Scalar na_rep=None, + Column separators=None +): + """ + Convert a list column of strings into a formatted strings column. + + For details, see :cpp:func`cudf::strings::format_list_column` + + Parameters + ---------- + input : Column + Lists column to format + + na_rep : Scalar + Replacement string for null elements. + Default, empty string + + separators : Column + Strings to use for enclosing list components and separating elements. 
+ Default, ``,``, ``[``, ``]`` + + Returns + ------- + Column + New strings column + """ + cdef unique_ptr[column] c_result + + if na_rep is None: + na_rep = Scalar.from_libcudf( + cpp_make_string_scalar("".encode()) + ) + + cdef const string_scalar* c_na_rep = ( + na_rep.c_obj.get() + ) + + if separators is None: + separators = make_empty_column(type_id.STRING) + + with nogil: + c_result = cpp_convert_lists.format_list_column( + input.view(), + dereference(c_na_rep), + separators.view() + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_urls.pxd b/python/pylibcudf/pylibcudf/strings/convert/convert_urls.pxd new file mode 100644 index 00000000000..da05ce93426 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_urls.pxd @@ -0,0 +1,8 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column + + +cpdef Column url_encode(Column Input) + +cpdef Column url_decode(Column Input) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_urls.pyi b/python/pylibcudf/pylibcudf/strings/convert/convert_urls.pyi new file mode 100644 index 00000000000..49b8468957c --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_urls.pyi @@ -0,0 +1,6 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column + +def url_encode(input: Column) -> Column: ... +def url_decode(input: Column) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_urls.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_urls.pyx new file mode 100644 index 00000000000..bd5e23bca43 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_urls.pyx @@ -0,0 +1,56 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.strings.convert cimport convert_urls as cpp_convert_urls + +__all__ = ["url_decode", "url_encode"] + +cpdef Column url_encode(Column input): + """ + Encodes each string using URL encoding. + + For details, see :cpp:func:`cudf::strings::url_encode` + + Parameters + ---------- + input : Column + Strings instance for this operation. + + Returns + ------- + Column + New strings column. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_convert_urls.url_encode(input.view()) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column url_decode(Column input): + """ + Decodes each string using URL encoding. + + For details, see :cpp:func:`cudf::strings::url_decode` + + Parameters + ---------- + input : Column + Strings instance for this operation. + + Returns + ------- + Column + New strings column. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_convert_urls.url_decode(input.view()) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/extract.pyi b/python/pylibcudf/pylibcudf/strings/extract.pyi new file mode 100644 index 00000000000..4354bd3072d --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/extract.pyi @@ -0,0 +1,8 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.strings.regex_program import RegexProgram +from pylibcudf.table import Table + +def extract(input: Column, prog: RegexProgram) -> Table: ... +def extract_all_record(input: Column, prog: RegexProgram) -> Column: ... 
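The convert_* bindings added above all share one call shape: pass a pylibcudf Column (plus a DataType or Scalar option) and get a new Column back. A minimal usage sketch, assuming the pyarrow interop helpers (pylibcudf.interop.from_arrow / to_arrow) and the top-level DataType/TypeId re-exports that pylibcudf's own tests use; the sample data is illustrative and not taken from this diff:

import pyarrow as pa
import pylibcudf as plc

# Strings -> INT64 and back via the new convert_integers module.
s = plc.interop.from_arrow(pa.array(["12", "0", None, "34"]))
ints = plc.strings.convert.convert_integers.to_integers(
    s, plc.DataType(plc.TypeId.INT64)
)
back = plc.strings.convert.convert_integers.from_integers(ints)

# Row-wise validity check; returns a BOOL8 column, one flag per row.
ok = plc.strings.convert.convert_integers.is_integer(s)
print(plc.interop.to_arrow(ok))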
diff --git a/python/pylibcudf/pylibcudf/strings/extract.pyx b/python/pylibcudf/pylibcudf/strings/extract.pyx index dcb11ca10ce..0ce70666e92 100644 --- a/python/pylibcudf/pylibcudf/strings/extract.pyx +++ b/python/pylibcudf/pylibcudf/strings/extract.pyx @@ -9,6 +9,7 @@ from pylibcudf.libcudf.table.table cimport table from pylibcudf.strings.regex_program cimport RegexProgram from pylibcudf.table cimport Table +__all__ = ["extract", "extract_all_record"] cpdef Table extract(Column input, RegexProgram prog): """ @@ -33,11 +34,9 @@ cpdef Table extract(Column input, RegexProgram prog): cdef unique_ptr[table] c_result with nogil: - c_result = move( - cpp_extract.extract( - input.view(), - prog.c_obj.get()[0] - ) + c_result = cpp_extract.extract( + input.view(), + prog.c_obj.get()[0] ) return Table.from_libcudf(move(c_result)) @@ -66,11 +65,9 @@ cpdef Column extract_all_record(Column input, RegexProgram prog): cdef unique_ptr[column] c_result with nogil: - c_result = move( - cpp_extract.extract_all_record( - input.view(), - prog.c_obj.get()[0] - ) + c_result = cpp_extract.extract_all_record( + input.view(), + prog.c_obj.get()[0] ) return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/find.pyi b/python/pylibcudf/pylibcudf/strings/find.pyi new file mode 100644 index 00000000000..3d04a9c3161 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/find.pyi @@ -0,0 +1,14 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar + +def find( + input: Column, target: Column | Scalar, start: int = 0, stop: int = -1 +) -> Column: ... +def rfind( + input: Column, target: Scalar, start: int = 0, stop: int = -1 +) -> Column: ... +def contains(input: Column, target: Column | Scalar) -> Column: ... +def starts_with(input: Column, target: Column | Scalar) -> Column: ... +def ends_with(input: Column, target: Column | Scalar) -> Column: ... 
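find.pyx below dispatches on the fused ColumnOrScalar target declared in find.pyi above: a Scalar target searches for one substring in every row, while a Column target pairs rows elementwise. A hedged sketch of the scalar form, again assuming the pyarrow interop helpers and sample data:

import pyarrow as pa
import pylibcudf as plc

col = plc.interop.from_arrow(pa.array(["hello", "world", None]))
target = plc.interop.from_arrow(pa.scalar("o"))

# Character offset of the first match per row (4, 1, null here);
# -1 marks rows where the target does not occur.
pos = plc.strings.find.find(col, target, 0, -1)

# Boolean containment test with the same scalar target.
has = plc.strings.find.contains(col, target)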
diff --git a/python/pylibcudf/pylibcudf/strings/find.pyx b/python/pylibcudf/pylibcudf/strings/find.pyx
index 22d370bf7e8..f0af339ff08 100644
--- a/python/pylibcudf/pylibcudf/strings/find.pyx
+++ b/python/pylibcudf/pylibcudf/strings/find.pyx
@@ -10,6 +10,7 @@ from cython.operator import dereference
 
 from pylibcudf.libcudf.scalar.scalar cimport string_scalar
 
+__all__ = ["contains", "ends_with", "find", "rfind", "starts_with"]
 
 cpdef Column find(
     Column input,
@@ -50,22 +51,18 @@
     cdef unique_ptr[column] result
     if ColumnOrScalar is Column:
         with nogil:
-            result = move(
-                cpp_find.find(
-                    input.view(),
-                    target.view(),
-                    start
-                )
+            result = cpp_find.find(
+                input.view(),
+                target.view(),
+                start
             )
     elif ColumnOrScalar is Scalar:
         with nogil:
-            result = move(
-                cpp_find.find(
-                    input.view(),
-                    dereference(<string_scalar*>(target.c_obj.get())),
-                    start,
-                    stop
-                )
+            result = cpp_find.find(
+                input.view(),
+                dereference(<string_scalar*>(target.c_obj.get())),
+                start,
+                stop
             )
     else:
         raise ValueError(f"Invalid target {target}")
@@ -104,13 +101,11 @@
     """
     cdef unique_ptr[column] result
     with nogil:
-        result = move(
-            cpp_find.rfind(
-                input.view(),
-                dereference(<string_scalar*>(target.c_obj.get())),
-                start,
-                stop
-            )
+        result = cpp_find.rfind(
+            input.view(),
+            dereference(<string_scalar*>(target.c_obj.get())),
+            start,
+            stop
         )
 
     return Column.from_libcudf(move(result))
@@ -149,19 +144,15 @@
     cdef unique_ptr[column] result
    if ColumnOrScalar is Column:
         with nogil:
-            result = move(
-                cpp_find.contains(
-                    input.view(),
-                    target.view()
-                )
+            result = cpp_find.contains(
+                input.view(),
+                target.view()
             )
     elif ColumnOrScalar is Scalar:
         with nogil:
-            result = move(
-                cpp_find.contains(
-                    input.view(),
-                    dereference(<string_scalar*>(target.c_obj.get()))
-                )
+            result = cpp_find.contains(
+                input.view(),
+                dereference(<string_scalar*>(target.c_obj.get()))
            )
     else:
         raise ValueError(f"Invalid target {target}")
@@ -204,19 +195,15 @@
 
     if ColumnOrScalar is Column:
         with nogil:
-            result = move(
-                cpp_find.starts_with(
-                    input.view(),
-                    target.view()
-                )
+            result = cpp_find.starts_with(
+                input.view(),
+                target.view()
             )
     elif ColumnOrScalar is Scalar:
         with nogil:
-            result = move(
-                cpp_find.starts_with(
-                    input.view(),
-                    dereference(<string_scalar*>(target.c_obj.get()))
-                )
+            result = cpp_find.starts_with(
+                input.view(),
+                dereference(<string_scalar*>(target.c_obj.get()))
             )
     else:
         raise ValueError(f"Invalid target {target}")
@@ -256,19 +243,15 @@
     cdef unique_ptr[column] result
     if ColumnOrScalar is Column:
         with nogil:
-            result = move(
-                cpp_find.ends_with(
-                    input.view(),
-                    target.view()
-                )
+            result = cpp_find.ends_with(
+                input.view(),
+                target.view()
             )
     elif ColumnOrScalar is Scalar:
         with nogil:
-            result = move(
-                cpp_find.ends_with(
-                    input.view(),
-                    dereference(<string_scalar*>(target.c_obj.get()))
-                )
+            result = cpp_find.ends_with(
+                input.view(),
+                dereference(<string_scalar*>(target.c_obj.get()))
             )
     else:
         raise ValueError(f"Invalid target {target}")
diff --git a/python/pylibcudf/pylibcudf/strings/find_multiple.pxd b/python/pylibcudf/pylibcudf/strings/find_multiple.pxd
new file mode 100644
index 00000000000..b7b3aefa336
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/find_multiple.pxd
@@ -0,0 +1,6 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+ +from pylibcudf.column cimport Column + + +cpdef Column find_multiple(Column input, Column targets) diff --git a/python/pylibcudf/pylibcudf/strings/find_multiple.pyi b/python/pylibcudf/pylibcudf/strings/find_multiple.pyi new file mode 100644 index 00000000000..3d46fd2fa6d --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/find_multiple.pyi @@ -0,0 +1,5 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column + +def find_multiple(input: Column, targets: Column) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/find_multiple.pyx b/python/pylibcudf/pylibcudf/strings/find_multiple.pyx new file mode 100644 index 00000000000..c9ce734b4be --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/find_multiple.pyx @@ -0,0 +1,38 @@ +# Copyright (c) 2020-2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.strings cimport find_multiple as cpp_find_multiple + +__all__ = ["find_multiple"] + +cpdef Column find_multiple(Column input, Column targets): + """ + Returns a lists column with character position values where each + of the target strings are found in each string. + + For details, see :cpp:func:`cudf::strings::find_multiple`. + + Parameters + ---------- + input : Column + Strings instance for this operation + targets : Column + Strings to search for in each string + + Returns + ------- + Column + Lists column with character position values + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_find_multiple.find_multiple( + input.view(), + targets.view() + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/findall.pxd b/python/pylibcudf/pylibcudf/strings/findall.pxd index 54afa088141..3c35a9c9aa9 100644 --- a/python/pylibcudf/pylibcudf/strings/findall.pxd +++ b/python/pylibcudf/pylibcudf/strings/findall.pxd @@ -4,4 +4,5 @@ from pylibcudf.column cimport Column from pylibcudf.strings.regex_program cimport RegexProgram +cpdef Column find_re(Column input, RegexProgram pattern) cpdef Column findall(Column input, RegexProgram pattern) diff --git a/python/pylibcudf/pylibcudf/strings/findall.pyi b/python/pylibcudf/pylibcudf/strings/findall.pyi new file mode 100644 index 00000000000..77e38581d22 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/findall.pyi @@ -0,0 +1,7 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.strings.regex_program import RegexProgram + +def find_re(input: Column, pattern: RegexProgram) -> Column: ... +def findall(input: Column, pattern: RegexProgram) -> Column: ... 
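findall returns a LIST&lt;STRING&gt; column holding every match per row, while the new find_re returns the character position of the first match (-1 when there is none). A short sketch: RegexProgram.create and the two calls follow the signatures shown above, while the interop helpers and sample data are assumptions:

import pyarrow as pa
import pylibcudf as plc

prog = plc.strings.regex_program.RegexProgram.create(
    "[0-9]+", plc.strings.regex_flags.RegexFlags.DEFAULT
)
col = plc.interop.from_arrow(pa.array(["a1b22", "xyz", "3"]))

matches = plc.strings.findall.findall(col, prog)   # ["1", "22"], [], ["3"]
first = plc.strings.findall.find_re(col, prog)     # 1, -1, 0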
diff --git a/python/pylibcudf/pylibcudf/strings/findall.pyx b/python/pylibcudf/pylibcudf/strings/findall.pyx
index 3a6b87504b3..23c84675a16 100644
--- a/python/pylibcudf/pylibcudf/strings/findall.pyx
+++ b/python/pylibcudf/pylibcudf/strings/findall.pyx
@@ -7,6 +7,7 @@ from pylibcudf.libcudf.column.column cimport column
 from pylibcudf.libcudf.strings cimport findall as cpp_findall
 from pylibcudf.strings.regex_program cimport RegexProgram
 
+__all__ = ["findall", "find_re"]
 
 cpdef Column findall(Column input, RegexProgram pattern):
     """
@@ -30,11 +31,39 @@ cpdef Column findall(Column input, RegexProgram pattern):
     cdef unique_ptr[column] c_result
 
     with nogil:
-        c_result = move(
-            cpp_findall.findall(
-                input.view(),
-                pattern.c_obj.get()[0]
-            )
+        c_result = cpp_findall.findall(
+            input.view(),
+            pattern.c_obj.get()[0]
+        )
+
+    return Column.from_libcudf(move(c_result))
+
+
+cpdef Column find_re(Column input, RegexProgram pattern):
+    """
+    Returns character positions where the pattern first matches
+    the elements in input strings.
+
+    For details, see :cpp:func:`cudf::strings::find_re`
+
+    Parameters
+    ----------
+    input : Column
+        Strings instance for this operation
+    pattern : RegexProgram
+        Regex pattern
+
+    Returns
+    -------
+    Column
+        New column of integers
+    """
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = cpp_findall.find_re(
+            input.view(),
+            pattern.c_obj.get()[0]
         )
 
     return Column.from_libcudf(move(c_result))
diff --git a/python/pylibcudf/pylibcudf/strings/padding.pxd b/python/pylibcudf/pylibcudf/strings/padding.pxd
new file mode 100644
index 00000000000..a035a5ad187
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/padding.pxd
@@ -0,0 +1,11 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from libcpp.string cimport string
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.strings.side_type cimport side_type
+from pylibcudf.libcudf.types cimport size_type
+
+
+cpdef Column pad(Column input, size_type width, side_type side, str fill_char)
+
+cpdef Column zfill(Column input, size_type width)
diff --git a/python/pylibcudf/pylibcudf/strings/padding.pyi b/python/pylibcudf/pylibcudf/strings/padding.pyi
new file mode 100644
index 00000000000..a991935e6e5
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/padding.pyi
@@ -0,0 +1,9 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.column import Column
+from pylibcudf.strings.side_type import SideType
+
+def pad(
+    input: Column, width: int, side: SideType, fill_char: str
+) -> Column: ...
+def zfill(input: Column, width: int) -> Column: ...
diff --git a/python/pylibcudf/pylibcudf/strings/padding.pyx b/python/pylibcudf/pylibcudf/strings/padding.pyx
new file mode 100644
index 00000000000..0e349a7be47
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/padding.pyx
@@ -0,0 +1,74 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+from libcpp.memory cimport unique_ptr
+from libcpp.string cimport string
+from libcpp.utility cimport move
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.strings cimport padding as cpp_padding
+from pylibcudf.libcudf.strings.side_type cimport side_type
+from pylibcudf.libcudf.types cimport size_type
+
+__all__ = ["pad", "zfill"]
+
+cpdef Column pad(Column input, size_type width, side_type side, str fill_char):
+    """
+    Add padding to each string using a provided character.
+
+    For details, see :cpp:func:`cudf::strings::pad`.
+
+    Parameters
+    ----------
+    input : Column
+        Strings instance for this operation
+    width : int
+        The minimum number of characters for each string.
+ side : SideType + Where to place the padding characters. + fill_char : str + Single UTF-8 character to use for padding + + Returns + ------- + Column + New column with padded strings. + """ + cdef unique_ptr[column] c_result + cdef string c_fill_char = fill_char.encode("utf-8") + + with nogil: + c_result = cpp_padding.pad( + input.view(), + width, + side, + c_fill_char, + ) + + return Column.from_libcudf(move(c_result)) + +cpdef Column zfill(Column input, size_type width): + """ + Add '0' as padding to the left of each string. + + For details, see :cpp:func:`cudf::strings::zfill`. + + Parameters + ---------- + input : Column + Strings instance for this operation + width : int + The minimum number of characters for each string. + + Returns + ------- + Column + New column of strings. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_padding.zfill( + input.view(), + width, + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/regex_flags.pyi b/python/pylibcudf/pylibcudf/strings/regex_flags.pyi new file mode 100644 index 00000000000..c551cebf181 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/regex_flags.pyi @@ -0,0 +1,7 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from enum import IntEnum + +class RegexFlags(IntEnum): + DEFAULT = ... + MULTILINE = ... + DOTALL = ... diff --git a/python/pylibcudf/pylibcudf/strings/regex_flags.pyx b/python/pylibcudf/pylibcudf/strings/regex_flags.pyx index ce3b6b10a42..65b504e0dc7 100644 --- a/python/pylibcudf/pylibcudf/strings/regex_flags.pyx +++ b/python/pylibcudf/pylibcudf/strings/regex_flags.pyx @@ -2,3 +2,5 @@ from pylibcudf.libcudf.strings.regex_flags import \ regex_flags as RegexFlags # no-cython-lint + +__all__ = ["RegexFlags"] diff --git a/python/pylibcudf/pylibcudf/strings/regex_program.pyi b/python/pylibcudf/pylibcudf/strings/regex_program.pyi new file mode 100644 index 00000000000..9abd6fa7802 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/regex_program.pyi @@ -0,0 +1,8 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.strings.regex_flags import RegexFlags + +class RegexProgram: + def __init__(self): ... + @staticmethod + def create(pattern: str, flags: RegexFlags) -> RegexProgram: ... diff --git a/python/pylibcudf/pylibcudf/strings/regex_program.pyx b/python/pylibcudf/pylibcudf/strings/regex_program.pyx index f426b6888ae..46bfde074d2 100644 --- a/python/pylibcudf/pylibcudf/strings/regex_program.pyx +++ b/python/pylibcudf/pylibcudf/strings/regex_program.pyx @@ -11,6 +11,7 @@ from pylibcudf.strings.regex_flags import RegexFlags from pylibcudf.strings.regex_flags cimport regex_flags +__all__ = ["RegexProgram"] cdef class RegexProgram: """Regex program class. @@ -24,6 +25,8 @@ cdef class RegexProgram: def __init__(self, *args, **kwargs): raise ValueError("Do not instantiate RegexProgram directly, use create") + __hash__ = None + @staticmethod def create(str pattern, int flags): """Create a program from a pattern. @@ -37,6 +40,10 @@ cdef class RegexProgram: flags : Uniont[int, RegexFlags] Regex flags for interpreting special characters in the pattern + Returns + ------- + RegexProgram + A new RegexProgram """ cdef unique_ptr[regex_program] c_prog cdef regex_flags c_flags diff --git a/python/pylibcudf/pylibcudf/strings/repeat.pyi b/python/pylibcudf/pylibcudf/strings/repeat.pyi new file mode 100644 index 00000000000..93a46b71caa --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/repeat.pyi @@ -0,0 +1,5 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +from pylibcudf.column import Column + +def repeat_strings(input: Column, repeat_times: Column | int) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/repeat.pyx b/python/pylibcudf/pylibcudf/strings/repeat.pyx index 5f627218f6e..a497b1f438e 100644 --- a/python/pylibcudf/pylibcudf/strings/repeat.pyx +++ b/python/pylibcudf/pylibcudf/strings/repeat.pyx @@ -6,6 +6,7 @@ from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.strings cimport repeat as cpp_repeat from pylibcudf.libcudf.types cimport size_type +__all__ = ["repeat_strings"] cpdef Column repeat_strings(Column input, ColumnorSizeType repeat_times): """ @@ -31,19 +32,15 @@ cpdef Column repeat_strings(Column input, ColumnorSizeType repeat_times): if ColumnorSizeType is Column: with nogil: - c_result = move( - cpp_repeat.repeat_strings( - input.view(), - repeat_times.view() - ) + c_result = cpp_repeat.repeat_strings( + input.view(), + repeat_times.view() ) elif ColumnorSizeType is size_type: with nogil: - c_result = move( - cpp_repeat.repeat_strings( - input.view(), - repeat_times - ) + c_result = cpp_repeat.repeat_strings( + input.view(), + repeat_times ) else: raise ValueError("repeat_times must be size_type or integer") diff --git a/python/pylibcudf/pylibcudf/strings/replace.pyi b/python/pylibcudf/pylibcudf/strings/replace.pyi new file mode 100644 index 00000000000..64df09ef7e8 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/replace.pyi @@ -0,0 +1,14 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar + +def replace( + input: Column, target: Scalar, repl: Scalar, maxrepl: int = -1 +) -> Column: ... +def replace_multiple( + input: Column, target: Column, repl: Column, maxrepl: int = -1 +) -> Column: ... +def replace_slice( + input: Column, repl: Scalar | None = None, start: int = 0, stop: int = -1 +) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/replace.pyx b/python/pylibcudf/pylibcudf/strings/replace.pyx index 9d0ebf4a814..3ba6c1b5530 100644 --- a/python/pylibcudf/pylibcudf/strings/replace.pyx +++ b/python/pylibcudf/pylibcudf/strings/replace.pyx @@ -16,6 +16,7 @@ from pylibcudf.libcudf.strings.replace cimport ( from pylibcudf.libcudf.types cimport size_type from pylibcudf.scalar cimport Scalar +__all__ = ["replace", "replace_multiple", "replace_slice"] cpdef Column replace( Column input, @@ -55,12 +56,12 @@ cpdef Column replace( repl_str = (repl.c_obj.get()) with nogil: - c_result = move(cpp_replace( + c_result = cpp_replace( input.view(), target_str[0], repl_str[0], maxrepl, - )) + ) return Column.from_libcudf(move(c_result)) @@ -98,11 +99,11 @@ cpdef Column replace_multiple( cdef unique_ptr[column] c_result with nogil: - c_result = move(cpp_replace_multiple( + c_result = cpp_replace_multiple( input.view(), target.view(), repl.view(), - )) + ) return Column.from_libcudf(move(c_result)) @@ -136,6 +137,7 @@ cpdef Column replace_slice( Start position where repl will be added. stop : size_type, default -1 End position (exclusive) to use for replacement. 
+ Returns ------- pylibcudf.Column @@ -151,11 +153,11 @@ cpdef Column replace_slice( cdef const string_scalar* scalar_str = (repl.c_obj.get()) with nogil: - c_result = move(cpp_replace_slice( + c_result = cpp_replace_slice( input.view(), scalar_str[0], start, stop - )) + ) return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/replace_re.pxd b/python/pylibcudf/pylibcudf/strings/replace_re.pxd new file mode 100644 index 00000000000..e27ccd55f7d --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/replace_re.pxd @@ -0,0 +1,30 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column +from pylibcudf.libcudf.types cimport size_type +from pylibcudf.scalar cimport Scalar +from pylibcudf.strings.regex_flags cimport regex_flags +from pylibcudf.strings.regex_program cimport RegexProgram + +ctypedef fused Replacement: + Column + Scalar + +ctypedef fused Patterns: + RegexProgram + list + + +cpdef Column replace_re( + Column input, + Patterns patterns, + Replacement replacement=*, + size_type max_replace_count=*, + regex_flags flags=* +) + +cpdef Column replace_with_backrefs( + Column input, + RegexProgram prog, + str replacement +) diff --git a/python/pylibcudf/pylibcudf/strings/replace_re.pyi b/python/pylibcudf/pylibcudf/strings/replace_re.pyi new file mode 100644 index 00000000000..056bafbf7ef --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/replace_re.pyi @@ -0,0 +1,27 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from typing import overload + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar +from pylibcudf.strings.regex_flags import RegexFlags +from pylibcudf.strings.regex_program import RegexProgram + +@overload +def replace_re( + input: Column, + pattern: RegexProgram, + replacement: Scalar, + max_replace_count: int = -1, +) -> Column: ... +@overload +def replace_re( + input: Column, + patterns: list[str], + replacement: Column, + max_replace_count: int = -1, + flags: RegexFlags = RegexFlags.DEFAULT, +) -> Column: ... +def replace_with_backrefs( + input: Column, prog: RegexProgram, replacement: str +) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/replace_re.pyx b/python/pylibcudf/pylibcudf/strings/replace_re.pyx new file mode 100644 index 00000000000..bdabc779ddf --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/replace_re.pyx @@ -0,0 +1,135 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from cython.operator cimport dereference +from libcpp.memory cimport unique_ptr +from libcpp.string cimport string +from libcpp.utility cimport move +from libcpp.vector cimport vector +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.scalar.scalar cimport string_scalar +from pylibcudf.libcudf.scalar.scalar_factories cimport ( + make_string_scalar as cpp_make_string_scalar, +) +from pylibcudf.libcudf.strings cimport replace_re as cpp_replace_re +from pylibcudf.libcudf.types cimport size_type +from pylibcudf.scalar cimport Scalar +from pylibcudf.strings.regex_flags cimport regex_flags +from pylibcudf.strings.regex_program cimport RegexProgram + +__all__ = ["replace_re", "replace_with_backrefs"] + +cpdef Column replace_re( + Column input, + Patterns patterns, + Replacement replacement=None, + size_type max_replace_count=-1, + regex_flags flags=regex_flags.DEFAULT, +): + """ + For each string, replaces any character sequence matching the given patterns + with the provided replacement. 
+ + For details, see :cpp:func:`cudf::strings::replace_re` + + Parameters + ---------- + input : Column + Strings instance for this operation. + patterns: RegexProgram or list[str] + If RegexProgram, the regex to match to each string. + If list[str], a list of regex strings to search within each string. + replacement : Scalar or Column + If Scalar, the string used to replace the matched sequence in each string. + ``patterns`` must be a RegexProgram. + If Column, the strings used for replacement. + ``patterns`` must be a list[str]. + max_replace_count : int + The maximum number of times to replace the matched pattern + within each string. ``patterns`` must be a RegexProgram. + Default replaces every substring that is matched. + flags : RegexFlags + Regex flags for interpreting special characters in the patterns. + ``patterns`` must be a list[str] + + Returns + ------- + Column + New strings column + """ + cdef unique_ptr[column] c_result + cdef vector[string] c_patterns + + if Patterns is RegexProgram and Replacement is Scalar: + if replacement is None: + replacement = Scalar.from_libcudf( + cpp_make_string_scalar("".encode()) + ) + with nogil: + c_result = move( + cpp_replace_re.replace_re( + input.view(), + patterns.c_obj.get()[0], + dereference((replacement.get())), + max_replace_count + ) + ) + + return Column.from_libcudf(move(c_result)) + elif Patterns is list and Replacement is Column: + c_patterns.reserve(len(patterns)) + for pattern in patterns: + c_patterns.push_back(pattern.encode()) + + with nogil: + c_result = move( + cpp_replace_re.replace_re( + input.view(), + c_patterns, + replacement.view(), + flags, + ) + ) + + return Column.from_libcudf(move(c_result)) + else: + raise TypeError("Must pass either a RegexProgram and a Scalar or a list") + + +cpdef Column replace_with_backrefs( + Column input, + RegexProgram prog, + str replacement +): + """ + For each string, replaces any character sequence matching the given regex + using the replacement template for back-references. + + For details, see :cpp:func:`cudf::strings::replace_with_backrefs` + + Parameters + ---------- + input : Column + Strings instance for this operation. + + prog: RegexProgram + Regex program instance. + + replacement : str + The replacement template for creating the output string. + + Returns + ------- + Column + New strings column. + """ + cdef unique_ptr[column] c_result + cdef string c_replacement = replacement.encode() + + with nogil: + c_result = cpp_replace_re.replace_with_backrefs( + input.view(), + prog.c_obj.get()[0], + c_replacement, + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/side_type.pxd b/python/pylibcudf/pylibcudf/strings/side_type.pxd index 34b7a580380..34b03e9bc27 100644 --- a/python/pylibcudf/pylibcudf/strings/side_type.pxd +++ b/python/pylibcudf/pylibcudf/strings/side_type.pxd @@ -1,3 +1,2 @@ # Copyright (c) 2024, NVIDIA CORPORATION. - from pylibcudf.libcudf.strings.side_type cimport side_type diff --git a/python/pylibcudf/pylibcudf/strings/side_type.pyi b/python/pylibcudf/pylibcudf/strings/side_type.pyi new file mode 100644 index 00000000000..532edd60077 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/side_type.pyi @@ -0,0 +1,7 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from enum import IntEnum + +class SideType(IntEnum): + LEFT = ... + RIGHT = ... + BOTH = ... 
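replace_re above emulates the two C++ overloads through the fused Patterns/Replacement pair, so only the RegexProgram+Scalar and list[str]+Column combinations are accepted. A sketch of the scalar overload, with the usual interop and sample-data assumptions:

import pyarrow as pa
import pylibcudf as plc

col = plc.interop.from_arrow(pa.array(["cat", "dog"]))
prog = plc.strings.regex_program.RegexProgram.create(
    "[ao]", plc.strings.regex_flags.RegexFlags.DEFAULT
)
repl = plc.interop.from_arrow(pa.scalar("_"))

# max_replace_count=-1 (the default) replaces every match: "c_t", "d_g".
out = plc.strings.replace_re.replace_re(col, prog, repl, max_replace_count=-1)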
diff --git a/python/pylibcudf/pylibcudf/strings/side_type.pyx b/python/pylibcudf/pylibcudf/strings/side_type.pyx index acdc7d6ff1f..87db4206a9c 100644 --- a/python/pylibcudf/pylibcudf/strings/side_type.pyx +++ b/python/pylibcudf/pylibcudf/strings/side_type.pyx @@ -1,4 +1,5 @@ # Copyright (c) 2024, NVIDIA CORPORATION. - from pylibcudf.libcudf.strings.side_type import \ side_type as SideType # no-cython-lint + +__all__ = ["SideType"] diff --git a/python/pylibcudf/pylibcudf/strings/slice.pyi b/python/pylibcudf/pylibcudf/strings/slice.pyi new file mode 100644 index 00000000000..7bf9a7cb8c6 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/slice.pyi @@ -0,0 +1,11 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar + +def slice_strings( + input: Column, + start: Column | Scalar | None = None, + stop: Column | Scalar | None = None, + step: Scalar | None = None, +) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/slice.pyx b/python/pylibcudf/pylibcudf/strings/slice.pyx index 70d10cab36c..d32de7c50e0 100644 --- a/python/pylibcudf/pylibcudf/strings/slice.pyx +++ b/python/pylibcudf/pylibcudf/strings/slice.pyx @@ -14,6 +14,7 @@ from pylibcudf.scalar cimport Scalar from cython.operator import dereference +__all__ = ["slice_strings"] cpdef Column slice_strings( Column input, diff --git a/python/pylibcudf/pylibcudf/strings/split/CMakeLists.txt b/python/pylibcudf/pylibcudf/strings/split/CMakeLists.txt new file mode 100644 index 00000000000..8f544f6f537 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/split/CMakeLists.txt @@ -0,0 +1,22 @@ +# ============================================================================= +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= + +set(cython_sources partition.pyx split.pyx) + +set(linked_libraries cudf::cudf) +rapids_cython_create_modules( + CXX + SOURCE_FILES "${cython_sources}" + LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_strings_ ASSOCIATED_TARGETS cudf +) diff --git a/python/custreamz/custreamz/tests/pytest.ini b/python/pylibcudf/pylibcudf/strings/split/__init__.pxd similarity index 56% rename from python/custreamz/custreamz/tests/pytest.ini rename to python/pylibcudf/pylibcudf/strings/split/__init__.pxd index 7b0a9f29fb1..72086e57d9f 100644 --- a/python/custreamz/custreamz/tests/pytest.ini +++ b/python/pylibcudf/pylibcudf/strings/split/__init__.pxd @@ -1,4 +1,2 @@ # Copyright (c) 2024, NVIDIA CORPORATION. - -[pytest] -addopts = --tb=native +from . cimport partition, split diff --git a/python/pylibcudf/pylibcudf/strings/split/__init__.py b/python/pylibcudf/pylibcudf/strings/split/__init__.py new file mode 100644 index 00000000000..db2a597882e --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/split/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from . 
import partition, split
+
+__all__ = ["partition", "split"]
diff --git a/python/pylibcudf/pylibcudf/strings/split/partition.pxd b/python/pylibcudf/pylibcudf/strings/split/partition.pxd
new file mode 100644
index 00000000000..c18257a4787
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/split/partition.pxd
@@ -0,0 +1,10 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.column cimport Column
+from pylibcudf.scalar cimport Scalar
+from pylibcudf.table cimport Table
+
+
+cpdef Table partition(Column input, Scalar delimiter=*)
+
+cpdef Table rpartition(Column input, Scalar delimiter=*)
diff --git a/python/pylibcudf/pylibcudf/strings/split/partition.pyi b/python/pylibcudf/pylibcudf/strings/split/partition.pyi
new file mode 100644
index 00000000000..f19a463bd7e
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/split/partition.pyi
@@ -0,0 +1,8 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.column import Column
+from pylibcudf.scalar import Scalar
+from pylibcudf.table import Table
+
+def partition(input: Column, delimiter: Scalar | None = None) -> Table: ...
+def rpartition(input: Column, delimiter: Scalar | None = None) -> Table: ...
diff --git a/python/pylibcudf/pylibcudf/strings/split/partition.pyx b/python/pylibcudf/pylibcudf/strings/split/partition.pyx
new file mode 100644
index 00000000000..75537ea46d3
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/split/partition.pyx
@@ -0,0 +1,92 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+from libcpp.memory cimport unique_ptr
+from libcpp.utility cimport move
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from pylibcudf.libcudf.scalar.scalar_factories cimport (
+    make_string_scalar as cpp_make_string_scalar,
+)
+from pylibcudf.libcudf.strings.split cimport partition as cpp_partition
+from pylibcudf.libcudf.table.table cimport table
+from pylibcudf.scalar cimport Scalar
+from pylibcudf.table cimport Table
+
+from cython.operator import dereference
+
+__all__ = ["partition", "rpartition"]
+
+cpdef Table partition(Column input, Scalar delimiter=None):
+    """
+    Returns a set of 3 columns by splitting each string using the
+    specified delimiter.
+
+    For details, see :cpp:func:`cudf::strings::partition`.
+
+    Parameters
+    ----------
+    input : Column
+        Strings instance for this operation
+
+    delimiter : Scalar
+        UTF-8 encoded string indicating where to split each string.
+
+    Returns
+    -------
+    Table
+        New table of strings columns
+    """
+    cdef unique_ptr[table] c_result
+
+    # Substitute the default delimiter before taking the scalar pointer;
+    # extracting it from a still-None delimiter would fail.
+    if delimiter is None:
+        delimiter = Scalar.from_libcudf(
+            cpp_make_string_scalar("".encode())
+        )
+
+    cdef const string_scalar* c_delimiter = <const string_scalar*>(
+        delimiter.c_obj.get()
+    )
+
+    with nogil:
+        c_result = cpp_partition.partition(
+            input.view(),
+            dereference(c_delimiter)
+        )
+
+    return Table.from_libcudf(move(c_result))
+
+cpdef Table rpartition(Column input, Scalar delimiter=None):
+    """
+    Returns a set of 3 columns by splitting each string using the
+    specified delimiter starting from the end of each string.
+
+    For details, see :cpp:func:`cudf::strings::rpartition`.
+
+    Parameters
+    ----------
+    input : Column
+        Strings instance for this operation
+
+    delimiter : Scalar
+        UTF-8 encoded string indicating where to split each string.
+
+    Returns
+    -------
+    Table
+        New strings columns
+    """
+    cdef unique_ptr[table] c_result
+
+    # As in partition: resolve the default delimiter before taking the pointer.
+    if delimiter is None:
+        delimiter = Scalar.from_libcudf(
+            cpp_make_string_scalar("".encode())
+        )
+
+    cdef const string_scalar* c_delimiter = <const string_scalar*>(
+        delimiter.c_obj.get()
+    )
+
+    with nogil:
+        c_result = cpp_partition.rpartition(
+            input.view(),
+            dereference(c_delimiter)
+        )
+
+    return Table.from_libcudf(move(c_result))
diff --git a/python/pylibcudf/pylibcudf/strings/split/split.pxd b/python/pylibcudf/pylibcudf/strings/split/split.pxd
new file mode 100644
index 00000000000..355a1874298
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/split/split.pxd
@@ -0,0 +1,24 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.types cimport size_type
+from pylibcudf.scalar cimport Scalar
+from pylibcudf.strings.regex_program cimport RegexProgram
+from pylibcudf.table cimport Table
+
+
+cpdef Table split(Column strings_column, Scalar delimiter, size_type maxsplit)
+
+cpdef Table rsplit(Column strings_column, Scalar delimiter, size_type maxsplit)
+
+cpdef Column split_record(Column strings, Scalar delimiter, size_type maxsplit)
+
+cpdef Column rsplit_record(Column strings, Scalar delimiter, size_type maxsplit)
+
+cpdef Table split_re(Column input, RegexProgram prog, size_type maxsplit)
+
+cpdef Table rsplit_re(Column input, RegexProgram prog, size_type maxsplit)
+
+cpdef Column split_record_re(Column input, RegexProgram prog, size_type maxsplit)
+
+cpdef Column rsplit_record_re(Column input, RegexProgram prog, size_type maxsplit)
diff --git a/python/pylibcudf/pylibcudf/strings/split/split.pyi b/python/pylibcudf/pylibcudf/strings/split/split.pyi
new file mode 100644
index 00000000000..3ccf0bc2a01
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/split/split.pyi
@@ -0,0 +1,27 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.column import Column
+from pylibcudf.scalar import Scalar
+from pylibcudf.strings.regex_program import RegexProgram
+from pylibcudf.table import Table
+
+def split(
+    strings_column: Column, delimiter: Scalar, maxsplit: int
+) -> Table: ...
+def rsplit(
+    strings_column: Column, delimiter: Scalar, maxsplit: int
+) -> Table: ...
+def split_record(
+    strings: Column, delimiter: Scalar, maxsplit: int
+) -> Column: ...
+def rsplit_record(
+    strings: Column, delimiter: Scalar, maxsplit: int
+) -> Column: ...
+def split_re(input: Column, prog: RegexProgram, maxsplit: int) -> Table: ...
+def rsplit_re(input: Column, prog: RegexProgram, maxsplit: int) -> Table: ...
+def split_record_re(
+    input: Column, prog: RegexProgram, maxsplit: int
+) -> Column: ...
+def rsplit_record_re(
+    input: Column, prog: RegexProgram, maxsplit: int
+) -> Column: ...
diff --git a/python/pylibcudf/pylibcudf/strings/split/split.pyx b/python/pylibcudf/pylibcudf/strings/split/split.pyx
new file mode 100644
index 00000000000..90087f996f0
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/split/split.pyx
@@ -0,0 +1,320 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
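A quick sketch contrasting the partition and split APIs declared above (assuming the module layout added in this diff; data is hypothetical):

    import pyarrow as pa
    import pylibcudf as plc

    col = plc.interop.from_arrow(pa.array(["a-b-c", "x-y"]))
    delim = plc.interop.from_arrow(pa.scalar("-"))

    # partition: always three columns [head, delimiter, tail] from the first match.
    parts = plc.strings.split.partition.partition(col, delim)

    # split: one column per token (null-padded), up to maxsplit splits (-1 = all).
    tokens = plc.strings.split.split.split(col, delim, -1)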
+from libcpp.memory cimport unique_ptr
+from libcpp.utility cimport move
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from pylibcudf.libcudf.strings.split cimport split as cpp_split
+from pylibcudf.libcudf.table.table cimport table
+from pylibcudf.libcudf.types cimport size_type
+from pylibcudf.scalar cimport Scalar
+from pylibcudf.strings.regex_program cimport RegexProgram
+from pylibcudf.table cimport Table
+
+from cython.operator import dereference
+
+__all__ = [
+    "rsplit",
+    "rsplit_re",
+    "rsplit_record",
+    "rsplit_record_re",
+    "split",
+    "split_re",
+    "split_record",
+    "split_record_re",
+]
+
+cpdef Table split(Column strings_column, Scalar delimiter, size_type maxsplit):
+    """
+    Returns a list of columns by splitting each string using the
+    specified delimiter.
+
+    For details, see :cpp:func:`cudf::strings::split`.
+
+    Parameters
+    ----------
+    strings_column : Column
+        Strings instance for this operation
+
+    delimiter : Scalar
+        UTF-8 encoded string indicating the split points in each string.
+
+    maxsplit : int
+        Maximum number of splits to perform. -1 indicates all possible
+        splits on each string.
+
+    Returns
+    -------
+    Table
+        New table of strings columns
+    """
+    cdef unique_ptr[table] c_result
+    cdef const string_scalar* c_delimiter = <const string_scalar*>(
+        delimiter.c_obj.get()
+    )
+
+    with nogil:
+        c_result = cpp_split.split(
+            strings_column.view(),
+            dereference(c_delimiter),
+            maxsplit,
+        )
+
+    return Table.from_libcudf(move(c_result))
+
+
+cpdef Table rsplit(Column strings_column, Scalar delimiter, size_type maxsplit):
+    """
+    Returns a list of columns by splitting each string using the
+    specified delimiter starting from the end of each string.
+
+    For details, see :cpp:func:`cudf::strings::rsplit`.
+
+    Parameters
+    ----------
+    strings_column : Column
+        Strings instance for this operation
+
+    delimiter : Scalar
+        UTF-8 encoded string indicating the split points in each string.
+
+    maxsplit : int
+        Maximum number of splits to perform. -1 indicates all possible
+        splits on each string.
+
+    Returns
+    -------
+    Table
+        New table of strings columns.
+    """
+    cdef unique_ptr[table] c_result
+    cdef const string_scalar* c_delimiter = <const string_scalar*>(
+        delimiter.c_obj.get()
+    )
+
+    with nogil:
+        c_result = cpp_split.rsplit(
+            strings_column.view(),
+            dereference(c_delimiter),
+            maxsplit,
+        )
+
+    return Table.from_libcudf(move(c_result))
+
+cpdef Column split_record(Column strings, Scalar delimiter, size_type maxsplit):
+    """
+    Splits individual strings elements into a list of strings.
+
+    For details, see :cpp:func:`cudf::strings::split_record`.
+
+    Parameters
+    ----------
+    strings : Column
+        A column of string elements to be split.
+
+    delimiter : Scalar
+        The string to identify split points in each string.
+
+    maxsplit : int
+        Maximum number of splits to perform. -1 indicates all possible
+        splits on each string.
+
+    Returns
+    -------
+    Column
+        Lists column of strings.
+    """
+    cdef unique_ptr[column] c_result
+    cdef const string_scalar* c_delimiter = <const string_scalar*>(
+        delimiter.c_obj.get()
+    )
+
+    with nogil:
+        c_result = cpp_split.split_record(
+            strings.view(),
+            dereference(c_delimiter),
+            maxsplit,
+        )
+
+    return Column.from_libcudf(move(c_result))
+
+
+cpdef Column rsplit_record(Column strings, Scalar delimiter, size_type maxsplit):
+    """
+    Splits individual strings elements into a list of strings starting
+    from the end of each string.
+
+    For details, see :cpp:func:`cudf::strings::rsplit_record`.
+
+    Parameters
+    ----------
+    strings : Column
+        A column of string elements to be split.
+
+    delimiter : Scalar
+        The string to identify split points in each string.
+
+    maxsplit : int
+        Maximum number of splits to perform. -1 indicates all possible
+        splits on each string.
+
+    Returns
+    -------
+    Column
+        Lists column of strings.
+    """
+    cdef unique_ptr[column] c_result
+    cdef const string_scalar* c_delimiter = <const string_scalar*>(
+        delimiter.c_obj.get()
+    )
+
+    with nogil:
+        c_result = cpp_split.rsplit_record(
+            strings.view(),
+            dereference(c_delimiter),
+            maxsplit,
+        )
+
+    return Column.from_libcudf(move(c_result))
+
+
+cpdef Table split_re(Column input, RegexProgram prog, size_type maxsplit):
+    """
+    Splits strings elements into a table of strings columns
+    using a regex_program's pattern to delimit each string.
+
+    For details, see :cpp:func:`cudf::strings::split_re`.
+
+    Parameters
+    ----------
+    input : Column
+        A column of string elements to be split.
+
+    prog : RegexProgram
+        Regex program instance.
+
+    maxsplit : int
+        Maximum number of splits to perform. -1 indicates all possible
+        splits on each string.
+
+    Returns
+    -------
+    Table
+        A table of columns of strings.
+    """
+    cdef unique_ptr[table] c_result
+
+    with nogil:
+        c_result = cpp_split.split_re(
+            input.view(),
+            prog.c_obj.get()[0],
+            maxsplit,
+        )
+
+    return Table.from_libcudf(move(c_result))
+
+cpdef Table rsplit_re(Column input, RegexProgram prog, size_type maxsplit):
+    """
+    Splits strings elements into a table of strings columns
+    using a regex_program's pattern to delimit each string starting from
+    the end of the string.
+
+    For details, see :cpp:func:`cudf::strings::rsplit_re`.
+
+    Parameters
+    ----------
+    input : Column
+        A column of string elements to be split.
+
+    prog : RegexProgram
+        Regex program instance.
+
+    maxsplit : int
+        Maximum number of splits to perform. -1 indicates all possible
+        splits on each string.
+
+    Returns
+    -------
+    Table
+        A table of columns of strings.
+    """
+    cdef unique_ptr[table] c_result
+
+    with nogil:
+        c_result = cpp_split.rsplit_re(
+            input.view(),
+            prog.c_obj.get()[0],
+            maxsplit,
+        )
+
+    return Table.from_libcudf(move(c_result))
+
+cpdef Column split_record_re(Column input, RegexProgram prog, size_type maxsplit):
+    """
+    Splits strings elements into a list column of strings using the given
+    regex_program to delimit each string.
+
+    For details, see :cpp:func:`cudf::strings::split_record_re`.
+
+    Parameters
+    ----------
+    input : Column
+        A column of string elements to be split.
+
+    prog : RegexProgram
+        Regex program instance.
+
+    maxsplit : int
+        Maximum number of splits to perform. -1 indicates all possible
+        splits on each string.
+
+    Returns
+    -------
+    Column
+        Lists column of strings.
+    """
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = cpp_split.split_record_re(
+            input.view(),
+            prog.c_obj.get()[0],
+            maxsplit,
+        )
+
+    return Column.from_libcudf(move(c_result))
+
+cpdef Column rsplit_record_re(Column input, RegexProgram prog, size_type maxsplit):
+    """
+    Splits strings elements into a list column of strings using the given
+    regex_program to delimit each string starting from the end of the string.
+
+    For details, see :cpp:func:`cudf::strings::rsplit_record_re`.
+
+    Parameters
+    ----------
+    input : Column
+        A column of string elements to be split.
+
+    prog : RegexProgram
+        Regex program instance.
+
+    maxsplit : int
+        Maximum number of splits to perform. -1 indicates all possible
+        splits on each string.
+
+    Returns
+    -------
+    Column
+        Lists column of strings.
+ """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_split.rsplit_record_re( + input.view(), + prog.c_obj.get()[0], + maxsplit, + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/strip.pyi b/python/pylibcudf/pylibcudf/strings/strip.pyi new file mode 100644 index 00000000000..680355fc88f --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/strip.pyi @@ -0,0 +1,11 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar +from pylibcudf.strings.side_type import SideType + +def strip( + input: Column, + side: SideType = SideType.BOTH, + to_strip: Scalar | None = None, +) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/strip.pyx b/python/pylibcudf/pylibcudf/strings/strip.pyx index 429a23c3cdf..805d959891b 100644 --- a/python/pylibcudf/pylibcudf/strings/strip.pyx +++ b/python/pylibcudf/pylibcudf/strings/strip.pyx @@ -13,6 +13,7 @@ from pylibcudf.libcudf.strings cimport strip as cpp_strip from pylibcudf.scalar cimport Scalar from pylibcudf.strings.side_type cimport side_type +__all__ = ["strip"] cpdef Column strip( Column input, diff --git a/python/pylibcudf/pylibcudf/strings/translate.pxd b/python/pylibcudf/pylibcudf/strings/translate.pxd new file mode 100644 index 00000000000..0ca746801d7 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/translate.pxd @@ -0,0 +1,14 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from pylibcudf.column cimport Column +from pylibcudf.libcudf.strings.translate cimport filter_type +from pylibcudf.scalar cimport Scalar + + +cpdef Column translate(Column input, dict chars_table) + +cpdef Column filter_characters( + Column input, + dict characters_to_filter, + filter_type keep_characters, + Scalar replacement +) diff --git a/python/pylibcudf/pylibcudf/strings/translate.pyi b/python/pylibcudf/pylibcudf/strings/translate.pyi new file mode 100644 index 00000000000..7158b6eb05c --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/translate.pyi @@ -0,0 +1,20 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from collections.abc import Mapping +from enum import IntEnum + +from pylibcudf.column import Column +from pylibcudf.scalar import Scalar + +class FilterType(IntEnum): + KEEP = ... + REMOVE = ... + +def translate( + input: Column, chars_table: Mapping[int | str, int | str] +) -> Column: ... +def filter_characters( + input: Column, + characters_to_filter: Mapping[int | str, int | str], + keep_characters: FilterType, + replacement: Scalar, +) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/translate.pyx b/python/pylibcudf/pylibcudf/strings/translate.pyx new file mode 100644 index 00000000000..ba1e8dc5d27 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/translate.pyx @@ -0,0 +1,119 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+from libcpp.memory cimport unique_ptr
+from libcpp.pair cimport pair
+from libcpp.utility cimport move
+from libcpp.vector cimport vector
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from pylibcudf.libcudf.strings cimport translate as cpp_translate
+from pylibcudf.libcudf.types cimport char_utf8
+from pylibcudf.scalar cimport Scalar
+
+from cython.operator import dereference
+from pylibcudf.libcudf.strings.translate import \
+    filter_type as FilterType  # no-cython-lint
+
+__all__ = ["FilterType", "filter_characters", "translate"]
+
+cdef vector[pair[char_utf8, char_utf8]] _table_to_c_table(dict table):
+    """
+    Convert str.maketrans table to cudf compatible table.
+    """
+    cdef int table_size = len(table)
+    cdef vector[pair[char_utf8, char_utf8]] c_table
+
+    c_table.reserve(table_size)
+    for key, value in table.items():
+        # Normalize both sides to a single code point packed into an integer:
+        # ordinals become characters first, then each character's UTF-8 bytes
+        # are packed big-endian into a char_utf8 value.
+        if isinstance(value, int):
+            value = chr(value)
+        if isinstance(value, str):
+            value = int.from_bytes(value.encode(), byteorder='big')
+        if isinstance(key, int):
+            key = chr(key)
+        if isinstance(key, str):
+            key = int.from_bytes(key.encode(), byteorder='big')
+        c_table.push_back((key, value))
+
+    return c_table
+
+
+cpdef Column translate(Column input, dict chars_table):
+    """
+    Translates individual characters within each string.
+
+    For details, see :cpp:func:`cudf::strings::translate`.
+
+    Parameters
+    ----------
+    input : Column
+        Strings instance for this operation
+
+    chars_table : dict
+        Table of UTF-8 character mappings
+
+    Returns
+    -------
+    Column
+        New column with translated strings.
+    """
+    cdef unique_ptr[column] c_result
+    cdef vector[pair[char_utf8, char_utf8]] c_chars_table = _table_to_c_table(
+        chars_table
+    )
+
+    with nogil:
+        c_result = cpp_translate.translate(
+            input.view(),
+            c_chars_table
+        )
+    return Column.from_libcudf(move(c_result))
+
+
+cpdef Column filter_characters(
+    Column input,
+    dict characters_to_filter,
+    filter_type keep_characters,
+    Scalar replacement
+):
+    """
+    Removes ranges of characters from each string in a strings column.
+
+    For details, see :cpp:func:`cudf::strings::filter_characters`.
+
+    Parameters
+    ----------
+    input : Column
+        Strings instance for this operation
+
+    characters_to_filter : dict
+        Table of character ranges to filter on
+
+    keep_characters : FilterType
+        If FilterType.KEEP, the ``characters_to_filter`` ranges are retained
+        and all other characters are removed.
+
+    replacement : Scalar
+        Replacement string for each character removed.
+
+    Returns
+    -------
+    Column
+        New column with filtered strings.
+    """
+    cdef unique_ptr[column] c_result
+    cdef vector[pair[char_utf8, char_utf8]] c_characters_to_filter = _table_to_c_table(
+        characters_to_filter
+    )
+    cdef const string_scalar* c_replacement = <const string_scalar*>(
+        replacement.c_obj.get()
+    )
+
+    with nogil:
+        c_result = cpp_translate.filter_characters(
+            input.view(),
+            c_characters_to_filter,
+            keep_characters,
+            dereference(c_replacement),
+        )
+    return Column.from_libcudf(move(c_result))
diff --git a/python/pylibcudf/pylibcudf/strings/wrap.pxd b/python/pylibcudf/pylibcudf/strings/wrap.pxd
new file mode 100644
index 00000000000..fcc86650acf
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/wrap.pxd
@@ -0,0 +1,7 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
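For orientation, a tiny sketch of the wrap API declared below (a hedged example; per the docstring the behavior is roughly textwrap-like):

    import pyarrow as pa
    import pylibcudf as plc

    col = plc.interop.from_arrow(pa.array(["the quick brown fox"]))
    wrapped = plc.strings.wrap.wrap(col, 8)
    # each row is re-flowed with '\n' replacing spaces so lines stay under the width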
+
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.types cimport size_type
+
+
+cpdef Column wrap(Column input, size_type width)
diff --git a/python/pylibcudf/pylibcudf/strings/wrap.pyi b/python/pylibcudf/pylibcudf/strings/wrap.pyi
new file mode 100644
index 00000000000..5658f279197
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/wrap.pyi
@@ -0,0 +1,5 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.column import Column
+
+def wrap(input: Column, width: int) -> Column: ...
diff --git a/python/pylibcudf/pylibcudf/strings/wrap.pyx b/python/pylibcudf/pylibcudf/strings/wrap.pyx
new file mode 100644
index 00000000000..b696eb48e47
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/wrap.pyx
@@ -0,0 +1,41 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from libcpp.memory cimport unique_ptr
+from libcpp.utility cimport move
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.strings cimport wrap as cpp_wrap
+from pylibcudf.libcudf.types cimport size_type
+
+__all__ = ["wrap"]
+
+cpdef Column wrap(Column input, size_type width):
+    """
+    Wraps strings onto multiple lines shorter than `width` by
+    replacing appropriate white space with
+    new-line characters (ASCII 0x0A).
+
+    For details, see :cpp:func:`cudf::strings::wrap`.
+
+    Parameters
+    ----------
+    input : Column
+        String column
+
+    width : int
+        Maximum character width of a line within each string
+
+    Returns
+    -------
+    Column
+        Column of wrapped strings
+    """
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = cpp_wrap.wrap(
+            input.view(),
+            width,
+        )
+
+    return Column.from_libcudf(move(c_result))
diff --git a/python/pylibcudf/pylibcudf/table.pyi b/python/pylibcudf/pylibcudf/table.pyi
new file mode 100644
index 00000000000..5aef7e009c8
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/table.pyi
@@ -0,0 +1,9 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.column import Column
+
+class Table:
+    def __init__(self, columns: list[Column]): ...
+    def num_columns(self) -> int: ...
+    def num_rows(self) -> int: ...
+    def columns(self) -> list[Column]: ...
diff --git a/python/pylibcudf/pylibcudf/table.pyx b/python/pylibcudf/pylibcudf/table.pyx
index 5f77b89a605..0c1e88a927c 100644
--- a/python/pylibcudf/pylibcudf/table.pyx
+++ b/python/pylibcudf/pylibcudf/table.pyx
@@ -10,6 +10,7 @@ from pylibcudf.libcudf.table.table cimport table

 from .column cimport Column

+__all__ = ["Table"]

 cdef class Table:
     """A list of columns of the same size.
@@ -24,6 +25,8 @@
             raise ValueError("All columns must be pylibcudf Column objects")
         self._columns = columns

+    __hash__ = None
+
     cdef table_view view(self) nogil:
         """Generate a libcudf table_view to pass to libcudf algorithms.

@@ -49,9 +52,7 @@
        calling libcudf algorithms, and should generally not be needed by
        users (even direct pylibcudf Cython users).
""" - cdef vector[unique_ptr[column]] c_columns = move( - dereference(libcudf_tbl).release() - ) + cdef vector[unique_ptr[column]] c_columns = dereference(libcudf_tbl).release() cdef vector[unique_ptr[column]].size_type i return Table([ diff --git a/python/pylibcudf/pylibcudf/tests/common/utils.py b/python/pylibcudf/pylibcudf/tests/common/utils.py index 9f389fa42c4..58c94713d09 100644 --- a/python/pylibcudf/pylibcudf/tests/common/utils.py +++ b/python/pylibcudf/pylibcudf/tests/common/utils.py @@ -7,10 +7,11 @@ import numpy as np import pyarrow as pa import pyarrow.compute as pc -import pylibcudf as plc import pytest from pyarrow.orc import write_table as orc_write_table from pyarrow.parquet import write_table as pq_write_table + +import pylibcudf as plc from pylibcudf.io.types import CompressionType @@ -384,12 +385,10 @@ def make_source(path_or_buf, pa_table, format, **kwargs): NESTED_STRUCT_TESTING_TYPE, ] +NON_NESTED_PA_TYPES = NUMERIC_PA_TYPES + STRING_PA_TYPES + BOOL_PA_TYPES + DEFAULT_PA_TYPES = ( - NUMERIC_PA_TYPES - + STRING_PA_TYPES - + BOOL_PA_TYPES - + LIST_PA_TYPES - + DEFAULT_PA_STRUCT_TESTING_TYPES + NON_NESTED_PA_TYPES + LIST_PA_TYPES + DEFAULT_PA_STRUCT_TESTING_TYPES ) # Map pylibcudf compression types to pandas ones diff --git a/python/pylibcudf/pylibcudf/tests/conftest.py b/python/pylibcudf/pylibcudf/tests/conftest.py index fdce6f353ca..36ab6798d8a 100644 --- a/python/pylibcudf/pylibcudf/tests/conftest.py +++ b/python/pylibcudf/pylibcudf/tests/conftest.py @@ -8,22 +8,38 @@ import numpy as np import pyarrow as pa -import pylibcudf as plc import pytest + +import pylibcudf as plc from pylibcudf.io.types import CompressionType sys.path.insert(0, os.path.join(os.path.dirname(__file__), "common")) -from utils import ALL_PA_TYPES, DEFAULT_PA_TYPES, NUMERIC_PA_TYPES +from utils import ( + ALL_PA_TYPES, + DEFAULT_PA_TYPES, + NON_NESTED_PA_TYPES, + NUMERIC_PA_TYPES, +) + +def _type_to_str(typ): + if isinstance(typ, pa.ListType): + return f"list[{_type_to_str(typ.value_type)}]" + elif isinstance(typ, pa.StructType): + return f"struct[{', '.join(_type_to_str(typ.field(i).type) for i in range(typ.num_fields))}]" + else: + return str(typ) -# This fixture defines the standard set of types that all tests should default to + +# This fixture defines [the standard set of types that all tests should default to # running on. If there is a need for some tests to run on a different set of types, that # type list fixture should also be defined below here if it is likely to be reused # across modules. Otherwise it may be defined on a per-module basis. @pytest.fixture( scope="session", params=DEFAULT_PA_TYPES, + ids=_type_to_str, ) def pa_type(request): return request.param @@ -68,29 +84,13 @@ def _get_vals_of_type(pa_type, length, seed): ) -# TODO: Consider adding another fixture/adapting this -# fixture to consider nullability -@pytest.fixture(scope="session", params=[0, 100]) -def table_data(request): - """ - Returns (TableWithMetadata, pa_table). - - This is the default fixture you should be using for testing - pylibcudf I/O writers. - - Contains one of each category (e.g. int, bool, list, struct) - of dtypes. 
- """ - nrows = request.param - +# TODO: Consider adapting this helper function +# to consider nullability +def _generate_table_data(types, nrows, seed=42): table_dict = {} - # Colnames in the format expected by - # plc.io.TableWithMetadata colnames = [] - seed = 42 - - for typ in ALL_PA_TYPES: + for typ in types: child_colnames = [] def _generate_nested_data(typ): @@ -140,6 +140,32 @@ def _generate_nested_data(typ): ), pa_table +@pytest.fixture(scope="session", params=[0, 100]) +def table_data(request): + """ + Returns (TableWithMetadata, pa_table). + + This is the default fixture you should be using for testing + pylibcudf I/O writers. + + Contains one of each category (e.g. int, bool, list, struct) + of dtypes. + """ + nrows = request.param + return _generate_table_data(ALL_PA_TYPES, nrows) + + +@pytest.fixture(scope="session", params=[0, 100]) +def table_data_with_non_nested_pa_types(request): + """ + Returns (TableWithMetadata, pa_table). + + This fixture is for testing with non-nested PyArrow types. + """ + nrows = request.param + return _generate_table_data(NON_NESTED_PA_TYPES, nrows) + + @pytest.fixture(params=[(0, 0), ("half", 0), (-1, "half")]) def nrows_skiprows(table_data, request): """ diff --git a/python/pylibcudf/pylibcudf/tests/io/test_avro.py b/python/pylibcudf/pylibcudf/tests/io/test_avro.py index 0cd5064a697..3d9d99ffa61 100644 --- a/python/pylibcudf/pylibcudf/tests/io/test_avro.py +++ b/python/pylibcudf/pylibcudf/tests/io/test_avro.py @@ -5,10 +5,11 @@ import fastavro import pyarrow as pa -import pylibcudf as plc import pytest from utils import assert_table_and_meta_eq +import pylibcudf as plc + avro_dtype_pairs = [ ("boolean", pa.bool_()), ("int", pa.int32()), diff --git a/python/pylibcudf/pylibcudf/tests/io/test_csv.py b/python/pylibcudf/pylibcudf/tests/io/test_csv.py index ab26f23418d..90d2d0896a5 100644 --- a/python/pylibcudf/pylibcudf/tests/io/test_csv.py +++ b/python/pylibcudf/pylibcudf/tests/io/test_csv.py @@ -5,16 +5,18 @@ import pandas as pd import pyarrow as pa -import pylibcudf as plc import pytest -from pylibcudf.io.types import CompressionType from utils import ( _convert_types, assert_table_and_meta_eq, make_source, + sink_to_str, write_source_str, ) +import pylibcudf as plc +from pylibcudf.io.types import CompressionType + # Shared kwargs to pass to make_source _COMMON_CSV_SOURCE_KWARGS = { "format": "csv", @@ -281,3 +283,87 @@ def test_read_csv_header(csv_table_data, source_or_sink, header): # list true_values = None, # list false_values = None, # bool dayfirst = False, + + +@pytest.mark.parametrize("sep", [",", "*"]) +@pytest.mark.parametrize("lineterminator", ["\n", "\n\n"]) +@pytest.mark.parametrize("header", [True, False]) +@pytest.mark.parametrize("rows_per_chunk", [8, 100]) +def test_write_csv( + table_data_with_non_nested_pa_types, + source_or_sink, + sep, + lineterminator, + header, + rows_per_chunk, +): + plc_tbl_w_meta, pa_table = table_data_with_non_nested_pa_types + sink = source_or_sink + + plc.io.csv.write_csv( + ( + plc.io.csv.CsvWriterOptions.builder( + plc.io.SinkInfo([sink]), plc_tbl_w_meta.tbl + ) + .names(plc_tbl_w_meta.column_names()) + .na_rep("") + .include_header(header) + .rows_per_chunk(rows_per_chunk) + .line_terminator(lineterminator) + .inter_column_delimiter(sep) + .true_value("True") + .false_value("False") + .build() + ) + ) + + # Convert everything to string to make comparisons easier + str_result = sink_to_str(sink) + + pd_result = pa_table.to_pandas().to_csv( + sep=sep, + lineterminator=lineterminator, + header=header, 
+ index=False, + ) + + assert str_result == pd_result + + +@pytest.mark.parametrize("na_rep", ["", "NA"]) +def test_write_csv_na_rep(na_rep): + names = ["a", "b"] + pa_tbl = pa.Table.from_arrays( + [pa.array([1.0, 2.0, None]), pa.array([True, None, False])], + names=names, + ) + plc_tbl = plc.interop.from_arrow(pa_tbl) + plc_tbl_w_meta = plc.io.types.TableWithMetadata( + plc_tbl, column_names=[(name, []) for name in names] + ) + + sink = io.StringIO() + + plc.io.csv.write_csv( + ( + plc.io.csv.CsvWriterOptions.builder( + plc.io.SinkInfo([sink]), plc_tbl_w_meta.tbl + ) + .names(plc_tbl_w_meta.column_names()) + .na_rep(na_rep) + .include_header(True) + .rows_per_chunk(8) + .line_terminator("\n") + .inter_column_delimiter(",") + .true_value("True") + .false_value("False") + .build() + ) + ) + + # Convert everything to string to make comparisons easier + str_result = sink_to_str(sink) + + pd_result = pa_tbl.to_pandas().to_csv(na_rep=na_rep, index=False) + + assert str_result == pd_result diff --git a/python/pylibcudf/pylibcudf/tests/io/test_json.py b/python/pylibcudf/pylibcudf/tests/io/test_json.py index 9d976fedf00..453e5ce32a8 100644 --- a/python/pylibcudf/pylibcudf/tests/io/test_json.py +++ b/python/pylibcudf/pylibcudf/tests/io/test_json.py @@ -3,9 +3,7 @@ import pandas as pd import pyarrow as pa -import pylibcudf as plc import pytest -from pylibcudf.io.types import CompressionType from utils import ( assert_table_and_meta_eq, make_source, @@ -13,6 +11,9 @@ write_source_str, ) +import pylibcudf as plc +from pylibcudf.io.types import CompressionType + # Shared kwargs to pass to make_source _COMMON_JSON_SOURCE_KWARGS = {"format": "json", "orient": "records"} diff --git a/python/pylibcudf/pylibcudf/tests/io/test_orc.py b/python/pylibcudf/pylibcudf/tests/io/test_orc.py index 42b14b1feff..5ed660ba6cf 100644 --- a/python/pylibcudf/pylibcudf/tests/io/test_orc.py +++ b/python/pylibcudf/pylibcudf/tests/io/test_orc.py @@ -1,9 +1,10 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import pyarrow as pa -import pylibcudf as plc import pytest from utils import _convert_types, assert_table_and_meta_eq, make_source +import pylibcudf as plc + # Shared kwargs to pass to make_source _COMMON_ORC_SOURCE_KWARGS = {"format": "orc"} diff --git a/python/pylibcudf/pylibcudf/tests/io/test_parquet.py b/python/pylibcudf/pylibcudf/tests/io/test_parquet.py index f6e843ccf66..41298601539 100644 --- a/python/pylibcudf/pylibcudf/tests/io/test_parquet.py +++ b/python/pylibcudf/pylibcudf/tests/io/test_parquet.py @@ -1,9 +1,11 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
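The CSV tests above exercise the new builder-style writer options; a condensed sketch of that pattern (mirroring the calls used in the tests, with a hypothetical in-memory sink):

    import io
    import pyarrow as pa
    import pylibcudf as plc

    sink = io.StringIO()
    tbl = plc.interop.from_arrow(pa.table({"a": [1, 2], "b": ["x", "y"]}))

    options = (
        plc.io.csv.CsvWriterOptions.builder(plc.io.SinkInfo([sink]), tbl)
        .names(["a", "b"])            # header names
        .na_rep("NA")                 # how nulls are rendered
        .include_header(True)
        .inter_column_delimiter(",")
        .build()
    )
    plc.io.csv.write_csv(options)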
import pyarrow as pa import pyarrow.compute as pc -import pylibcudf as plc import pytest from pyarrow.parquet import read_table +from utils import assert_table_and_meta_eq, make_source + +import pylibcudf as plc from pylibcudf.expressions import ( ASTOperator, ColumnNameReference, @@ -11,7 +13,6 @@ Literal, Operation, ) -from utils import assert_table_and_meta_eq, make_source # Shared kwargs to pass to make_source _COMMON_PARQUET_SOURCE_KWARGS = {"format": "parquet"} diff --git a/python/pylibcudf/pylibcudf/tests/io/test_source_sink_info.py b/python/pylibcudf/pylibcudf/tests/io/test_source_sink_info.py index 747f58ec8cf..0c43c363e55 100644 --- a/python/pylibcudf/pylibcudf/tests/io/test_source_sink_info.py +++ b/python/pylibcudf/pylibcudf/tests/io/test_source_sink_info.py @@ -2,9 +2,10 @@ import io -import pylibcudf as plc import pytest +import pylibcudf as plc + @pytest.fixture(params=[plc.io.SourceInfo, plc.io.SinkInfo]) def io_class(request): diff --git a/python/pylibcudf/pylibcudf/tests/io/test_text.py b/python/pylibcudf/pylibcudf/tests/io/test_text.py new file mode 100644 index 00000000000..f69e940e34e --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/io/test_text.py @@ -0,0 +1,29 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pyarrow as pa +import pytest +from utils import assert_column_eq + +import pylibcudf as plc + + +@pytest.mark.parametrize( + "source_func", + [ + "make_source", + "make_source_from_file", + ], +) +@pytest.mark.parametrize("options", [None, plc.io.text.ParseOptions()]) +def test_multibyte_split(source_func, options, tmp_path): + data = "x::y::z" + func = getattr(plc.io.text, source_func) + if source_func == "make_source": + source = func(data) + elif source_func == "make_source_from_file": + fle = tmp_path / "fle.txt" + fle.write_text(data) + source = func(str(fle)) + result = plc.io.text.multibyte_split(source, "::", options) + expected = pa.array(["x::", "y::", "z"]) + assert_column_eq(result, expected) diff --git a/python/pylibcudf/pylibcudf/tests/io/test_timezone.py b/python/pylibcudf/pylibcudf/tests/io/test_timezone.py index 76b0424b2af..b3555013927 100644 --- a/python/pylibcudf/pylibcudf/tests/io/test_timezone.py +++ b/python/pylibcudf/pylibcudf/tests/io/test_timezone.py @@ -1,9 +1,10 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import zoneinfo -import pylibcudf as plc import pytest +import pylibcudf as plc + def test_make_timezone_transition_table(): if len(zoneinfo.TZPATH) == 0: diff --git a/python/pylibcudf/pylibcudf/tests/pytest.ini b/python/pylibcudf/pylibcudf/tests/pytest.ini deleted file mode 100644 index f572f85ca49..00000000000 --- a/python/pylibcudf/pylibcudf/tests/pytest.ini +++ /dev/null @@ -1,9 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. 
- -[pytest] -xfail_strict = true -filterwarnings = - error - ignore:::.*xdist.* - ignore:::.*pytest.* -addopts = --tb=native diff --git a/python/pylibcudf/pylibcudf/tests/test_binaryops.py b/python/pylibcudf/pylibcudf/tests/test_binaryops.py index f784cb3c191..a33122221f6 100644 --- a/python/pylibcudf/pylibcudf/tests/test_binaryops.py +++ b/python/pylibcudf/pylibcudf/tests/test_binaryops.py @@ -4,10 +4,11 @@ import numpy as np import pyarrow as pa -import pylibcudf as plc import pytest from utils import assert_column_eq +import pylibcudf as plc + def idfn(param): ltype, rtype, outtype, plc_op, _ = param @@ -540,13 +541,6 @@ def py_shift_right_unsigned(x, y): plc.binaryop.BinaryOperator.LOGICAL_AND, pa.compute.and_, ), - ( - "int64", - "int64", - "int64", - plc.binaryop.BinaryOperator.LOGICAL_AND, - pa.compute.and_, - ), ( "int64", "int64", @@ -561,13 +555,6 @@ def py_shift_right_unsigned(x, y): plc.binaryop.BinaryOperator.LOGICAL_OR, pa.compute.or_, ), - ( - "int64", - "int64", - "int64", - plc.binaryop.BinaryOperator.LOGICAL_OR, - pa.compute.or_, - ), ( "int64", "int64", diff --git a/python/pylibcudf/pylibcudf/tests/test_column_factories.py b/python/pylibcudf/pylibcudf/tests/test_column_factories.py index 8cedbc6d42f..e1c8a8303ec 100644 --- a/python/pylibcudf/pylibcudf/tests/test_column_factories.py +++ b/python/pylibcudf/pylibcudf/tests/test_column_factories.py @@ -1,10 +1,11 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import pyarrow as pa -import pylibcudf as plc import pytest from utils import DEFAULT_STRUCT_TESTING_TYPE, assert_column_eq +import pylibcudf as plc + EMPTY_COL_SIZE = 3 NUMERIC_TYPES = [ @@ -104,7 +105,7 @@ def test_make_empty_column_dtype(pa_type): plc_type = plc.interop.from_arrow(pa_col).type() if isinstance(pa_type, (pa.ListType, pa.StructType)): - with pytest.raises(ValueError): + with pytest.raises(TypeError): plc.column_factories.make_empty_column(plc_type) return @@ -118,7 +119,7 @@ def test_make_empty_column_typeid(pa_type): tid = plc.interop.from_arrow(pa_col).type().id() if isinstance(pa_type, (pa.ListType, pa.StructType)): - with pytest.raises(ValueError): + with pytest.raises(TypeError): plc.column_factories.make_empty_column(tid) return @@ -153,7 +154,7 @@ def test_make_numeric_column(numeric_pa_type, mask_state): ) def test_make_numeric_column_dtype_err(non_numeric_pa_type): plc_type = plc.interop.from_arrow(non_numeric_pa_type) - with pytest.raises(ValueError): + with pytest.raises(TypeError): plc.column_factories.make_numeric_column( plc_type, 3, plc.types.MaskState.UNALLOCATED ) @@ -182,7 +183,7 @@ def test_make_fixed_point_column(fixed_point_pa_type, mask_state): ) def test_make_fixed_point_column_dtype_err(non_fixed_point_pa_type): plc_type = plc.interop.from_arrow(non_fixed_point_pa_type) - with pytest.raises(ValueError): + with pytest.raises(TypeError): plc.column_factories.make_fixed_point_column( plc_type, 3, plc.types.MaskState.UNALLOCATED ) @@ -210,7 +211,7 @@ def test_make_timestamp_column(timestamp_pa_type, mask_state): ) def test_make_timestamp_column_dtype_err(non_timestamp_pa_type): plc_type = plc.interop.from_arrow(non_timestamp_pa_type) - with pytest.raises(ValueError): + with pytest.raises(TypeError): plc.column_factories.make_timestamp_column( plc_type, 3, plc.types.MaskState.UNALLOCATED ) @@ -238,7 +239,7 @@ def test_make_duration_column(duration_pa_type, mask_state): ) def test_make_duration_column_dtype_err(non_duration_pa_type): plc_type = plc.interop.from_arrow(non_duration_pa_type) - with pytest.raises(ValueError): + with 
pytest.raises(TypeError): plc.column_factories.make_duration_column( plc_type, 3, plc.types.MaskState.UNALLOCATED ) diff --git a/python/pylibcudf/pylibcudf/tests/test_column_from_device.py b/python/pylibcudf/pylibcudf/tests/test_column_from_device.py index 0e129fdf0ef..24cd6b9e35f 100644 --- a/python/pylibcudf/pylibcudf/tests/test_column_from_device.py +++ b/python/pylibcudf/pylibcudf/tests/test_column_from_device.py @@ -1,12 +1,13 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import pyarrow as pa -import pylibcudf as plc import pytest from utils import assert_column_eq import rmm +import pylibcudf as plc + VALID_TYPES = [ pa.int8(), pa.int16(), diff --git a/python/pylibcudf/pylibcudf/tests/test_contiguous_split.py b/python/pylibcudf/pylibcudf/tests/test_contiguous_split.py new file mode 100644 index 00000000000..6d8b5993964 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_contiguous_split.py @@ -0,0 +1,50 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pyarrow as pa +import pytest +from utils import assert_table_eq + +import pylibcudf as plc + +param_pyarrow_tables = [ + pa.table([]), + pa.table({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}), + pa.table({"a": [1, 2, 3]}), + pa.table({"a": [1], "b": [2], "c": [3]}), + pa.table({"a": ["a", "bb", "ccc"]}), + pa.table({"a": [1, 2, None], "b": [None, 3, 4]}), + pa.table( + { + "a": [["a", "b"], ["cde"]], + "b": [ + {"alpha": [1, 2], "beta": None}, + {"alpha": [3, 4], "beta": 5}, + ], + } + ), +] + + +@pytest.mark.parametrize("arrow_tbl", param_pyarrow_tables) +def test_pack_and_unpack(arrow_tbl): + plc_tbl = plc.interop.from_arrow(arrow_tbl) + packed = plc.contiguous_split.pack(plc_tbl) + + res = plc.contiguous_split.unpack(packed) + assert_table_eq(arrow_tbl, res) + + +@pytest.mark.parametrize("arrow_tbl", param_pyarrow_tables) +def test_pack_and_unpack_from_memoryviews(arrow_tbl): + plc_tbl = plc.interop.from_arrow(arrow_tbl) + packed = plc.contiguous_split.pack(plc_tbl) + + metadata, gpudata = packed.release() + + with pytest.raises(ValueError, match="Cannot release empty"): + packed.release() + + del packed # `metadata` and `gpudata` will survive + + res = plc.contiguous_split.unpack_from_memoryviews(metadata, gpudata) + assert_table_eq(arrow_tbl, res) diff --git a/python/pylibcudf/pylibcudf/tests/test_copying.py b/python/pylibcudf/pylibcudf/tests/test_copying.py index 628682d0a66..c0a41b96b1a 100644 --- a/python/pylibcudf/pylibcudf/tests/test_copying.py +++ b/python/pylibcudf/pylibcudf/tests/test_copying.py @@ -2,7 +2,6 @@ import pyarrow as pa import pyarrow.compute as pc -import pylibcudf as plc import pytest from utils import ( DEFAULT_STRUCT_TESTING_TYPE, @@ -16,6 +15,8 @@ metadata_from_arrow_type, ) +import pylibcudf as plc + # TODO: consider moving this to conftest and "pairing" # it with pa_type, so that they don't get out of sync diff --git a/python/pylibcudf/pylibcudf/tests/test_datetime.py b/python/pylibcudf/pylibcudf/tests/test_datetime.py index 89c96829e71..f5f24ef28e2 100644 --- a/python/pylibcudf/pylibcudf/tests/test_datetime.py +++ b/python/pylibcudf/pylibcudf/tests/test_datetime.py @@ -1,26 +1,14 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
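These tests cover the enum-driven datetime component extraction; a minimal sketch of the call shape (assuming the DatetimeComponent enum exercised below; example data is hypothetical):

    import datetime
    import pyarrow as pa
    import pylibcudf as plc

    col = plc.interop.from_arrow(
        pa.array([datetime.datetime(2024, 10, 12, 1, 2, 3)], type=pa.timestamp("us"))
    )
    years = plc.datetime.extract_datetime_component(
        col, plc.datetime.DatetimeComponent.YEAR
    )
    # libcudf yields int16 values, e.g. [2024]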
+import calendar import datetime -import functools import pyarrow as pa import pyarrow.compute as pc -import pylibcudf as plc import pytest from utils import assert_column_eq - -@pytest.fixture -def date_column(has_nulls): - values = [ - datetime.date(1999, 1, 1), - datetime.date(2024, 10, 12), - datetime.date(1, 1, 1), - datetime.date(9999, 1, 1), - ] - if has_nulls: - values[2] = None - return plc.interop.from_arrow(pa.array(values, type=pa.date32())) +import pylibcudf as plc @pytest.fixture(scope="module", params=["s", "ms", "us", "ns"]) @@ -40,24 +28,186 @@ def datetime_column(has_nulls, request): ) -@pytest.mark.parametrize( - "component, pc_fun", - [ - ("year", pc.year), - ("month", pc.month), - ("day", pc.day), - ("weekday", functools.partial(pc.day_of_week, count_from_zero=False)), - ("hour", pc.hour), - ("minute", pc.minute), - ("second", pc.second), - ("millisecond", pc.millisecond), - ("microsecond", pc.microsecond), - ("nanosecond", pc.nanosecond), +@pytest.fixture( + params=[ + ("year", plc.datetime.DatetimeComponent.YEAR), + ("month", plc.datetime.DatetimeComponent.MONTH), + ("day", plc.datetime.DatetimeComponent.DAY), + ("day_of_week", plc.datetime.DatetimeComponent.WEEKDAY), + ("hour", plc.datetime.DatetimeComponent.HOUR), + ("minute", plc.datetime.DatetimeComponent.MINUTE), + ("second", plc.datetime.DatetimeComponent.SECOND), + ("millisecond", plc.datetime.DatetimeComponent.MILLISECOND), + ("microsecond", plc.datetime.DatetimeComponent.MICROSECOND), + ("nanosecond", plc.datetime.DatetimeComponent.NANOSECOND), ], + ids=lambda x: x[0], ) -def test_extraction(datetime_column, component, pc_fun): +def component(request): + return request.param + + +@pytest.fixture( + params=[ + ("day", plc.datetime.RoundingFrequency.DAY), + ("hour", plc.datetime.RoundingFrequency.HOUR), + ("minute", plc.datetime.RoundingFrequency.MINUTE), + ("second", plc.datetime.RoundingFrequency.SECOND), + ("millisecond", plc.datetime.RoundingFrequency.MILLISECOND), + ("microsecond", plc.datetime.RoundingFrequency.MICROSECOND), + ("nanosecond", plc.datetime.RoundingFrequency.NANOSECOND), + ] +) +def rounding_frequency(request): + return request.param + + +def test_extract_datetime_component(datetime_column, component): + attr, component = component + kwargs = {} + if attr == "day_of_week": + kwargs = {"count_from_zero": False} got = plc.datetime.extract_datetime_component(datetime_column, component) # libcudf produces an int16, arrow produces an int64 - expect = pc_fun(plc.interop.to_arrow(datetime_column)).cast(pa.int16()) + expect = getattr(pc, attr)( + plc.interop.to_arrow(datetime_column), **kwargs + ).cast(pa.int16()) + + assert_column_eq(expect, got) + + +@pytest.mark.parametrize( + "datetime_func", + [ + "extract_millisecond_fraction", + "extract_microsecond_fraction", + "extract_nanosecond_fraction", + ], +) +def test_datetime_extracting_functions(datetime_column, datetime_func): + pa_col = plc.interop.to_arrow(datetime_column) + got = getattr(plc.datetime, datetime_func)(datetime_column) + kwargs = {} + attr = datetime_func.split("_")[1] + if attr == "weekday": + kwargs = {"count_from_zero": False} + attr = "day_of_week" + expect = getattr(pc, attr)(pa_col, **kwargs).cast(pa.int16()) + assert_column_eq(expect, got) + + +@pytest.mark.parametrize( + "op", + [ + ("ceil_temporal", "ceil_datetimes"), + ("floor_temporal", "floor_datetimes"), + ("round_temporal", "round_datetimes"), + ], +) +def test_rounding_operations(datetime_column, op, rounding_frequency): + got = getattr(plc.datetime, 
op[1])(datetime_column, rounding_frequency[1]) + pa_col = plc.interop.to_arrow(datetime_column) + pa_got = plc.interop.to_arrow(got) + expect = getattr(pc, op[0])( + pa_col, + unit=rounding_frequency[0], + ).cast(pa_got.type) + assert_column_eq(expect, got) + + +@pytest.mark.parametrize( + "months", + [ + pa.scalar(-3, pa.int32()), + pa.scalar(1, pa.int16()), + pa.array([1, -3, 2, 4, -1, 5], pa.int32()), + ], +) +def test_calendrical_months(datetime_column, months): + def add_calendrical_months(timestamps, months): + result = [] + if isinstance(months, pa.Array): + months_list = months.to_pylist() + else: + months_list = [months.as_py()] * len(timestamps) + for i, dt in enumerate(timestamps): + if dt.as_py() is not None: + year, month = dt.as_py().year, dt.as_py().month + new_month = month + months_list[i] + new_year = year + (new_month - 1) // 12 + result.append( + dt.as_py().replace( + year=new_year, month=(new_month - 1) % 12 + 1 + ) + ) + else: + result.append(None) + return pa.array(result) + + pa_col = plc.interop.to_arrow(datetime_column) + got = plc.datetime.add_calendrical_months( + datetime_column, plc.interop.from_arrow(months) + ) + pa_got = plc.interop.to_arrow(got) + expect = add_calendrical_months(pa_col, months).cast(pa_got.type) + assert_column_eq(expect, got) + + +def test_day_of_year(datetime_column): + got = plc.datetime.day_of_year(datetime_column) + pa_got = plc.interop.to_arrow(got) + pa_col = plc.interop.to_arrow(datetime_column) + expect = pa.array( + [ + d.as_py().timetuple().tm_yday if d.as_py() is not None else None + for d in pa_col + ], + type=pa_got.type, + ) + assert_column_eq(expect, got) + + +def test_is_leap_year(datetime_column): + got = plc.datetime.is_leap_year(datetime_column) + pa_col = plc.interop.to_arrow(datetime_column) + expect = pc.is_leap_year(pa_col) + assert_column_eq(expect, got) + + +def test_last_day_of_month(datetime_column): + def last_day_of_month(dates): + return [ + d.replace(day=calendar.monthrange(d.year, d.month)[1]) + if d is not None + else d + for d in dates.to_pylist() + ] + + got = plc.datetime.last_day_of_month(datetime_column) + pa_got = plc.interop.to_arrow(got) + pa_col = plc.interop.to_arrow(datetime_column) + expect = pa.array(last_day_of_month(pa_col), type=pa_got.type) + assert_column_eq(expect, got) + + +def test_extract_quarter(datetime_column): + got = plc.datetime.extract_quarter(datetime_column) + pa_col = plc.interop.to_arrow(datetime_column) + pa_got = plc.interop.to_arrow(got) + expect = pc.quarter(pa_col).cast(pa_got.type) + assert_column_eq(expect, got) + + +def test_days_in_month(datetime_column): + def days_in_month(dates): + return [ + calendar.monthrange(d.year, d.month)[1] if d is not None else None + for d in dates.to_pylist() + ] + + got = plc.datetime.days_in_month(datetime_column) + pa_col = plc.interop.to_arrow(datetime_column) + pa_got = plc.interop.to_arrow(got) + expect = pa.array(days_in_month(pa_col), type=pa_got.type) assert_column_eq(expect, got) diff --git a/python/pylibcudf/pylibcudf/tests/test_expressions.py b/python/pylibcudf/pylibcudf/tests/test_expressions.py index 5894ef4624c..52c81c49b9d 100644 --- a/python/pylibcudf/pylibcudf/tests/test_expressions.py +++ b/python/pylibcudf/pylibcudf/tests/test_expressions.py @@ -1,10 +1,10 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
 import pyarrow as pa
-import pylibcudf as plc
+import pyarrow.compute as pc
 import pytest
+from utils import assert_column_eq

-# We can't really evaluate these expressions, so just make sure
-# construction works properly
+import pylibcudf as plc


 def test_literal_construction_invalid():
@@ -22,7 +22,7 @@
     ],
 )
 def test_columnref_construction(tableref):
-    plc.expressions.ColumnReference(1.0, tableref)
+    plc.expressions.ColumnReference(1, tableref)


 def test_columnnameref_construction():
@@ -47,3 +47,35 @@
 )
 def test_astoperation_construction(kwargs):
     plc.expressions.Operation(**kwargs)
+
+
+def test_evaluation():
+    table_h = pa.table({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]})
+    lit = pa.scalar(42, type=pa.int64())
+    table = plc.interop.from_arrow(table_h)
+    # expr = abs(b * c - (a + 42))
+    expr = plc.expressions.Operation(
+        plc.expressions.ASTOperator.ABS,
+        plc.expressions.Operation(
+            plc.expressions.ASTOperator.SUB,
+            plc.expressions.Operation(
+                plc.expressions.ASTOperator.MUL,
+                plc.expressions.ColumnReference(1),
+                plc.expressions.ColumnReference(2),
+            ),
+            plc.expressions.Operation(
+                plc.expressions.ASTOperator.ADD,
+                plc.expressions.ColumnReference(0),
+                plc.expressions.Literal(plc.interop.from_arrow(lit)),
+            ),
+        ),
+    )
+
+    expect = pc.abs(
+        pc.subtract(
+            pc.multiply(table_h["b"], table_h["c"]), pc.add(table_h["a"], lit)
+        )
+    )
+    got = plc.transform.compute_column(table, expr)
+
+    assert_column_eq(expect, got)
diff --git a/python/pylibcudf/pylibcudf/tests/test_filling.py b/python/pylibcudf/pylibcudf/tests/test_filling.py
new file mode 100644
index 00000000000..91c7e42a0a0
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/tests/test_filling.py
@@ -0,0 +1,91 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
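As a reference for the tests below: fill writes a scalar into the half-open row range [begin, end), and sequence generates init + i * step (a sketch of the assumed semantics):

    import pyarrow as pa
    import pylibcudf as plc

    col = plc.interop.from_arrow(pa.array([2, 3, 5, 7, 11]))
    five = plc.interop.from_arrow(pa.scalar(5))
    filled = plc.filling.fill(col, 1, 3, five)
    # expected: [2, 5, 5, 7, 11] -- rows 1 and 2 replaced

    seq = plc.filling.sequence(
        4,
        plc.interop.from_arrow(pa.scalar(0)),
        plc.interop.from_arrow(pa.scalar(3)),
    )
    # expected: [0, 3, 6, 9]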
+ +from datetime import datetime + +import pyarrow as pa +import pytest +from utils import assert_column_eq, assert_table_eq + +import pylibcudf as plc + + +@pytest.fixture +def pa_col(): + return pa.array([2, 3, 5, 7, 11]) + + +@pytest.fixture +def pa_table(): + pa_col = pa.array([1, 2, 3]) + return pa.table([pa_col], names=["a"]) + + +def test_fill(pa_col): + result = plc.filling.fill( + plc.interop.from_arrow(pa_col), + 1, + 3, + plc.interop.from_arrow(pa.scalar(5)), + ) + expect = pa.array([2, 5, 5, 7, 11]) + assert_column_eq(result, expect) + + +def test_fill_in_place(pa_col): + result = plc.interop.from_arrow(pa_col) + plc.filling.fill_in_place( + result, + 1, + 3, + plc.interop.from_arrow(pa.scalar(5)), + ) + expect = pa.array([2, 5, 5, 7, 11]) + assert_column_eq(result, expect) + + +def test_sequence(): + size = 5 + init_scalar = plc.interop.from_arrow(pa.scalar(10)) + step_scalar = plc.interop.from_arrow(pa.scalar(2)) + result = plc.filling.sequence( + size, + init_scalar, + step_scalar, + ) + expect = pa.array([10, 12, 14, 16, 18]) + assert_column_eq(result, expect) + + +def test_repeat_with_count_int(pa_table): + input_table = plc.interop.from_arrow(pa_table) + count = 2 + result = plc.filling.repeat(input_table, count) + expect = pa.table([[1, 1, 2, 2, 3, 3]], names=["a"]) + assert_table_eq(expect, result) + + +def test_repeat_with_count_column(pa_table): + input_table = plc.interop.from_arrow(pa_table) + count = plc.interop.from_arrow(pa.array([1, 2, 3])) + result = plc.filling.repeat(input_table, count) + expect = pa.table([[1] + [2] * 2 + [3] * 3], names=["a"]) + assert_table_eq(expect, result) + + +def test_calendrical_month_sequence(): + n = 5 + init_date = datetime(2020, 1, 31) + init = plc.interop.from_arrow( + pa.scalar(init_date, type=pa.timestamp("ms")) + ) + months = 1 + result = plc.filling.calendrical_month_sequence(n, init, months) + expected_dates = [ + datetime(2020, 1, 31), + datetime(2020, 2, 29), + datetime(2020, 3, 31), + datetime(2020, 4, 30), + datetime(2020, 5, 31), + ] + expect = pa.array(expected_dates, type=pa.timestamp("ms")) + assert_column_eq(result, expect) diff --git a/python/pylibcudf/pylibcudf/tests/test_hashing.py b/python/pylibcudf/pylibcudf/tests/test_hashing.py new file mode 100644 index 00000000000..83fb50fa4ef --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_hashing.py @@ -0,0 +1,269 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
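The reference hashers at the top of this file serialize each scalar to the little-endian byte layout libcudf hashes on device, then merge per-column hashes with a Boost-style combiner; two quick byte-layout checks (assumptions consistent with scalar_to_binary below):

    import struct

    # float64 1.0 hashes over these 8 little-endian bytes
    assert struct.pack("<d", 1.0) == b"\x00\x00\x00\x00\x00\x00\xf0?"

    # int64 -2: little-endian two's complement
    assert (-2).to_bytes(8, byteorder="little", signed=True).hex() == "feffffffffffffff"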
+
+import hashlib
+import struct
+
+import mmh3
+import numpy as np
+import pyarrow as pa
+import pytest
+import xxhash
+from utils import assert_column_eq, assert_table_eq
+
+import pylibcudf as plc
+
+SEED = 0
+METHODS = ["md5", "sha1", "sha224", "sha256", "sha384", "sha512"]
+
+
+def scalar_to_binary(x):
+    if isinstance(x, str):
+        return x.encode()
+    elif isinstance(x, float):
+        # little-endian float64, matching the device representation
+        return struct.pack("<d", x)
+    elif isinstance(x, bool):
+        return x.to_bytes(1, byteorder="little", signed=False)
+    elif isinstance(x, int):
+        return x.to_bytes(8, byteorder="little", signed=True)
+    else:
+        raise NotImplementedError
+
+
+def hash_single_uint32(val, seed=0):
+    return mmh3.hash(np.uint32(val).tobytes(), seed=seed, signed=False)
+
+
+def hash_combine_32(lhs, rhs):
+    # Boost-style hash_combine specialized for 32-bit values
+    return np.uint32(
+        lhs ^ (rhs + np.uint32(0x9E3779B9) + (lhs << 6) + (lhs >> 2))
+    )
+
+
+def uint_hash_combine_32(lhs, rhs):
+    return hash_combine_32(np.uint32(lhs), np.uint32(rhs))
+
+
+def libcudf_mmh3_x86_32(binary):
+    seed = plc.hashing.LIBCUDF_DEFAULT_HASH_SEED
+    hashval = mmh3.hash(binary, seed)
+    return hash_combine_32(seed, hashval)
+
+
+@pytest.fixture(params=[pa.int64(), pa.float64(), pa.string(), pa.bool_()])
+def scalar_type(request):
+    return request.param
+
+
+@pytest.fixture
+def pa_scalar_input_column(scalar_type):
+    if pa.types.is_integer(scalar_type) or pa.types.is_floating(scalar_type):
+        return pa.array([1, 2, 3], type=scalar_type)
+    elif pa.types.is_string(scalar_type):
+        return pa.array(["a", "b", "c"], type=scalar_type)
+    elif pa.types.is_boolean(scalar_type):
+        return pa.array([True, True, False], type=scalar_type)
+
+
+@pytest.fixture
+def plc_scalar_input_tbl(pa_scalar_input_column):
+    return plc.interop.from_arrow(
+        pa.Table.from_arrays([pa_scalar_input_column], names=["data"])
+    )
+
+
+@pytest.fixture(scope="module")
+def list_struct_table():
+    data = pa.Table.from_pydict(
+        {
+            "list": [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
+            "struct": [{"a": 1, "b": 2}, {"a": 3, "b": 4}, {"a": 5, "b": 6}],
+        }
+    )
+    return data
+
+
+def python_hash_value(x, method):
+    if method == "murmurhash3_x86_32":
+        return libcudf_mmh3_x86_32(x)
+    elif method == "murmurhash3_x64_128":
+        hasher = mmh3.mmh3_x64_128(seed=plc.hashing.LIBCUDF_DEFAULT_HASH_SEED)
+        hasher.update(x)
+        # libcudf returns a tuple of two 64-bit integers
+        return hasher.utupledigest()
+    elif method == "xxhash_64":
+        return xxhash.xxh64(
+            x, seed=plc.hashing.LIBCUDF_DEFAULT_HASH_SEED
+        ).intdigest()
+    else:
+        return getattr(hashlib, method)(x).hexdigest()
+
+
+@pytest.mark.parametrize(
+    "method", ["sha1", "sha224", "sha256", "sha384", "sha512", "md5"]
+)
+def test_hash_column_sha_md5(
+    pa_scalar_input_column, plc_scalar_input_tbl, method
+):
+    plc_hasher = getattr(plc.hashing, method)
+
+    def py_hasher(val):
+        return getattr(hashlib, method)(scalar_to_binary(val)).hexdigest()
+
+    expect = pa.array(
+        [py_hasher(val) for val in pa_scalar_input_column.to_pylist()],
+        type=pa.string(),
+    )
+    got = plc_hasher(plc_scalar_input_tbl)
+    assert_column_eq(got, expect)
+
+
+def test_hash_column_xxhash64(pa_scalar_input_column, plc_scalar_input_tbl):
+    def py_hasher(val):
+        return xxhash.xxh64(
+            scalar_to_binary(val), seed=plc.hashing.LIBCUDF_DEFAULT_HASH_SEED
+        ).intdigest()
+
+    expect = pa.array(
+        [py_hasher(val) for val in pa_scalar_input_column.to_pylist()],
+        type=pa.uint64(),
+    )
+    got = plc.hashing.xxhash_64(plc_scalar_input_tbl, 0)
+
+    assert_column_eq(got, expect)
+
+
+@pytest.mark.parametrize(
+    "method", ["sha1", "sha224", "sha256", "sha384", "sha512"]
+)
+@pytest.mark.parametrize("dtype", ["list", "struct"])
+def test_sha_list_struct_err(list_struct_table, dtype, method):
+    err_types = list_struct_table.select([dtype])
+    plc_tbl = plc.interop.from_arrow(err_types)
+    plc_hasher = getattr(plc.hashing, method)
+
+    with pytest.raises(TypeError):
+        plc_hasher(plc_tbl)
+
+
+def test_md5_struct_err(list_struct_table):
+    err_types = list_struct_table.select(["struct"])
+    plc_tbl = plc.interop.from_arrow(err_types)
+
+    with pytest.raises(TypeError):
+        plc.hashing.md5(plc_tbl)
+
+
+def test_murmurhash3_x86_32(pa_scalar_input_column, plc_scalar_input_tbl):
+    def py_hasher(val):
+        return libcudf_mmh3_x86_32(scalar_to_binary(val))
+
+    expect = pa.array(
+        [py_hasher(val) for val in pa_scalar_input_column.to_pylist()],
+        type=pa.uint32(),
+    )
+    got = plc.hashing.murmurhash3_x86_32(plc_scalar_input_tbl, 0)
+    assert_column_eq(got, expect)
+
+
+@pytest.mark.filterwarnings("ignore::RuntimeWarning")
+def test_murmurhash3_x86_32_list():
+    pa_tbl = pa.Table.from_pydict(
+        {
+            "list": pa.array(
+                [[1, 2, 3], [4, 5, 6], [7, 8, 9]], type=pa.list_(pa.uint32())
+            )
+        }
+    )
+    plc_tbl = plc.interop.from_arrow(pa_tbl)
+
+    def hash_list(list_):
+        hash_value = uint_hash_combine_32(0, hash_single_uint32(len(list_)))
+
+        for element in list_:
+            hash_value = uint_hash_combine_32(
+                hash_value,
+                hash_single_uint32(
+                    element, seed=plc.hashing.LIBCUDF_DEFAULT_HASH_SEED
+                ),
+            )
+
+        final = uint_hash_combine_32(
+            plc.hashing.LIBCUDF_DEFAULT_HASH_SEED, hash_value
+        )
+        return final
+
+    expect = pa.array(
+        [hash_list(val) for val in pa_tbl["list"].to_pylist()],
+        type=pa.uint32(),
+    )
+    got = plc.hashing.murmurhash3_x86_32(
+        plc_tbl, plc.hashing.LIBCUDF_DEFAULT_HASH_SEED
+    )
+    assert_column_eq(got, expect)
+
+
+@pytest.mark.filterwarnings("ignore::RuntimeWarning")
+def test_murmurhash3_x86_32_struct():
+    pa_tbl = pa.table(
+        {
+            "struct": pa.array(
+                [
+                    {"a": 1, "b": 2, "c": 3},
+                    {"a": 4, "b": 5, "c": 6},
+                    {"a": 7, "b": 8, "c": 9},
+                ],
+                type=pa.struct(
+                    [
+                        pa.field("a", pa.uint32()),
+                        pa.field("b", pa.uint32()),
+                        pa.field("c", pa.uint32()),
+                    ]
+                ),
+            )
+        }
+    )
+    plc_tbl = plc.interop.from_arrow(pa_tbl)
+
+    def hash_struct(s):
+        seed = plc.hashing.LIBCUDF_DEFAULT_HASH_SEED
+        keys = list(s.keys())
+
+        combined_hash = hash_single_uint32(s[keys[0]], seed=seed)
+        combined_hash = uint_hash_combine_32(0, combined_hash)
+        combined_hash = uint_hash_combine_32(seed, combined_hash)
+
+        for key in keys[1:]:
+            current_hash = hash_single_uint32(s[key], seed=seed)
+            combined_hash = uint_hash_combine_32(combined_hash, current_hash)
+
+        return combined_hash
+
+    got = plc.hashing.murmurhash3_x86_32(
+        plc_tbl, plc.hashing.LIBCUDF_DEFAULT_HASH_SEED
+    )
+
+    expect = pa.array(
+        [hash_struct(val) for val in pa_tbl["struct"].to_pylist()],
+        type=pa.uint32(),
+    )
+    assert_column_eq(got, expect)
+
+
+def test_murmurhash3_x64_128(pa_scalar_input_column, plc_scalar_input_tbl):
+    def py_hasher(val):
+        hasher = mmh3.mmh3_x64_128(seed=plc.hashing.LIBCUDF_DEFAULT_HASH_SEED)
+        hasher.update(val)
+        return hasher.utupledigest()
+
+    tuples = [
+        py_hasher(scalar_to_binary(val))
+        for val in pa_scalar_input_column.to_pylist()
+    ]
+    expect = pa.Table.from_arrays(
+        [
+            pa.array([np.uint64(t[0]) for t in tuples]),
+            pa.array([np.uint64(t[1]) for t in tuples]),
+        ],
+        names=["0", "1"],
+    )
+    got = plc.hashing.murmurhash3_x64_128(plc_scalar_input_tbl, 0)
+
+    assert_table_eq(expect, got)
diff --git a/python/pylibcudf/pylibcudf/tests/test_interop.py b/python/pylibcudf/pylibcudf/tests/test_interop.py
index 01c998f16d4..af80b6e5978 100644
--- a/python/pylibcudf/pylibcudf/tests/test_interop.py
+++ b/python/pylibcudf/pylibcudf/tests/test_interop.py
@@ -1,8 +1,12 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
+import cupy as cp +import numpy as np import pyarrow as pa -import pylibcudf as plc import pytest +from utils import assert_table_eq + +import pylibcudf as plc def test_list_dtype_roundtrip(): @@ -66,3 +70,31 @@ def test_decimal_other(data_type): arrow_type = plc.interop.to_arrow(data_type, precision=precision) assert arrow_type == pa.decimal128(precision, 0) + + +def test_round_trip_dlpack_plc_table(): + expected = pa.table({"a": [1, 2, 3], "b": [5, 6, 7]}) + plc_table = plc.interop.from_arrow(expected) + result = plc.interop.from_dlpack(plc.interop.to_dlpack(plc_table)) + assert_table_eq(expected, result) + + +@pytest.mark.parametrize("array", [np.array, cp.array]) +def test_round_trip_dlpack_array(array): + arr = array([1, 2, 3]) + result = plc.interop.from_dlpack(arr.__dlpack__()) + expected = pa.table({"a": [1, 2, 3]}) + assert_table_eq(expected, result) + + +def test_to_dlpack_error(): + plc_table = plc.interop.from_arrow( + pa.table({"a": [1, None, 3], "b": [5, 6, 7]}) + ) + with pytest.raises(ValueError, match="Cannot create a DLPack tensor"): + plc.interop.from_dlpack(plc.interop.to_dlpack(plc_table)) + + +def test_from_dlpack_error(): + with pytest.raises(ValueError, match="Invalid PyCapsule object"): + plc.interop.from_dlpack(1) diff --git a/python/pylibcudf/pylibcudf/tests/test_join.py b/python/pylibcudf/pylibcudf/tests/test_join.py index 61e02f4d28d..56cf421780b 100644 --- a/python/pylibcudf/pylibcudf/tests/test_join.py +++ b/python/pylibcudf/pylibcudf/tests/test_join.py @@ -2,16 +2,45 @@ import numpy as np import pyarrow as pa -import pylibcudf as plc +import pytest from utils import assert_table_eq +import pylibcudf as plc + + +@pytest.fixture +def left(): + return pa.Table.from_arrays( + [[0, 1, 2, 100], [3, 4, 5, None]], + schema=pa.schema({"a": pa.int32(), "b": pa.int32()}), + ) + -def test_cross_join(): - left = pa.Table.from_arrays([[0, 1, 2], [3, 4, 5]], names=["a", "b"]) - right = pa.Table.from_arrays( - [[6, 7, 8, 9], [10, 11, 12, 13]], names=["c", "d"] +@pytest.fixture +def right(): + return pa.Table.from_arrays( + [[-1, -2, 0, 1, -3], [10, 3, 4, 5, None]], + schema=pa.schema({"c": pa.int32(), "d": pa.int32()}), ) + +@pytest.fixture +def expr(): + return plc.expressions.Operation( + plc.expressions.ASTOperator.LESS, + plc.expressions.ColumnReference( + 0, plc.expressions.TableReference.LEFT + ), + plc.expressions.ColumnReference( + 0, plc.expressions.TableReference.RIGHT + ), + ) + + +def test_cross_join(left, right): + # Remove the nulls so the calculation of the expected result works + left = left[:-1] + right = right[:-1] pleft = plc.interop.from_arrow(left) pright = plc.interop.from_arrow(right) @@ -26,3 +55,121 @@ def test_cross_join(): got = plc.join.cross_join(pleft, pright) assert_table_eq(expect, got) + + +sentinel = np.iinfo(np.int32).min + + +@pytest.mark.parametrize( + "join_type,expect_left,expect_right", + [ + (plc.join.conditional_inner_join, {0}, {3}), + (plc.join.conditional_left_join, {0, 1, 2, 3}, {3, sentinel}), + ( + plc.join.conditional_full_join, + {0, 1, 2, 3, sentinel}, + {0, 1, 2, 3, 4, sentinel}, + ), + ], + ids=["inner", "left", "full"], +) +def test_conditional_join( + left, right, expr, join_type, expect_left, expect_right +): + pleft = plc.interop.from_arrow(left) + pright = plc.interop.from_arrow(right) + + g_left, g_right = map(plc.interop.to_arrow, join_type(pleft, pright, expr)) + + assert set(g_left.to_pylist()) == expect_left + assert set(g_right.to_pylist()) == expect_right + + +@pytest.mark.parametrize( + "join_type,expect", + 
[ + (plc.join.conditional_left_semi_join, {0}), + (plc.join.conditional_left_anti_join, {1, 2, 3}), + ], + ids=["semi", "anti"], +) +def test_conditional_semianti_join(left, right, expr, join_type, expect): + pleft = plc.interop.from_arrow(left) + pright = plc.interop.from_arrow(right) + + g_left = plc.interop.to_arrow(join_type(pleft, pright, expr)) + + assert set(g_left.to_pylist()) == expect + + +@pytest.mark.parametrize( + "join_type,expect_left,expect_right", + [ + (plc.join.mixed_inner_join, set(), set()), + (plc.join.mixed_left_join, {0, 1, 2, 3}, {sentinel}), + ( + plc.join.mixed_full_join, + {0, 1, 2, 3, sentinel}, + {0, 1, 2, 3, 4, sentinel}, + ), + ], + ids=["inner", "left", "full"], +) +@pytest.mark.parametrize( + "null_equality", + [plc.types.NullEquality.EQUAL, plc.types.NullEquality.UNEQUAL], + ids=["nulls_equal", "nulls_not_equal"], +) +def test_mixed_join( + left, right, expr, join_type, expect_left, expect_right, null_equality +): + pleft = plc.interop.from_arrow(left) + pright = plc.interop.from_arrow(right) + + g_left, g_right = map( + plc.interop.to_arrow, + join_type( + plc.Table(pleft.columns()[1:]), + plc.Table(pright.columns()[1:]), + pleft, + pright, + expr, + null_equality, + ), + ) + + assert set(g_left.to_pylist()) == expect_left + assert set(g_right.to_pylist()) == expect_right + + +@pytest.mark.parametrize( + "join_type,expect", + [ + (plc.join.mixed_left_semi_join, set()), + (plc.join.mixed_left_anti_join, {0, 1, 2, 3}), + ], + ids=["semi", "anti"], +) +@pytest.mark.parametrize( + "null_equality", + [plc.types.NullEquality.EQUAL, plc.types.NullEquality.UNEQUAL], + ids=["nulls_equal", "nulls_not_equal"], +) +def test_mixed_semianti_join( + left, right, expr, join_type, expect, null_equality +): + pleft = plc.interop.from_arrow(left) + pright = plc.interop.from_arrow(right) + + g_left = plc.interop.to_arrow( + join_type( + plc.Table(pleft.columns()[1:]), + plc.Table(pright.columns()[1:]), + pleft, + pright, + expr, + null_equality, + ) + ) + + assert set(g_left.to_pylist()) == expect diff --git a/python/pylibcudf/pylibcudf/tests/test_json.py b/python/pylibcudf/pylibcudf/tests/test_json.py new file mode 100644 index 00000000000..486a9524e92 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_json.py @@ -0,0 +1,43 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
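+# get_json_object applies a JSONPath expression to every row of a strings
+# column; the "$.foo.bar" path below selects the "bar" list nested under
+# "foo", and null input rows remain null in the output.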
+
+import pyarrow as pa
+import pytest
+from utils import assert_column_eq
+
+import pylibcudf as plc
+
+
+@pytest.fixture(scope="module")
+def plc_col():
+    arr = pa.array(
+        ['{"foo": {"bar": [{"a": 1, "b": 2}, {"a": 3, "b": 4}]}}', None]
+    )
+    return plc.interop.from_arrow(arr)
+
+
+@pytest.fixture(scope="module")
+def json_path():
+    slr = pa.scalar("$.foo.bar")
+    return plc.interop.from_arrow(slr)
+
+
+@pytest.mark.parametrize("allow_single_quotes", [True, False])
+@pytest.mark.parametrize("strip_quotes_from_single_strings", [True, False])
+@pytest.mark.parametrize("missing_fields_as_nulls", [True, False])
+def test_get_json_object(
+    plc_col,
+    json_path,
+    allow_single_quotes,
+    strip_quotes_from_single_strings,
+    missing_fields_as_nulls,
+):
+    result = plc.json.get_json_object(
+        plc_col,
+        json_path,
+        plc.json.GetJsonObjectOptions(
+            allow_single_quotes=allow_single_quotes,
+            strip_quotes_from_single_strings=strip_quotes_from_single_strings,
+            missing_fields_as_nulls=missing_fields_as_nulls,
+        ),
+    )
+    expected = pa.array(['[{"a": 1, "b": 2}, {"a": 3, "b": 4}]', None])
+    assert_column_eq(result, expected)
diff --git a/python/pylibcudf/pylibcudf/tests/test_labeling.py b/python/pylibcudf/pylibcudf/tests/test_labeling.py
index f7fb7463b50..946d583d1cc 100644
--- a/python/pylibcudf/pylibcudf/tests/test_labeling.py
+++ b/python/pylibcudf/pylibcudf/tests/test_labeling.py
@@ -1,12 +1,17 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
 import pyarrow as pa
-import pylibcudf as plc
 import pytest
 
+import pylibcudf as plc
+
 
-@pytest.mark.parametrize("left_inclusive", [True, False])
-@pytest.mark.parametrize("right_inclusive", [True, False])
+@pytest.mark.parametrize(
+    "left_inclusive", [plc.labeling.Inclusive.YES, plc.labeling.Inclusive.NO]
+)
+@pytest.mark.parametrize(
+    "right_inclusive", [plc.labeling.Inclusive.YES, plc.labeling.Inclusive.NO]
+)
 def test_label_bins(left_inclusive, right_inclusive):
     in_col = plc.interop.from_arrow(pa.array([1, 2, 3]))
     left_edges = plc.interop.from_arrow(pa.array([0, 5]))
diff --git a/python/pylibcudf/pylibcudf/tests/test_lists.py b/python/pylibcudf/pylibcudf/tests/test_lists.py
index 2353a6ff8f9..8c1229c2a04 100644
--- a/python/pylibcudf/pylibcudf/tests/test_lists.py
+++ b/python/pylibcudf/pylibcudf/tests/test_lists.py
@@ -3,10 +3,11 @@
 import numpy as np
 import pyarrow as pa
 import pyarrow.compute as pc
-import pylibcudf as plc
 import pytest
 from utils import assert_column_eq
 
+import pylibcudf as plc
+
 
 @pytest.fixture
 def test_data():
@@ -61,12 +62,12 @@
     [
         (
             [[[1, 2], [3, 4], [5]], [[6], None, [7, 8, 9]]],
-            False,
+            plc.lists.ConcatenateNullPolicy.NULLIFY_OUTPUT_ROW,
             [[1, 2, 3, 4, 5], None],
         ),
         (
             [[[1, 2], [3, 4], [5, None]], [[6], [None], [7, 8, 9]]],
-            True,
+            plc.lists.ConcatenateNullPolicy.IGNORE,
             [[1, 2, 3, 4, 5, None], [6, None, 7, 8, 9]],
         ),
     ],
@@ -137,7 +138,9 @@ def test_index_of_scalar(list_column, scalar):
     plc_column = plc.interop.from_arrow(arr)
     plc_scalar = plc.interop.from_arrow(scalar)
-    res = plc.lists.index_of(plc_column, plc_scalar, True)
+    res = plc.lists.index_of(
+        plc_column, plc_scalar, plc.lists.DuplicateFindOption.FIND_FIRST
+    )
 
     expect = pa.array([1, -1, -1, -1], type=pa.int32())
 
@@ -149,7 +152,9 @@ def test_index_of_list_column(list_column, search_key_column):
     arr2, expect = search_key_column
     plc_column1 = plc.interop.from_arrow(arr1)
     plc_column2 = plc.interop.from_arrow(arr2)
-    res = plc.lists.index_of(plc_column1, plc_column2, True)
+    res = plc.lists.index_of(
+        plc_column1, plc_column2,
plc.lists.DuplicateFindOption.FIND_FIRST + ) expect = pa.array(search_key_column[1], type=pa.int32()) @@ -226,39 +231,34 @@ def test_sequences(): @pytest.mark.parametrize( - "ascending,na_position,expected", + "order,na_position,expected", [ ( - True, + plc.types.Order.ASCENDING, plc.types.NullOrder.BEFORE, [[1, 2, 3, 4], [None, 1, 2, 4], [-10, 0, 10, 10]], ), ( - True, + plc.types.Order.ASCENDING, plc.types.NullOrder.AFTER, [[1, 2, 3, 4], [1, 2, 4, None], [-10, 0, 10, 10]], ), ( - False, + plc.types.Order.DESCENDING, plc.types.NullOrder.BEFORE, [[4, 3, 2, 1], [4, 2, 1, None], [10, 10, 0, -10]], ), ( - False, - plc.types.NullOrder.AFTER, - [[4, 3, 2, 1], [None, 4, 2, 1], [10, 10, 0, -10]], - ), - ( - False, + plc.types.Order.DESCENDING, plc.types.NullOrder.AFTER, [[4, 3, 2, 1], [None, 4, 2, 1], [10, 10, 0, -10]], ), ], ) -def test_sort_lists(lists_column, ascending, na_position, expected): +def test_sort_lists(lists_column, order, na_position, expected): plc_column = plc.interop.from_arrow(pa.array(lists_column)) - res = plc.lists.sort_lists(plc_column, ascending, na_position, False) - res_stable = plc.lists.sort_lists(plc_column, ascending, na_position, True) + res = plc.lists.sort_lists(plc_column, order, na_position, False) + res_stable = plc.lists.sort_lists(plc_column, order, na_position, True) expect = pa.array(expected) @@ -271,44 +271,44 @@ def test_sort_lists(lists_column, ascending, na_position, expected): [ ( plc.lists.difference_distinct, - True, - True, + plc.types.NanEquality.ALL_EQUAL, + plc.types.NullEquality.EQUAL, [[], [1, 2, 3], None, [4, 5]], ), ( plc.lists.difference_distinct, - False, - True, + plc.types.NanEquality.UNEQUAL, + plc.types.NullEquality.EQUAL, [[], [1, 2, 3], None, [4, None, 5]], ), ( plc.lists.have_overlap, - True, - True, + plc.types.NanEquality.ALL_EQUAL, + plc.types.NullEquality.EQUAL, [True, False, None, True], ), ( plc.lists.have_overlap, - False, - False, + plc.types.NanEquality.UNEQUAL, + plc.types.NullEquality.UNEQUAL, [True, False, None, False], ), ( plc.lists.intersect_distinct, - True, - True, + plc.types.NanEquality.ALL_EQUAL, + plc.types.NullEquality.EQUAL, [[np.nan, 1, 2], [], None, [None]], ), ( plc.lists.intersect_distinct, - True, - False, + plc.types.NanEquality.ALL_EQUAL, + plc.types.NullEquality.UNEQUAL, [[1, 2], [], None, [None]], ), ( plc.lists.union_distinct, - False, - True, + plc.types.NanEquality.UNEQUAL, + plc.types.NullEquality.EQUAL, [ [np.nan, 2, 1, 3], [1, 2, 3, 4, 5], @@ -318,8 +318,8 @@ def test_sort_lists(lists_column, ascending, na_position, expected): ), ( plc.lists.union_distinct, - False, - False, + plc.types.NanEquality.UNEQUAL, + plc.types.NullEquality.UNEQUAL, [ [np.nan, np.nan, 2, 1, np.nan, 3], [1, 2, 3, 4, 5], @@ -351,20 +351,24 @@ def test_set_operations( @pytest.mark.parametrize( "nans_equal,nulls_equal,expected", [ - (True, True, [[np.nan, 0, 1, 2, 3], [3, 1, 2], None, [4, None, 5]]), ( - False, - True, + plc.types.NanEquality.ALL_EQUAL, + plc.types.NullEquality.EQUAL, + [[np.nan, 0, 1, 2, 3], [3, 1, 2], None, [4, None, 5]], + ), + ( + plc.types.NanEquality.UNEQUAL, + plc.types.NullEquality.EQUAL, [[np.nan, 0, 1, 2, 3], [3, 1, 2], None, [4, None, None, 5]], ), ( - True, - False, + plc.types.NanEquality.ALL_EQUAL, + plc.types.NullEquality.UNEQUAL, [[np.nan, np.nan, 0, 1, 2, 3], [3, 1, 2], None, [4, None, 5]], ), ( - False, - False, + plc.types.NanEquality.UNEQUAL, + plc.types.NullEquality.UNEQUAL, [ [np.nan, np.nan, 0, 1, 2, 3], [3, 1, 2], diff --git a/python/pylibcudf/pylibcudf/tests/test_null_mask.py 
b/python/pylibcudf/pylibcudf/tests/test_null_mask.py index 3edcae59edc..cd3da856de2 100644 --- a/python/pylibcudf/pylibcudf/tests/test_null_mask.py +++ b/python/pylibcudf/pylibcudf/tests/test_null_mask.py @@ -1,12 +1,13 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import pyarrow as pa -import pylibcudf as plc import pytest -from pylibcudf.null_mask import MaskState import rmm +import pylibcudf as plc +from pylibcudf.null_mask import MaskState + @pytest.fixture(params=[False, True]) def nullable(request): diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_byte_pair_encode.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_byte_pair_encode.py new file mode 100644 index 00000000000..7d6718a959b --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_byte_pair_encode.py @@ -0,0 +1,46 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pyarrow as pa +import pytest +from utils import assert_column_eq + +import pylibcudf as plc + + +@pytest.fixture(scope="module") +def input_col(): + return pa.array( + [ + "e n", + "i t", + "i s", + "e s", + "en t", + "c e", + "es t", + "en ce", + "t est", + "s ent", + ] + ) + + +@pytest.mark.parametrize( + "separator", [None, plc.interop.from_arrow(pa.scalar("e"))] +) +def test_byte_pair_encoding(input_col, separator): + plc_col = plc.interop.from_arrow( + pa.array(["test sentence", "thisis test"]) + ) + result = plc.nvtext.byte_pair_encode.byte_pair_encoding( + plc_col, + plc.nvtext.byte_pair_encode.BPEMergePairs( + plc.interop.from_arrow(input_col) + ), + separator, + ) + if separator is None: + expected = pa.array(["test sent ence", "t h is is test"]) + else: + expected = pa.array(["teste esenteence", "teheiseise etest"]) + assert_column_eq(result, expected) diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_edit_distance.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_edit_distance.py new file mode 100644 index 00000000000..8b14e0db576 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_edit_distance.py @@ -0,0 +1,35 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pyarrow as pa +import pytest +from utils import assert_column_eq + +import pylibcudf as plc + + +@pytest.fixture(scope="module") +def edit_distance_data(): + arr1 = ["hallo", "goodbye", "world"] + arr2 = ["hello", "", "world"] + return pa.array(arr1), pa.array(arr2) + + +def test_edit_distance(edit_distance_data): + input_col, targets = edit_distance_data + result = plc.nvtext.edit_distance.edit_distance( + plc.interop.from_arrow(input_col), + plc.interop.from_arrow(targets), + ) + expected = pa.array([1, 7, 0], type=pa.int32()) + assert_column_eq(result, expected) + + +def test_edit_distance_matrix(edit_distance_data): + input_col, _ = edit_distance_data + result = plc.nvtext.edit_distance.edit_distance_matrix( + plc.interop.from_arrow(input_col) + ) + expected = pa.array( + [[0, 7, 4], [7, 0, 6], [4, 6, 0]], type=pa.list_(pa.int32()) + ) + assert_column_eq(expected, result) diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_generate_ngrams.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_generate_ngrams.py new file mode 100644 index 00000000000..fae4685f81b --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_generate_ngrams.py @@ -0,0 +1,55 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
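+# generate_ngrams joins whole strings into n-grams using a separator scalar,
+# while generate_character_ngrams and hash_character_ngrams slide an
+# n-character window within each string (strings shorter than n yield
+# empty lists).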
+ +import pyarrow as pa +import pytest +from utils import assert_column_eq + +import pylibcudf as plc + + +@pytest.fixture(scope="module") +def input_col(): + arr = ["ab", "cde", "fgh"] + return pa.array(arr) + + +@pytest.mark.parametrize("ngram", [2, 3]) +@pytest.mark.parametrize("sep", ["_", "**", ","]) +def test_generate_ngrams(input_col, ngram, sep): + result = plc.nvtext.generate_ngrams.generate_ngrams( + plc.interop.from_arrow(input_col), + ngram, + plc.interop.from_arrow(pa.scalar(sep)), + ) + expected = pa.array([f"ab{sep}cde", f"cde{sep}fgh"]) + if ngram == 3: + expected = pa.array([f"ab{sep}cde{sep}fgh"]) + assert_column_eq(result, expected) + + +@pytest.mark.parametrize("ngram", [2, 3]) +def test_generate_character_ngrams(input_col, ngram): + result = plc.nvtext.generate_ngrams.generate_character_ngrams( + plc.interop.from_arrow(input_col), + ngram, + ) + expected = pa.array([["ab"], ["cd", "de"], ["fg", "gh"]]) + if ngram == 3: + expected = pa.array([[], ["cde"], ["fgh"]]) + assert_column_eq(result, expected) + + +@pytest.mark.parametrize("ngram", [2, 3]) +def test_hash_character_ngrams(input_col, ngram): + result = plc.nvtext.generate_ngrams.hash_character_ngrams( + plc.interop.from_arrow(input_col), + ngram, + ) + pa_result = plc.interop.to_arrow(result) + assert all( + len(got) == max(0, len(s.as_py()) - ngram + 1) + for got, s in zip(pa_result, input_col) + ) + assert pa_result.type == pa.list_( + pa.field("element", pa.uint32(), nullable=False) + ) diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_jaccard.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_jaccard.py new file mode 100644 index 00000000000..05fe7b53c16 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_jaccard.py @@ -0,0 +1,38 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pyarrow as pa +import pytest +from utils import assert_column_eq + +import pylibcudf as plc + + +@pytest.fixture(scope="module") +def input_data(): + input1 = ["the fuzzy dog", "little piggy", "funny bunny", "chatty parrot"] + input2 = ["the fuzzy cat", "bitty piggy", "funny bunny", "silent partner"] + return pa.array(input1), pa.array(input2) + + +@pytest.mark.parametrize("width", [2, 3]) +def test_jaccard_index(input_data, width): + def get_tokens(s, width): + return [s[i : i + width] for i in range(len(s) - width + 1)] + + def jaccard_index(s1, s2, width): + x = set(get_tokens(s1, width)) + y = set(get_tokens(s2, width)) + return len(x & y) / len(x | y) + + input1, input2 = input_data + result = plc.nvtext.jaccard.jaccard_index( + plc.interop.from_arrow(input1), plc.interop.from_arrow(input2), width + ) + expected = pa.array( + [ + jaccard_index(s1.as_py(), s2.as_py(), width) + for s1, s2 in zip(input1, input2) + ], + type=pa.float32(), + ) + assert_column_eq(result, expected) diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py new file mode 100644 index 00000000000..ec533e64307 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py @@ -0,0 +1,59 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
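+# Assumed signature, inferred from the calls below: the permuted minhash
+# APIs take (input, seed, a, b, width), hash every substring of `width`
+# characters, and keep the minimum hash per (a, b) parameter pair, so each
+# output row is a list with one entry per parameter.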
+ +import pyarrow as pa +import pytest + +import pylibcudf as plc + + +@pytest.fixture(scope="module", params=[pa.uint32(), pa.uint64()]) +def minhash_input_data(request): + input_arr = pa.array(["foo", "bar", "foo foo", "bar bar"]) + seeds = pa.array([2, 3, 4, 5], request.param) + return input_arr, seeds, request.param + + +@pytest.fixture(scope="module", params=[pa.uint32(), pa.uint64()]) +def word_minhash_input_data(request): + input_arr = pa.array([["foo", "bar"], ["foo foo", "bar bar"]]) + seeds = pa.array([2, 3, 4, 5], request.param) + return input_arr, seeds, request.param + + +@pytest.mark.parametrize("width", [5, 12]) +def test_minhash_permuted(minhash_input_data, width): + input_arr, seeds, seed_type = minhash_input_data + minhash_func = ( + plc.nvtext.minhash.minhash_permuted + if seed_type == pa.uint32() + else plc.nvtext.minhash.minhash64_permuted + ) + result = minhash_func( + plc.interop.from_arrow(input_arr), + 0, + plc.interop.from_arrow(seeds), + plc.interop.from_arrow(seeds), + width, + ) + pa_result = plc.interop.to_arrow(result) + assert all(len(got) == len(seeds) for got, s in zip(pa_result, input_arr)) + assert pa_result.type == pa.list_( + pa.field("element", seed_type, nullable=False) + ) + + +def test_word_minhash(word_minhash_input_data): + input_arr, seeds, seed_type = word_minhash_input_data + word_minhash_func = ( + plc.nvtext.minhash.word_minhash + if seed_type == pa.uint32() + else plc.nvtext.minhash.word_minhash64 + ) + result = word_minhash_func( + plc.interop.from_arrow(input_arr), plc.interop.from_arrow(seeds) + ) + pa_result = plc.interop.to_arrow(result) + assert all(len(got) == len(seeds) for got, s in zip(pa_result, input_arr)) + assert pa_result.type == pa.list_( + pa.field("element", seed_type, nullable=False) + ) diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_ngrams_tokenize.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_ngrams_tokenize.py new file mode 100644 index 00000000000..84748b5597e --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_ngrams_tokenize.py @@ -0,0 +1,38 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pyarrow as pa +import pytest +from utils import assert_column_eq + +import pylibcudf as plc + + +@pytest.fixture(scope="module") +def input_col(): + arr = ["a*b*c*d", "a b c d", "a-b-c-d", "a*b c-d"] + return pa.array(arr) + + +@pytest.mark.parametrize("ngrams", [2, 3]) +@pytest.mark.parametrize("delim", ["*", " ", "-"]) +@pytest.mark.parametrize("sep", ["_", "&", ","]) +def test_ngrams_tokenize(input_col, ngrams, delim, sep): + def ngrams_tokenize(strings, ngrams, delim, sep): + tokens = [] + for s in strings: + ss = s.split(delim) + for i in range(len(ss) - ngrams + 1): + token = sep.join(ss[i : i + ngrams]) + tokens.append(token) + return tokens + + result = plc.nvtext.ngrams_tokenize.ngrams_tokenize( + plc.interop.from_arrow(input_col), + ngrams, + plc.interop.from_arrow(pa.scalar(delim)), + plc.interop.from_arrow(pa.scalar(sep)), + ) + expected = pa.array( + ngrams_tokenize(input_col.to_pylist(), ngrams, delim, sep) + ) + assert_column_eq(result, expected) diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_normalize.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_normalize.py new file mode 100644 index 00000000000..25b6d1389ec --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_normalize.py @@ -0,0 +1,43 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
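+# normalize_spaces collapses runs of whitespace into single spaces;
+# normalize_characters pads punctuation with spaces and, when do_lower is
+# True, also lower-cases and strips accents.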
+ +import pyarrow as pa +import pytest +from utils import assert_column_eq + +import pylibcudf as plc + + +@pytest.fixture(scope="module") +def norm_spaces_input_data(): + arr = ["a b", " c d\n", "e \t f "] + return pa.array(arr) + + +@pytest.fixture(scope="module") +def norm_chars_input_data(): + arr = ["éâîô\teaio", "ĂĆĖÑÜ", "ACENU", "$24.08", "[a,bb]"] + return pa.array(arr) + + +def test_normalize_spaces(norm_spaces_input_data): + result = plc.nvtext.normalize.normalize_spaces( + plc.interop.from_arrow(norm_spaces_input_data) + ) + expected = pa.array(["a b", "c d", "e f"]) + assert_column_eq(result, expected) + + +@pytest.mark.parametrize("do_lower", [True, False]) +def test_normalize_characters(norm_chars_input_data, do_lower): + result = plc.nvtext.normalize.normalize_characters( + plc.interop.from_arrow(norm_chars_input_data), + do_lower, + ) + expected = pa.array( + ["eaio eaio", "acenu", "acenu", " $ 24 . 08", " [ a , bb ] "] + ) + if not do_lower: + expected = pa.array( + ["éâîô eaio", "ĂĆĖÑÜ", "ACENU", " $ 24 . 08", " [ a , bb ] "] + ) + assert_column_eq(result, expected) diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_replace.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_replace.py new file mode 100644 index 00000000000..65687f31c85 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_replace.py @@ -0,0 +1,64 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pyarrow as pa +import pytest +from utils import assert_column_eq + +import pylibcudf as plc + + +@pytest.fixture(scope="module") +def input_col(): + arr = ["the quick", "brown fox", "jumps*over the", "lazy dog"] + return pa.array(arr) + + +@pytest.fixture(scope="module") +def targets(): + arr = ["the quick", "brown fox", "jumps*over the", "lazy dog"] + return pa.array(arr) + + +@pytest.mark.parametrize("delim", ["*", None]) +def test_replace_tokens(input_col, targets, delim): + replacements = pa.array(["slow", "cat", "looked", "rat"]) + result = plc.nvtext.replace.replace_tokens( + plc.interop.from_arrow(input_col), + plc.interop.from_arrow(targets), + plc.interop.from_arrow(replacements), + plc.interop.from_arrow(pa.scalar(delim)) if delim else None, + ) + expected = pa.array(["slow", "cat", "jumps*over the", "rat"]) + if not delim: + expected = pa.array( + ["the quick", "brown fox", "jumps*over the", "lazy dog"] + ) + assert_column_eq(result, expected) + + +@pytest.mark.parametrize("min_token_length", [4, 5]) +@pytest.mark.parametrize("replace", ["---", None]) +@pytest.mark.parametrize("delim", ["*", None]) +def test_filter_tokens(input_col, min_token_length, replace, delim): + result = plc.nvtext.replace.filter_tokens( + plc.interop.from_arrow(input_col), + min_token_length, + plc.interop.from_arrow(pa.scalar(replace)) if replace else None, + plc.interop.from_arrow(pa.scalar(delim)) if delim else None, + ) + expected = pa.array( + ["the quick", "brown fox", "jumps*over the", "lazy dog"] + ) + if not delim and not replace and min_token_length == 4: + expected = pa.array([" quick", "brown ", "jumps*over ", "lazy "]) + if not delim and not replace and min_token_length == 5: + expected = pa.array([" quick", "brown ", "jumps*over ", " "]) + if not delim and replace == "---" and min_token_length == 4: + expected = pa.array( + ["--- quick", "brown ---", "jumps*over ---", "lazy ---"] + ) + if not delim and replace == "---" and min_token_length == 5: + expected = pa.array( + ["--- quick", "brown ---", "jumps*over ---", "--- ---"] + ) + assert_column_eq(result, expected) diff --git 
a/python/pylibcudf/pylibcudf/tests/test_nvtext_stemmer.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_stemmer.py new file mode 100644 index 00000000000..e7f4a971f08 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_stemmer.py @@ -0,0 +1,48 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pyarrow as pa +import pytest +from utils import assert_column_eq + +import pylibcudf as plc + + +@pytest.fixture(scope="module") +def input_col(): + arr = ["trouble", "toy", "syzygy"] + return pa.array(arr) + + +@pytest.mark.parametrize("check_vowels", [True, False]) +@pytest.mark.parametrize("indices", [[3, 1, 4], 1]) +def test_is_letter(input_col, check_vowels, indices): + def is_letter(s, i, check): + vowels = "aeiouy" + return (s[i] in vowels) == check + + result = plc.nvtext.stemmer.is_letter( + plc.interop.from_arrow(input_col), + check_vowels, + plc.interop.from_arrow(pa.array(indices)) + if isinstance(indices, list) + else indices, + ) + expected = pa.array( + [ + is_letter( + s, + indices[i] if isinstance(indices, list) else indices, + check_vowels, + ) + for i, s in enumerate(input_col.to_pylist()) + ] + ) + assert_column_eq(result, expected) + + +def test_porter_stemmer_measure(input_col): + result = plc.nvtext.stemmer.porter_stemmer_measure( + plc.interop.from_arrow(input_col), + ) + expected = pa.array([1, 1, 2], type=pa.int32()) + assert_column_eq(result, expected) diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_subword_tokenize.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_subword_tokenize.py new file mode 100644 index 00000000000..516d0f7f78d --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_subword_tokenize.py @@ -0,0 +1,63 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pyarrow as pa +import pytest +from utils import assert_column_eq + +import pylibcudf as plc + + +@pytest.fixture +def vocab_file(tmpdir): + hash_file = tmpdir.mkdir("nvtext").join("tmp_hashed_vocab.txt") + content = "1\n0\n10\n" + coefficients = [65559] * 10 + for c in coefficients: + content = content + str(c) + " 0\n" + table = [0] * 10 + table[0] = 3015668 + content = content + "10\n" + for v in table: + content = content + str(v) + "\n" + content = content + "100\n101\n102\n\n" + hash_file.write(content) + return str(hash_file) + + +@pytest.fixture +def column_input(): + return pa.array(["This is a test"]) + + +@pytest.mark.parametrize("max_sequence_length", [64, 128]) +@pytest.mark.parametrize("stride", [32, 64]) +@pytest.mark.parametrize("do_lower_case", [True, False]) +@pytest.mark.parametrize("do_truncate", [True, False]) +def test_subword_tokenize( + vocab_file, + column_input, + max_sequence_length, + stride, + do_lower_case, + do_truncate, +): + vocab = plc.nvtext.subword_tokenize.HashedVocabulary(vocab_file) + tokens, masks, metadata = plc.nvtext.subword_tokenize.subword_tokenize( + plc.interop.from_arrow(column_input), + vocab, + max_sequence_length, + stride, + do_lower_case, + do_truncate, + ) + expected_tokens = pa.array( + [100] * 4 + [0] * (max_sequence_length - 4), type=pa.uint32() + ) + expected_masks = pa.array( + [1] * 4 + [0] * (max_sequence_length - 4), type=pa.uint32() + ) + expected_metadata = pa.array([0, 0, 3], type=pa.uint32()) + + assert_column_eq(tokens, expected_tokens) + assert_column_eq(masks, expected_masks) + assert_column_eq(metadata, expected_metadata) diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_tokenize.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_tokenize.py new file mode 100644 index 
00000000000..f1b4a5637e1 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_tokenize.py @@ -0,0 +1,96 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pyarrow as pa +import pytest +from utils import assert_column_eq + +import pylibcudf as plc + + +@pytest.fixture(scope="module") +def input_col(): + return pa.array(["a", "b c", "d.e:f;"]) + + +@pytest.mark.parametrize( + "delimiter", [None, plc.interop.from_arrow(pa.scalar("."))] +) +def test_tokenize_scalar(input_col, delimiter): + result = plc.nvtext.tokenize.tokenize_scalar( + plc.interop.from_arrow(input_col), delimiter + ) + if delimiter is None: + expected = pa.array(["a", "b", "c", "d.e:f;"]) + else: + expected = pa.array(["a", "b c", "d", "e:f;"]) + assert_column_eq(result, expected) + + +def test_tokenize_column(input_col): + delimiters = pa.array([" ", ".", ":", ";"]) + result = plc.nvtext.tokenize.tokenize_column( + plc.interop.from_arrow(input_col), plc.interop.from_arrow(delimiters) + ) + expected = pa.array(["a", "b", "c", "d", "e", "f"]) + assert_column_eq(result, expected) + + +@pytest.mark.parametrize( + "delimiter", [None, plc.interop.from_arrow(pa.scalar("."))] +) +def test_count_tokens_scalar(input_col, delimiter): + result = plc.nvtext.tokenize.count_tokens_scalar( + plc.interop.from_arrow(input_col), delimiter + ) + if delimiter is None: + expected = pa.array([1, 2, 1], type=pa.int32()) + else: + expected = pa.array([1, 1, 2], type=pa.int32()) + assert_column_eq(result, expected) + + +def test_count_tokens_column(input_col): + delimiters = pa.array([" ", ".", ":", ";"]) + result = plc.nvtext.tokenize.count_tokens_column( + plc.interop.from_arrow(input_col), plc.interop.from_arrow(delimiters) + ) + expected = pa.array([1, 2, 3], type=pa.int32()) + assert_column_eq(result, expected) + + +def test_character_tokenize(input_col): + result = plc.nvtext.tokenize.character_tokenize( + plc.interop.from_arrow(input_col) + ) + expected = pa.array(["a", "b", " ", "c", "d", ".", "e", ":", "f", ";"]) + assert_column_eq(result, expected) + + +@pytest.mark.parametrize( + "delimiter", [None, plc.interop.from_arrow(pa.scalar("."))] +) +def test_detokenize(input_col, delimiter): + row_indices = pa.array([0, 0, 1]) + result = plc.nvtext.tokenize.detokenize( + plc.interop.from_arrow(input_col), plc.interop.from_arrow(row_indices) + ) + expected = pa.array(["a b c", "d.e:f;"]) + assert_column_eq(result, expected) + + +@pytest.mark.parametrize("default_id", [-1, 0]) +def test_tokenize_with_vocabulary(input_col, default_id): + result = plc.nvtext.tokenize.tokenize_with_vocabulary( + plc.interop.from_arrow(input_col), + plc.nvtext.tokenize.TokenizeVocabulary( + plc.interop.from_arrow(input_col) + ), + plc.interop.from_arrow(pa.scalar(" ")), + default_id, + ) + pa_result = plc.interop.to_arrow(result) + if default_id == -1: + expected = pa.array([[0], [-1, -1], [2]], type=pa_result.type) + else: + expected = pa.array([[0], [0, 0], [2]], type=pa_result.type) + assert_column_eq(result, expected) diff --git a/python/pylibcudf/pylibcudf/tests/test_partitioning.py b/python/pylibcudf/pylibcudf/tests/test_partitioning.py index 444d0089d2c..c55e54cebc6 100644 --- a/python/pylibcudf/pylibcudf/tests/test_partitioning.py +++ b/python/pylibcudf/pylibcudf/tests/test_partitioning.py @@ -1,10 +1,11 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
import pyarrow as pa -import pylibcudf as plc import pytest from utils import assert_table_eq +import pylibcudf as plc + @pytest.fixture(scope="module") def partitioning_data(): diff --git a/python/pylibcudf/pylibcudf/tests/test_quantiles.py b/python/pylibcudf/pylibcudf/tests/test_quantiles.py index bac56691306..e4a24fb1c98 100644 --- a/python/pylibcudf/pylibcudf/tests/test_quantiles.py +++ b/python/pylibcudf/pylibcudf/tests/test_quantiles.py @@ -3,10 +3,11 @@ import numpy as np import pyarrow as pa import pyarrow.compute as pc -import pylibcudf as plc import pytest from utils import assert_column_eq, assert_table_eq +import pylibcudf as plc + # Map pylibcudf interpolation options to pyarrow options interp_mapping = { plc.types.Interpolation.LINEAR: "linear", diff --git a/python/pylibcudf/pylibcudf/tests/test_regex_program.py b/python/pylibcudf/pylibcudf/tests/test_regex_program.py index 777315df538..52598f2c462 100644 --- a/python/pylibcudf/pylibcudf/tests/test_regex_program.py +++ b/python/pylibcudf/pylibcudf/tests/test_regex_program.py @@ -1,8 +1,9 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -import pylibcudf as plc import pytest +import pylibcudf as plc + @pytest.mark.parametrize("pat", ["(", "*", "\\"]) def test_regex_program_invalid(pat): diff --git a/python/pylibcudf/pylibcudf/tests/test_reshape.py b/python/pylibcudf/pylibcudf/tests/test_reshape.py index 01115bc363a..ef23e23766a 100644 --- a/python/pylibcudf/pylibcudf/tests/test_reshape.py +++ b/python/pylibcudf/pylibcudf/tests/test_reshape.py @@ -1,10 +1,11 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import pyarrow as pa -import pylibcudf as plc import pytest from utils import assert_column_eq, assert_table_eq +import pylibcudf as plc + @pytest.fixture(scope="module") def reshape_data(): diff --git a/python/pylibcudf/pylibcudf/tests/test_round.py b/python/pylibcudf/pylibcudf/tests/test_round.py index 0b30316b9a0..2526580bc13 100644 --- a/python/pylibcudf/pylibcudf/tests/test_round.py +++ b/python/pylibcudf/pylibcudf/tests/test_round.py @@ -2,10 +2,11 @@ import pyarrow as pa import pyarrow.compute as pc -import pylibcudf as plc import pytest from utils import assert_column_eq +import pylibcudf as plc + @pytest.fixture(params=["float32", "float64"]) def column(request, has_nulls): diff --git a/python/pylibcudf/pylibcudf/tests/test_string_attributes.py b/python/pylibcudf/pylibcudf/tests/test_string_attributes.py index a1820def0b1..e85cd1cc443 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_attributes.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_attributes.py @@ -2,12 +2,13 @@ import pyarrow as pa import pyarrow.compute as pc -import pylibcudf as plc import pytest from utils import assert_column_eq +import pylibcudf as plc + -@pytest.fixture() +@pytest.fixture def str_data(): pa_data = pa.array(["A", None]) return pa_data, plc.interop.from_arrow(pa_data) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_capitalize.py b/python/pylibcudf/pylibcudf/tests/test_string_capitalize.py index 176ccc55b96..3e31c75c38a 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_capitalize.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_capitalize.py @@ -2,10 +2,11 @@ import pyarrow as pa import pyarrow.compute as pc -import pylibcudf as plc import pytest from utils import assert_column_eq +import pylibcudf as plc + @pytest.fixture(scope="module") def str_data(): diff --git a/python/pylibcudf/pylibcudf/tests/test_string_case.py b/python/pylibcudf/pylibcudf/tests/test_string_case.py index 233cc253b14..08ac371fd96 
100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_case.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_case.py @@ -2,10 +2,11 @@ import pyarrow as pa import pyarrow.compute as pc -import pylibcudf as plc import pytest from utils import assert_column_eq +import pylibcudf as plc + @pytest.fixture(scope="module") def string_col(): diff --git a/python/pylibcudf/pylibcudf/tests/test_string_char_types.py b/python/pylibcudf/pylibcudf/tests/test_string_char_types.py new file mode 100644 index 00000000000..06b44210d74 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_char_types.py @@ -0,0 +1,30 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pyarrow as pa +import pyarrow.compute as pc +from utils import assert_column_eq + +import pylibcudf as plc + + +def test_all_characters_of_type(): + pa_array = pa.array(["1", "A"]) + result = plc.strings.char_types.all_characters_of_type( + plc.interop.from_arrow(pa_array), + plc.strings.char_types.StringCharacterTypes.ALPHA, + plc.strings.char_types.StringCharacterTypes.ALL_TYPES, + ) + expected = pc.utf8_is_alpha(pa_array) + assert_column_eq(result, expected) + + +def test_filter_characters_of_type(): + pa_array = pa.array(["=A="]) + result = plc.strings.char_types.filter_characters_of_type( + plc.interop.from_arrow(pa_array), + plc.strings.char_types.StringCharacterTypes.ALPHANUM, + plc.interop.from_arrow(pa.scalar(" ")), + plc.strings.char_types.StringCharacterTypes.ALL_TYPES, + ) + expected = pc.replace_substring(pa_array, "A", " ") + assert_column_eq(result, expected) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_combine.py b/python/pylibcudf/pylibcudf/tests/test_string_combine.py new file mode 100644 index 00000000000..eea3ac68e84 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_combine.py @@ -0,0 +1,84 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
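+# concatenate joins a table's string columns row-wise. In these cases narep
+# fills in for nulls (null elements with a scalar separator, null separators
+# with a column separator), col_narep fills null elements when a column
+# separator is used, and passing both with a scalar separator raises
+# ValueError.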
+
+import pyarrow as pa
+import pyarrow.compute as pc
+import pytest
+from utils import assert_column_eq
+
+import pylibcudf as plc
+
+
+def test_concatenate_scalar_separator():
+    plc_table = plc.interop.from_arrow(
+        pa.table({"a": ["a", None, "c"], "b": ["a", "b", None]})
+    )
+    sep = plc.interop.from_arrow(pa.scalar("-"))
+    result = plc.strings.combine.concatenate(
+        plc_table,
+        sep,
+    )
+    expected = pa.array(["a-a", "-b", "c-"])
+    assert_column_eq(result, expected)
+
+    result = plc.strings.combine.concatenate(
+        plc_table, sep, narep=plc.interop.from_arrow(pa.scalar("!"))
+    )
+    expected = pa.array(["a-a", "!-b", "c-!"])
+    assert_column_eq(result, expected)
+
+    with pytest.raises(ValueError):
+        plc.strings.combine.concatenate(
+            plc_table,
+            sep,
+            narep=plc.interop.from_arrow(pa.scalar("!")),
+            col_narep=plc.interop.from_arrow(pa.scalar("?")),
+        )
+
+
+def test_concatenate_column_separator():
+    plc_table = plc.interop.from_arrow(
+        pa.table({"a": ["a", None, "c"], "b": ["a", "b", None]})
+    )
+    sep = plc.interop.from_arrow(pa.array(["-", "?", ","]))
+    result = plc.strings.combine.concatenate(
+        plc_table,
+        sep,
+    )
+    expected = pa.array(["a-a", "?b", "c,"])
+    assert_column_eq(result, expected)
+
+    result = plc.strings.combine.concatenate(
+        plc_table,
+        plc.interop.from_arrow(pa.array([None, "?", ","])),
+        narep=plc.interop.from_arrow(pa.scalar("1")),
+        col_narep=plc.interop.from_arrow(pa.scalar("*")),
+    )
+    expected = pa.array(["a1a", "*?b", "c,*"])
+    assert_column_eq(result, expected)
+
+
+def test_join_strings():
+    pa_arr = pa.array(list("abc"))
+    sep = pa.scalar("")
+    result = plc.strings.combine.join_strings(
+        plc.interop.from_arrow(pa_arr),
+        plc.interop.from_arrow(sep),
+        plc.interop.from_arrow(pa.scalar("")),
+    )
+    expected = pa.array(["abc"])
+    assert_column_eq(result, expected)
+
+
+def test_join_list_elements():
+    pa_arr = pa.array([["a", "a"], ["b", "b"]])
+    sep = pa.scalar("")
+    result = plc.strings.combine.join_list_elements(
+        plc.interop.from_arrow(pa_arr),
+        plc.interop.from_arrow(sep),
+        plc.interop.from_arrow(pa.scalar("")),
+        plc.interop.from_arrow(pa.scalar("")),
+        plc.strings.combine.SeparatorOnNulls.YES,
+        plc.strings.combine.OutputIfEmptyList.NULL_ELEMENT,
+    )
+    expected = pc.binary_join(pa_arr, sep)
+    assert_column_eq(result, expected)
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_contains.py b/python/pylibcudf/pylibcudf/tests/test_string_contains.py
index 4e4dd7cbb00..ba9a4a7d3b8 100644
--- a/python/pylibcudf/pylibcudf/tests/test_string_contains.py
+++ b/python/pylibcudf/pylibcudf/tests/test_string_contains.py
@@ -2,10 +2,11 @@
 
 import pyarrow as pa
 import pyarrow.compute as pc
-import pylibcudf as plc
 import pytest
 from utils import assert_column_eq
 
+import pylibcudf as plc
+
 
 @pytest.fixture(scope="module")
 def target_col():
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert.py b/python/pylibcudf/pylibcudf/tests/test_string_convert.py
index e9e95459d0e..3f3f452c4f6 100644
--- a/python/pylibcudf/pylibcudf/tests/test_string_convert.py
+++ b/python/pylibcudf/pylibcudf/tests/test_string_convert.py
@@ -1,12 +1,11 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
-from datetime import datetime - import pyarrow as pa -import pylibcudf as plc import pytest from utils import assert_column_eq +import pylibcudf as plc + @pytest.fixture( scope="module", @@ -21,39 +20,16 @@ def timestamp_type(request): return request.param -@pytest.fixture( - scope="module", - params=[ - pa.duration("ns"), - pa.duration("us"), - pa.duration("ms"), - pa.duration("s"), - ], -) -def duration_type(request): - return request.param - - @pytest.fixture(scope="module") def pa_timestamp_col(): return pa.array(["2011-01-01", "2011-01-02", "2011-01-03"]) -@pytest.fixture(scope="module") -def pa_duration_col(): - return pa.array(["05:20:25"]) - - @pytest.fixture(scope="module") def plc_timestamp_col(pa_timestamp_col): return plc.interop.from_arrow(pa_timestamp_col) -@pytest.fixture(scope="module") -def plc_duration_col(pa_duration_col): - return plc.interop.from_arrow(pa_duration_col) - - @pytest.mark.parametrize("format", ["%Y-%m-%d"]) def test_to_datetime( pa_timestamp_col, plc_timestamp_col, timestamp_type, format @@ -62,24 +38,6 @@ def test_to_datetime( got = plc.strings.convert.convert_datetime.to_timestamps( plc_timestamp_col, plc.interop.from_arrow(timestamp_type), - format.encode(), - ) - assert_column_eq(expect, got) - - -@pytest.mark.parametrize("format", ["%H:%M:%S"]) -def test_to_duration(pa_duration_col, plc_duration_col, duration_type, format): - def to_timedelta(duration_str): - date = datetime.strptime(duration_str, format) - return date - datetime(1900, 1, 1) # "%H:%M:%S" zero date - - expect = pa.array([to_timedelta(d.as_py()) for d in pa_duration_col]).cast( - duration_type - ) - - got = plc.strings.convert.convert_durations.to_durations( - plc_duration_col, - plc.interop.from_arrow(duration_type), - format.encode(), + format, ) assert_column_eq(expect, got) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_booleans.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_booleans.py new file mode 100644 index 00000000000..b391d2b290e --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_booleans.py @@ -0,0 +1,27 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pyarrow as pa +from utils import assert_column_eq + +import pylibcudf as plc + + +def test_to_booleans(): + pa_array = pa.array(["true", None, "True"]) + result = plc.strings.convert.convert_booleans.to_booleans( + plc.interop.from_arrow(pa_array), + plc.interop.from_arrow(pa.scalar("True")), + ) + expected = pa.array([False, None, True]) + assert_column_eq(result, expected) + + +def test_from_booleans(): + pa_array = pa.array([True, None, False]) + result = plc.strings.convert.convert_booleans.from_booleans( + plc.interop.from_arrow(pa_array), + plc.interop.from_arrow(pa.scalar("A")), + plc.interop.from_arrow(pa.scalar("B")), + ) + expected = pa.array(["A", None, "B"]) + assert_column_eq(result, expected) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_datetime.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_datetime.py new file mode 100644 index 00000000000..c9368d858a4 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_datetime.py @@ -0,0 +1,47 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
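+# to_timestamps parses strings with a strptime-style format into timestamp
+# columns, from_timestamps renders timestamps back to strings, and
+# is_timestamp checks per row whether a string matches the format.
+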
+import datetime + +import pyarrow as pa +import pyarrow.compute as pc +import pytest +from utils import assert_column_eq + +import pylibcudf as plc + + +@pytest.fixture +def fmt(): + return "%Y-%m-%dT%H:%M:%S" + + +def test_to_timestamp(fmt): + arr = pa.array(["2020-01-01T01:01:01", None]) + result = plc.strings.convert.convert_datetime.to_timestamps( + plc.interop.from_arrow(arr), + plc.DataType(plc.TypeId.TIMESTAMP_SECONDS), + fmt, + ) + expected = pc.strptime(arr, fmt, "s") + assert_column_eq(result, expected) + + +def test_from_timestamp(fmt): + arr = pa.array([datetime.datetime(2020, 1, 1, 1, 1, 1), None]) + result = plc.strings.convert.convert_datetime.from_timestamps( + plc.interop.from_arrow(arr), + fmt, + plc.interop.from_arrow(pa.array([], type=pa.string())), + ) + # pc.strftime will add the extra %f + expected = pa.array(["2020-01-01T01:01:01", None]) + assert_column_eq(result, expected) + + +def test_is_timestamp(fmt): + arr = pa.array(["2020-01-01T01:01:01", None, "2020-01-01"]) + result = plc.strings.convert.convert_datetime.is_timestamp( + plc.interop.from_arrow(arr), + fmt, + ) + expected = pa.array([True, None, False]) + assert_column_eq(result, expected) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_durations.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_durations.py new file mode 100644 index 00000000000..2d3578e4e71 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_durations.py @@ -0,0 +1,62 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from datetime import datetime, timedelta + +import pyarrow as pa +import pytest +from utils import assert_column_eq + +import pylibcudf as plc + + +@pytest.fixture( + params=[ + pa.duration("ns"), + pa.duration("us"), + pa.duration("ms"), + pa.duration("s"), + ], +) +def duration_type(request): + return request.param + + +@pytest.fixture(scope="module") +def pa_duration_col(): + return pa.array(["05:20:25"]) + + +@pytest.fixture(scope="module") +def plc_duration_col(pa_duration_col): + return plc.interop.from_arrow(pa_duration_col) + + +def test_to_duration(pa_duration_col, plc_duration_col, duration_type): + format = "%H:%M:%S" + + def to_timedelta(duration_str): + date = datetime.strptime(duration_str, format) + return date - datetime(1900, 1, 1) # "%H:%M:%S" zero date + + expect = pa.array([to_timedelta(d.as_py()) for d in pa_duration_col]).cast( + duration_type + ) + + got = plc.strings.convert.convert_durations.to_durations( + plc_duration_col, + plc.interop.from_arrow(duration_type), + format, + ) + assert_column_eq(expect, got) + + +@pytest.mark.parametrize("format", [None, "%D days %H:%M:%S"]) +def test_from_durations(format): + pa_array = pa.array( + [timedelta(days=1, hours=1, minutes=1, seconds=1), None] + ) + result = plc.strings.convert.convert_durations.from_durations( + plc.interop.from_arrow(pa_array), format + ) + expected = pa.array(["1 days 01:01:01", None]) + assert_column_eq(result, expected) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_fixed_point.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_fixed_point.py new file mode 100644 index 00000000000..012e722038e --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_fixed_point.py @@ -0,0 +1,35 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
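+# Round-trip checks for decimal128: to_fixed_point parses decimal strings,
+# from_fixed_point renders values back to strings, and is_fixed_point
+# validates parseability row by row.
+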
+import decimal + +import pyarrow as pa +from utils import assert_column_eq + +import pylibcudf as plc + + +def test_to_fixed_point(): + typ = pa.decimal128(38, 2) + arr = pa.array(["123", "1.23", None]) + result = plc.strings.convert.convert_fixed_point.to_fixed_point( + plc.interop.from_arrow(arr), plc.interop.from_arrow(typ) + ) + expected = arr.cast(typ) + assert_column_eq(result, expected) + + +def test_from_fixed_point(): + arr = pa.array([decimal.Decimal("1.1"), None]) + result = plc.strings.convert.convert_fixed_point.from_fixed_point( + plc.interop.from_arrow(arr), + ) + expected = pa.array(["1.1", None]) + assert_column_eq(result, expected) + + +def test_is_fixed_point(): + arr = pa.array(["123", "1.23", "1.2.3", "", None]) + result = plc.strings.convert.convert_fixed_point.is_fixed_point( + plc.interop.from_arrow(arr), + ) + expected = pa.array([True, True, False, False, None]) + assert_column_eq(result, expected) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_floats.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_floats.py new file mode 100644 index 00000000000..8ee2b5075af --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_floats.py @@ -0,0 +1,34 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pyarrow as pa +from utils import assert_column_eq + +import pylibcudf as plc + + +def test_to_floats(): + typ = pa.float32() + arr = pa.array(["-1.23", "1", None]) + result = plc.strings.convert.convert_floats.to_floats( + plc.interop.from_arrow(arr), plc.interop.from_arrow(typ) + ) + expected = arr.cast(typ) + assert_column_eq(result, expected) + + +def test_from_floats(): + arr = pa.array([-1.23, 1, None]) + result = plc.strings.convert.convert_floats.from_floats( + plc.interop.from_arrow(arr), + ) + expected = pa.array(["-1.23", "1.0", None]) + assert_column_eq(result, expected) + + +def test_is_float(): + arr = pa.array(["-1.23", "1", "1.2.3", "A", None]) + result = plc.strings.convert.convert_floats.is_float( + plc.interop.from_arrow(arr), + ) + expected = pa.array([True, True, False, False, None]) + assert_column_eq(result, expected) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_integers.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_integers.py new file mode 100644 index 00000000000..01192c2d1f8 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_integers.py @@ -0,0 +1,70 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
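+# Beyond base-10 parsing, these tests cover bounds-aware validation
+# (is_integer with an explicit uint8 type rejects "-1") and hex round-trips;
+# integers_to_hex renders negatives in two's complement, e.g. -42 becomes
+# "FFFFFFFFFFFFFFD6".
+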
+import pyarrow as pa +from utils import assert_column_eq + +import pylibcudf as plc + + +def test_to_integers(): + typ = pa.int8() + arr = pa.array(["1", "-1", None]) + result = plc.strings.convert.convert_integers.to_integers( + plc.interop.from_arrow(arr), plc.interop.from_arrow(typ) + ) + expected = arr.cast(typ) + assert_column_eq(result, expected) + + +def test_from_integers(): + arr = pa.array([1, -1, None]) + result = plc.strings.convert.convert_integers.from_integers( + plc.interop.from_arrow(arr) + ) + expected = pa.array(["1", "-1", None]) + assert_column_eq(result, expected) + + +def test_is_integer(): + arr = pa.array(["1", "-1", "1.2", "A", None]) + plc_column = plc.interop.from_arrow(arr) + result = plc.strings.convert.convert_integers.is_integer(plc_column) + expected = pa.array([True, True, False, False, None]) + assert_column_eq(result, expected) + + result = plc.strings.convert.convert_integers.is_integer( + plc_column, plc.interop.from_arrow(pa.uint8()) + ) + expected = pa.array([True, False, False, False, None]) + assert_column_eq(result, expected) + + +def test_hex_to_integers(): + typ = pa.int32() + data = ["0xff", "0x2a", None] + result = plc.strings.convert.convert_integers.hex_to_integers( + plc.interop.from_arrow(pa.array(data)), plc.interop.from_arrow(typ) + ) + expected = pa.array( + [int(val, 16) if isinstance(val, str) else val for val in data], + type=typ, + ) + assert_column_eq(result, expected) + + +def test_is_hex(): + arr = pa.array(["0xff", "123", "!", None]) + result = plc.strings.convert.convert_integers.is_hex( + plc.interop.from_arrow(arr) + ) + expected = pa.array([True, True, False, None]) + assert_column_eq(result, expected) + + +def test_integers_to_hex(): + data = [255, -42, None] + arr = pa.array(data) + result = plc.strings.convert.convert_integers.integers_to_hex( + plc.interop.from_arrow(arr) + ) + expected = pa.array(["FF", "FFFFFFFFFFFFFFD6", None]) + assert_column_eq(result, expected) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_ipv4.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_ipv4.py new file mode 100644 index 00000000000..b533809f106 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_ipv4.py @@ -0,0 +1,32 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +import pyarrow as pa +from utils import assert_column_eq + +import pylibcudf as plc + + +def test_ipv4_to_integers(): + arr = pa.array(["123.45.67.890", None]) + result = plc.strings.convert.convert_ipv4.ipv4_to_integers( + plc.interop.from_arrow(arr) + ) + expected = pa.array([2066564730, None], type=pa.uint32()) + assert_column_eq(result, expected) + + +def test_integers_to_ipv4(): + arr = pa.array([1, 0, None], type=pa.uint32()) + result = plc.strings.convert.convert_ipv4.integers_to_ipv4( + plc.interop.from_arrow(arr) + ) + expected = pa.array(["0.0.0.1", "0.0.0.0", None]) + assert_column_eq(result, expected) + + +def test_is_ipv4(): + arr = pa.array(["0.0.0.1", "1.2.34", "A", None]) + result = plc.strings.convert.convert_ipv4.is_ipv4( + plc.interop.from_arrow(arr) + ) + expected = pa.array([True, False, False, None]) + assert_column_eq(result, expected) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_lists.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_lists.py new file mode 100644 index 00000000000..737036a4f0f --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_lists.py @@ -0,0 +1,22 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
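+# format_list_column renders each list row as a single string; na_rep
+# substitutes for null rows (the default appears to be an empty string),
+# and separators overrides the default ",", "[", "]" characters.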
+ +import pyarrow as pa +import pytest +from utils import assert_column_eq + +import pylibcudf as plc + + +@pytest.mark.parametrize("na_rep", [None, pa.scalar("")]) +@pytest.mark.parametrize("separators", [None, pa.array([",", "[", "]"])]) +def test_format_list_column(na_rep, separators): + arr = pa.array([["1", "A"], None]) + result = plc.strings.convert.convert_lists.format_list_column( + plc.interop.from_arrow(arr), + na_rep if na_rep is None else plc.interop.from_arrow(na_rep), + separators + if separators is None + else plc.interop.from_arrow(separators), + ) + expected = pa.array(["[1,A]", ""]) + assert_column_eq(result, expected) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_urls.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_urls.py new file mode 100644 index 00000000000..528736798c7 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_urls.py @@ -0,0 +1,37 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +import urllib + +import pyarrow as pa +from utils import assert_column_eq + +import pylibcudf as plc + + +def test_url_encode(): + data = ["/home/nfs", None] + arr = pa.array(data) + result = plc.strings.convert.convert_urls.url_encode( + plc.interop.from_arrow(arr) + ) + expected = pa.array( + [ + urllib.parse.quote(url, safe="") if isinstance(url, str) else url + for url in data + ] + ) + assert_column_eq(result, expected) + + +def test_url_decode(): + data = ["%2Fhome%2fnfs", None] + arr = pa.array(data) + result = plc.strings.convert.convert_urls.url_decode( + plc.interop.from_arrow(arr) + ) + expected = pa.array( + [ + urllib.parse.unquote(url) if isinstance(url, str) else url + for url in data + ] + ) + assert_column_eq(result, expected) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_extract.py b/python/pylibcudf/pylibcudf/tests/test_string_extract.py index 788b86423c4..e70edf4fb33 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_extract.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_extract.py @@ -2,6 +2,7 @@ import pyarrow as pa import pyarrow.compute as pc + import pylibcudf as plc diff --git a/python/pylibcudf/pylibcudf/tests/test_string_find.py b/python/pylibcudf/pylibcudf/tests/test_string_find.py index db3b13a5aae..82ec18832a9 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_find.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_find.py @@ -2,10 +2,11 @@ import pyarrow as pa import pyarrow.compute as pc -import pylibcudf as plc import pytest from utils import assert_column_eq +import pylibcudf as plc + @pytest.fixture(scope="module") def data_col(): diff --git a/python/pylibcudf/pylibcudf/tests/test_string_find_multiple.py b/python/pylibcudf/pylibcudf/tests/test_string_find_multiple.py new file mode 100644 index 00000000000..fa9eee3594b --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_find_multiple.py @@ -0,0 +1,23 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
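+# find_multiple returns, for each input string, a list holding the first
+# match position of every target (-1 when a target is absent), mirroring
+# the nested str.find() loop used to build the expected column below.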
+ +import pyarrow as pa +from utils import assert_column_eq + +import pylibcudf as plc + + +def test_find_multiple(): + arr = pa.array(["abc", "def"]) + targets = pa.array(["a", "c", "e"]) + result = plc.strings.find_multiple.find_multiple( + plc.interop.from_arrow(arr), + plc.interop.from_arrow(targets), + ) + expected = pa.array( + [ + [elem.find(target) for target in targets.to_pylist()] + for elem in arr.to_pylist() + ], + type=pa.list_(pa.int32()), + ) + assert_column_eq(expected, result) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_findall.py b/python/pylibcudf/pylibcudf/tests/test_string_findall.py index 994552fa276..b73d812c898 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_findall.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_findall.py @@ -2,9 +2,10 @@ import re import pyarrow as pa -import pylibcudf as plc from utils import assert_column_eq +import pylibcudf as plc + def test_findall(): arr = pa.array(["bunny", "rabbit", "hare", "dog"]) @@ -21,3 +22,20 @@ def test_findall(): type=pa_result.type, ) assert_column_eq(result, expected) + + +def test_find_re(): + arr = pa.array(["bunny", "rabbit", "hare", "dog"]) + pattern = "[eb]" + result = plc.strings.findall.find_re( + plc.interop.from_arrow(arr), + plc.strings.regex_program.RegexProgram.create( + pattern, plc.strings.regex_flags.RegexFlags.DEFAULT + ), + ) + pa_result = plc.interop.to_arrow(result) + expected = pa.array( + [0, 2, 3, -1], + type=pa_result.type, + ) + assert_column_eq(result, expected) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_padding.py b/python/pylibcudf/pylibcudf/tests/test_string_padding.py new file mode 100644 index 00000000000..79498132097 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_padding.py @@ -0,0 +1,27 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
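+# pad and zfill are validated against pyarrow's utf8_lpad; with this data
+# zfill reduces to plain left-padding with "0" (no sign handling is
+# exercised here).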
+ +import pyarrow as pa +import pyarrow.compute as pc + +import pylibcudf as plc + + +def test_pad(): + arr = pa.array(["a", "1", None]) + plc_result = plc.strings.padding.pad( + plc.interop.from_arrow(arr), + 2, + plc.strings.side_type.SideType.LEFT, + "!", + ) + result = plc.interop.to_arrow(plc_result) + expected = pa.chunked_array(pc.utf8_lpad(arr, 2, padding="!")) + assert result.equals(expected) + + +def test_zfill(): + arr = pa.array(["a", "1", None]) + plc_result = plc.strings.padding.zfill(plc.interop.from_arrow(arr), 2) + result = plc.interop.to_arrow(plc_result) + expected = pa.chunked_array(pc.utf8_lpad(arr, 2, padding="0")) + assert result.equals(expected) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_repeat.py b/python/pylibcudf/pylibcudf/tests/test_string_repeat.py index 18b5d8bf4d0..c06c06be7c6 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_repeat.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_repeat.py @@ -2,9 +2,10 @@ import pyarrow as pa import pyarrow.compute as pc -import pylibcudf as plc import pytest +import pylibcudf as plc + @pytest.mark.parametrize("repeats", [pa.array([2, 2]), 2]) def test_repeat_strings(repeats): diff --git a/python/pylibcudf/pylibcudf/tests/test_string_replace.py b/python/pylibcudf/pylibcudf/tests/test_string_replace.py index 5a9c2007b73..2c7d25133de 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_replace.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_replace.py @@ -2,10 +2,11 @@ import pyarrow as pa import pyarrow.compute as pc -import pylibcudf as plc import pytest from utils import assert_column_eq +import pylibcudf as plc + @pytest.fixture(scope="module") def data_col(): diff --git a/python/pylibcudf/pylibcudf/tests/test_string_replace_re.py b/python/pylibcudf/pylibcudf/tests/test_string_replace_re.py new file mode 100644 index 00000000000..511f826441a --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_replace_re.py @@ -0,0 +1,72 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pyarrow as pa +import pyarrow.compute as pc +import pytest +from utils import assert_column_eq + +import pylibcudf as plc + + +@pytest.mark.parametrize("max_replace_count", [-1, 1]) +def test_replace_re_regex_program_scalar(max_replace_count): + arr = pa.array(["foo", "fuz", None]) + pat = "f." 
+ repl = "ba" + result = plc.strings.replace_re.replace_re( + plc.interop.from_arrow(arr), + plc.strings.regex_program.RegexProgram.create( + pat, plc.strings.regex_flags.RegexFlags.DEFAULT + ), + plc.interop.from_arrow(pa.scalar(repl)), + max_replace_count=max_replace_count, + ) + expected = pc.replace_substring_regex( + arr, + pat, + repl, + max_replacements=max_replace_count + if max_replace_count != -1 + else None, + ) + assert_column_eq(result, expected) + + +@pytest.mark.parametrize( + "flags", + [ + plc.strings.regex_flags.RegexFlags.DEFAULT, + plc.strings.regex_flags.RegexFlags.DOTALL, + ], +) +def test_replace_re_list_str_columns(flags): + arr = pa.array(["foo", "fuz", None]) + pats = ["oo", "uz"] + repls = ["a", "b"] + result = plc.strings.replace_re.replace_re( + plc.interop.from_arrow(arr), + pats, + plc.interop.from_arrow(pa.array(repls)), + flags=flags, + ) + expected = arr + for pat, repl in zip(pats, repls): + expected = pc.replace_substring_regex( + expected, + pat, + repl, + ) + assert_column_eq(result, expected) + + +def test_replace_with_backrefs(): + arr = pa.array(["Z756", None]) + result = plc.strings.replace_re.replace_with_backrefs( + plc.interop.from_arrow(arr), + plc.strings.regex_program.RegexProgram.create( + "(\\d)(\\d)", plc.strings.regex_flags.RegexFlags.DEFAULT + ), + "V\\2\\1", + ) + expected = pa.array(["ZV576", None]) + assert_column_eq(result, expected) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_slice.py b/python/pylibcudf/pylibcudf/tests/test_string_slice.py index d9ce5591b98..1759f739e31 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_slice.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_slice.py @@ -1,10 +1,11 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import pyarrow as pa -import pylibcudf as plc import pytest from utils import assert_column_eq +import pylibcudf as plc + @pytest.fixture(scope="module") def pa_col(): diff --git a/python/pylibcudf/pylibcudf/tests/test_string_split_partition.py b/python/pylibcudf/pylibcudf/tests/test_string_split_partition.py new file mode 100644 index 00000000000..4e80f19b814 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_split_partition.py @@ -0,0 +1,44 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pyarrow as pa +import pytest +from utils import assert_table_eq + +import pylibcudf as plc + + +@pytest.fixture +def data_col(): + pa_arr = pa.array(["ab_cd", "def_g_h", None]) + plc_column = plc.interop.from_arrow(pa_arr) + return pa_arr, plc_column + + +def test_partition(data_col): + pa_arr, plc_column = data_col + result = plc.strings.split.partition.partition( + plc_column, plc.interop.from_arrow(pa.scalar("_")) + ) + expected = pa.table( + { + "a": ["ab", "def", None], + "b": ["_", "_", None], + "c": ["cd", "g_h", None], + } + ) + assert_table_eq(expected, result) + + +def test_rpartition(data_col): + pa_arr, plc_column = data_col + result = plc.strings.split.partition.rpartition( + plc_column, plc.interop.from_arrow(pa.scalar("_")) + ) + expected = pa.table( + { + "a": ["ab", "def_g", None], + "b": ["_", "_", None], + "c": ["cd", "h", None], + } + ) + assert_table_eq(expected, result) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_split_split.py b/python/pylibcudf/pylibcudf/tests/test_string_split_split.py new file mode 100644 index 00000000000..450b336ce65 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_split_split.py @@ -0,0 +1,131 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
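+# Shape conventions exercised below: split/rsplit return a Table whose
+# columns hold the split pieces, the *_record variants return a single
+# LIST column, and the *_re variants take a compiled RegexProgram instead
+# of a scalar delimiter.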
+
+import pyarrow as pa
+import pyarrow.compute as pc
+import pytest
+from utils import assert_column_eq, assert_table_eq
+
+import pylibcudf as plc
+
+
+@pytest.fixture
+def data_col():
+    pa_array = pa.array(["a_b_c", "d-e-f", None])
+    plc_column = plc.interop.from_arrow(pa_array)
+    return pa_array, plc_column
+
+
+@pytest.fixture
+def delimiter():
+    delimiter = "_"
+    plc_delimiter = plc.interop.from_arrow(pa.scalar(delimiter))
+    return delimiter, plc_delimiter
+
+
+@pytest.fixture
+def re_delimiter():
+    return "[_-]"
+
+
+def test_split(data_col, delimiter):
+    _, plc_column = data_col
+    _, plc_delimiter = delimiter
+    result = plc.strings.split.split.split(plc_column, plc_delimiter, 1)
+    expected = pa.table(
+        {
+            "a": ["a", "d-e-f", None],
+            "b": ["b_c", None, None],
+        }
+    )
+    assert_table_eq(expected, result)
+
+
+def test_rsplit(data_col, delimiter):
+    _, plc_column = data_col
+    _, plc_delimiter = delimiter
+    result = plc.strings.split.split.rsplit(plc_column, plc_delimiter, 1)
+    expected = pa.table(
+        {
+            "a": ["a_b", "d-e-f", None],
+            "b": ["c", None, None],
+        }
+    )
+    assert_table_eq(expected, result)
+
+
+def test_split_record(data_col, delimiter):
+    pa_array, plc_column = data_col
+    delim, plc_delim = delimiter
+    result = plc.strings.split.split.split_record(plc_column, plc_delim, 1)
+    expected = pc.split_pattern(pa_array, delim, max_splits=1)
+    assert_column_eq(expected, result)
+
+
+def test_rsplit_record(data_col, delimiter):
+    pa_array, plc_column = data_col
+    delim, plc_delim = delimiter
+    result = plc.strings.split.split.rsplit_record(plc_column, plc_delim, 1)
+    expected = pc.split_pattern(pa_array, delim, max_splits=1, reverse=True)
+    assert_column_eq(expected, result)
+
+
+def test_split_re(data_col, re_delimiter):
+    _, plc_column = data_col
+    result = plc.strings.split.split.split_re(
+        plc_column,
+        plc.strings.regex_program.RegexProgram.create(
+            re_delimiter, plc.strings.regex_flags.RegexFlags.DEFAULT
+        ),
+        1,
+    )
+    expected = pa.table(
+        {
+            "a": ["a", "d", None],
+            "b": ["b_c", "e-f", None],
+        }
+    )
+    assert_table_eq(expected, result)
+
+
+def test_rsplit_re(data_col, re_delimiter):
+    _, plc_column = data_col
+    result = plc.strings.split.split.rsplit_re(
+        plc_column,
+        plc.strings.regex_program.RegexProgram.create(
+            re_delimiter, plc.strings.regex_flags.RegexFlags.DEFAULT
+        ),
+        1,
+    )
+    expected = pa.table(
+        {
+            "a": ["a_b", "d-e", None],
+            "b": ["c", "f", None],
+        }
+    )
+    assert_table_eq(expected, result)
+
+
+def test_split_record_re(data_col, re_delimiter):
+    pa_array, plc_column = data_col
+    result = plc.strings.split.split.split_record_re(
+        plc_column,
+        plc.strings.regex_program.RegexProgram.create(
+            re_delimiter, plc.strings.regex_flags.RegexFlags.DEFAULT
+        ),
+        1,
+    )
+    expected = pc.split_pattern_regex(pa_array, re_delimiter, max_splits=1)
+    assert_column_eq(expected, result)
+
+
+def test_rsplit_record_re(data_col, re_delimiter):
+    pa_array, plc_column = data_col
+    result = plc.strings.split.split.rsplit_record_re(
+        plc_column,
+        plc.strings.regex_program.RegexProgram.create(
+            re_delimiter, plc.strings.regex_flags.RegexFlags.DEFAULT
+        ),
+        -1,
+    )
+    expected = pc.split_pattern_regex(pa_array, re_delimiter)
+    assert_column_eq(expected, result)
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_strip.py b/python/pylibcudf/pylibcudf/tests/test_string_strip.py
index 005e5e4a405..5869e5f4920 100644
--- a/python/pylibcudf/pylibcudf/tests/test_string_strip.py
+++ b/python/pylibcudf/pylibcudf/tests/test_string_strip.py
@@ -1,10 +1,11 @@
 # Copyright (c) 2024, NVIDIA
CORPORATION. import pyarrow as pa -import pylibcudf as plc import pytest from utils import assert_column_eq +import pylibcudf as plc + data_strings = [ "AbC", "123abc", diff --git a/python/pylibcudf/pylibcudf/tests/test_string_translate.py b/python/pylibcudf/pylibcudf/tests/test_string_translate.py new file mode 100644 index 00000000000..84fd3354ac6 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_translate.py @@ -0,0 +1,70 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pyarrow as pa +import pytest +from utils import assert_column_eq + +import pylibcudf as plc + + +@pytest.fixture +def data_col(): + pa_data_col = pa.array( + ["aa", "bbb", "cccc", "abcd", None], + type=pa.string(), + ) + return pa_data_col, plc.interop.from_arrow(pa_data_col) + + +@pytest.fixture +def trans_table(): + return str.maketrans("abd", "A Q") + + +def test_translate(data_col, trans_table): + pa_array, plc_col = data_col + result = plc.strings.translate.translate(plc_col, trans_table) + expected = pa.array( + [ + val.translate(trans_table) if isinstance(val, str) else None + for val in pa_array.to_pylist() + ] + ) + assert_column_eq(expected, result) + + +@pytest.mark.parametrize( + "keep", + [ + plc.strings.translate.FilterType.KEEP, + plc.strings.translate.FilterType.REMOVE, + ], +) +def test_filter_characters(data_col, trans_table, keep): + pa_array, plc_col = data_col + result = plc.strings.translate.filter_characters( + plc_col, trans_table, keep, plc.interop.from_arrow(pa.scalar("*")) + ) + exp_data = [] + flat_trans = set(trans_table.keys()).union(trans_table.values()) + for val in pa_array.to_pylist(): + if not isinstance(val, str): + exp_data.append(val) + else: + new_val = "" + for ch in val: + if ( + ch in flat_trans + and keep == plc.strings.translate.FilterType.KEEP + ): + new_val += ch + elif ( + ch not in flat_trans + and keep == plc.strings.translate.FilterType.REMOVE + ): + new_val += ch + else: + new_val += "*" + exp_data.append(new_val) + expected = pa.array(exp_data) + assert_column_eq(expected, result) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_wrap.py b/python/pylibcudf/pylibcudf/tests/test_string_wrap.py new file mode 100644 index 00000000000..00442d866e9 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_wrap.py @@ -0,0 +1,26 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +import textwrap + +import pyarrow as pa +from utils import assert_column_eq + +import pylibcudf as plc + + +def test_wrap(): + width = 12 + pa_array = pa.array( + [ + "the quick brown fox jumped over the lazy brown dog", + "hello, world", + None, + ] + ) + result = plc.strings.wrap.wrap(plc.interop.from_arrow(pa_array), width) + expected = pa.array( + [ + textwrap.fill(val, width) if isinstance(val, str) else val + for val in pa_array.to_pylist() + ] + ) + assert_column_eq(expected, result) diff --git a/python/pylibcudf/pylibcudf/tests/test_table.py b/python/pylibcudf/pylibcudf/tests/test_table.py index e822d6a97a8..ac39ef4c5c9 100644 --- a/python/pylibcudf/pylibcudf/tests/test_table.py +++ b/python/pylibcudf/pylibcudf/tests/test_table.py @@ -1,9 +1,10 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
import pyarrow as pa -import pylibcudf as plc import pytest +import pylibcudf as plc + @pytest.mark.parametrize( "arrow_tbl", diff --git a/python/pylibcudf/pylibcudf/tests/test_traits.py b/python/pylibcudf/pylibcudf/tests/test_traits.py index 2570e8abd51..2c1708304eb 100644 --- a/python/pylibcudf/pylibcudf/tests/test_traits.py +++ b/python/pylibcudf/pylibcudf/tests/test_traits.py @@ -20,6 +20,11 @@ def test_is_numeric(): assert not plc.traits.is_numeric(plc.DataType(plc.TypeId.LIST)) +def test_is_numeric_not_bool(): + assert plc.traits.is_numeric_not_bool(plc.DataType(plc.TypeId.FLOAT64)) + assert not plc.traits.is_numeric_not_bool(plc.DataType(plc.TypeId.BOOL8)) + + def test_is_index_type(): assert plc.traits.is_index_type(plc.DataType(plc.TypeId.INT8)) assert not plc.traits.is_index_type(plc.DataType(plc.TypeId.BOOL8)) diff --git a/python/pylibcudf/pylibcudf/tests/test_transform.py b/python/pylibcudf/pylibcudf/tests/test_transform.py index d5c618f07e4..49802fe64ac 100644 --- a/python/pylibcudf/pylibcudf/tests/test_transform.py +++ b/python/pylibcudf/pylibcudf/tests/test_transform.py @@ -3,9 +3,10 @@ import math import pyarrow as pa -import pylibcudf as plc from utils import assert_column_eq +import pylibcudf as plc + def test_nans_to_nulls(has_nans): if has_nans: diff --git a/python/pylibcudf/pylibcudf/tests/test_transpose.py b/python/pylibcudf/pylibcudf/tests/test_transpose.py index ac11123f680..b0c0bc72ead 100644 --- a/python/pylibcudf/pylibcudf/tests/test_transpose.py +++ b/python/pylibcudf/pylibcudf/tests/test_transpose.py @@ -1,10 +1,11 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import pyarrow as pa -import pylibcudf as plc import pytest from packaging.version import parse +import pylibcudf as plc + @pytest.mark.skipif( parse(pa.__version__) < parse("16.0.0"), diff --git a/python/pylibcudf/pylibcudf/traits.pyi b/python/pylibcudf/pylibcudf/traits.pyi new file mode 100644 index 00000000000..fdb31a262cf --- /dev/null +++ b/python/pylibcudf/pylibcudf/traits.pyi @@ -0,0 +1,23 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.types import DataType + +def is_relationally_comparable(typ: DataType) -> bool: ... +def is_equality_comparable(typ: DataType) -> bool: ... +def is_numeric(typ: DataType) -> bool: ... +def is_numeric_not_bool(typ: DataType) -> bool: ... +def is_index_type(typ: DataType) -> bool: ... +def is_unsigned(typ: DataType) -> bool: ... +def is_integral(typ: DataType) -> bool: ... +def is_integral_not_bool(typ: DataType) -> bool: ... +def is_floating_point(typ: DataType) -> bool: ... +def is_boolean(typ: DataType) -> bool: ... +def is_timestamp(typ: DataType) -> bool: ... +def is_fixed_point(typ: DataType) -> bool: ... +def is_duration(typ: DataType) -> bool: ... +def is_chrono(typ: DataType) -> bool: ... +def is_dictionary(typ: DataType) -> bool: ... +def is_fixed_width(typ: DataType) -> bool: ... +def is_compound(typ: DataType) -> bool: ... +def is_nested(typ: DataType) -> bool: ... +def is_bit_castable(source: DataType, target: DataType) -> bool: ... 
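A minimal sketch of how these trait predicates are called (assuming only the
public pylibcudf API exercised elsewhere in this diff; the is_bit_castable
result for same-width integer types is an assumption based on libcudf's
bit_cast rules):

    import pylibcudf as plc

    int32 = plc.DataType(plc.TypeId.INT32)
    bool8 = plc.DataType(plc.TypeId.BOOL8)

    assert plc.traits.is_numeric(int32)
    assert plc.traits.is_numeric_not_bool(int32)  # added in this change
    assert not plc.traits.is_numeric_not_bool(bool8)
    # The one two-argument predicate: bit-cast compatibility between types.
    assert plc.traits.is_bit_castable(int32, plc.DataType(plc.TypeId.UINT32))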
diff --git a/python/pylibcudf/pylibcudf/traits.pyx b/python/pylibcudf/pylibcudf/traits.pyx index 5a1c67e1f6c..3cf0a3a4b3b 100644 --- a/python/pylibcudf/pylibcudf/traits.pyx +++ b/python/pylibcudf/pylibcudf/traits.pyx @@ -5,6 +5,27 @@ from pylibcudf.libcudf.utilities cimport traits from .types cimport DataType +__all__ = [ + "is_bit_castable", + "is_boolean", + "is_chrono", + "is_compound", + "is_dictionary", + "is_duration", + "is_equality_comparable", + "is_fixed_point", + "is_fixed_width", + "is_floating_point", + "is_index_type", + "is_integral", + "is_integral_not_bool", + "is_nested", + "is_numeric", + "is_numeric_not_bool", + "is_relationally_comparable", + "is_timestamp", + "is_unsigned", +] cpdef bool is_relationally_comparable(DataType typ): """Checks if the given data type supports relational comparisons. @@ -29,6 +50,12 @@ cpdef bool is_numeric(DataType typ): """ return traits.is_numeric(typ.c_obj) +cpdef bool is_numeric_not_bool(DataType typ): + """Checks if the given data type is numeric excluding booleans. + + For details, see :cpp:func:`is_numeric_not_bool`. + """ + return traits.is_numeric_not_bool(typ.c_obj) cpdef bool is_index_type(DataType typ): """Checks if the given data type is an index type. diff --git a/python/pylibcudf/pylibcudf/transform.pxd b/python/pylibcudf/pylibcudf/transform.pxd index b530f433c97..4fb623158f0 100644 --- a/python/pylibcudf/pylibcudf/transform.pxd +++ b/python/pylibcudf/pylibcudf/transform.pxd @@ -3,6 +3,7 @@ from libcpp cimport bool from pylibcudf.libcudf.types cimport bitmask_type, data_type from .column cimport Column +from .expressions cimport Expression from .gpumemoryview cimport gpumemoryview from .table cimport Table from .types cimport DataType @@ -10,6 +11,8 @@ from .types cimport DataType cpdef tuple[gpumemoryview, int] nans_to_nulls(Column input) +cpdef Column compute_column(Table input, Expression expr) + cpdef tuple[gpumemoryview, int] bools_to_mask(Column input) cpdef Column mask_to_bools(Py_ssize_t bitmask, int begin_bit, int end_bit) diff --git a/python/pylibcudf/pylibcudf/transform.pyi b/python/pylibcudf/pylibcudf/transform.pyi new file mode 100644 index 00000000000..5cbd2e635f0 --- /dev/null +++ b/python/pylibcudf/pylibcudf/transform.pyi @@ -0,0 +1,16 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from pylibcudf.column import Column +from pylibcudf.expressions import Expression +from pylibcudf.gpumemoryview import gpumemoryview +from pylibcudf.table import Table +from pylibcudf.types import DataType + +def nans_to_nulls(input: Column) -> tuple[gpumemoryview, int]: ... +def compute_column(input: Table, expr: Expression) -> Column: ... +def bools_to_mask(input: Column) -> tuple[gpumemoryview, int]: ... +def mask_to_bools(bitmask: int, begin_bit: int, end_bit: int) -> Column: ... +def transform( + input: Column, unary_udf: str, output_type: DataType, is_ptx: bool +) -> Column: ... +def encode(input: Table) -> tuple[Table, Column]: ... +def one_hot_encode(input: Column, categories: Column) -> Table: ... diff --git a/python/pylibcudf/pylibcudf/transform.pyx b/python/pylibcudf/pylibcudf/transform.pyx index de425a27c15..9700bcff221 100644 --- a/python/pylibcudf/pylibcudf/transform.pyx +++ b/python/pylibcudf/pylibcudf/transform.pyx @@ -1,5 +1,6 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
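+# NOTE: the libcudf calls in this module return unique_ptr/pair
+# temporaries; Cython 3 move-constructs when assigning from such an
+# rvalue, which is why no explicit move() wrapper is needed around the
+# calls below.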
+from cython.operator cimport dereference from libcpp.memory cimport unique_ptr from libcpp.string cimport string from libcpp.utility cimport move, pair @@ -9,13 +10,23 @@ from pylibcudf.libcudf.table.table cimport table from pylibcudf.libcudf.table.table_view cimport table_view from pylibcudf.libcudf.types cimport bitmask_type, size_type -from rmm._lib.device_buffer cimport DeviceBuffer, device_buffer +from rmm.librmm.device_buffer cimport device_buffer +from rmm.pylibrmm.device_buffer cimport DeviceBuffer from .column cimport Column from .gpumemoryview cimport gpumemoryview from .types cimport DataType from .utils cimport int_to_bitmask_ptr +__all__ = [ + "bools_to_mask", + "compute_column", + "encode", + "mask_to_bools", + "nans_to_nulls", + "one_hot_encode", + "transform", +] cpdef tuple[gpumemoryview, int] nans_to_nulls(Column input): """Create a null mask preserving existing nulls and converting nans to null. @@ -34,7 +45,7 @@ cpdef tuple[gpumemoryview, int] nans_to_nulls(Column input): cdef pair[unique_ptr[device_buffer], size_type] c_result with nogil: - c_result = move(cpp_transform.nans_to_nulls(input.view())) + c_result = cpp_transform.nans_to_nulls(input.view()) return ( gpumemoryview(DeviceBuffer.c_from_unique_ptr(move(c_result.first))), @@ -42,6 +53,32 @@ cpdef tuple[gpumemoryview, int] nans_to_nulls(Column input): ) +cpdef Column compute_column(Table input, Expression expr): + """Create a column by evaluating an expression on a table. + + For details see :cpp:func:`compute_column`. + + Parameters + ---------- + input : Table + Table used for expression evaluation + expr : Expression + Expression to evaluate + + Returns + ------- + Column of the evaluated expression + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_transform.compute_column( + input.view(), dereference(expr.c_obj.get()) + ) + + return Column.from_libcudf(move(c_result)) + + cpdef tuple[gpumemoryview, int] bools_to_mask(Column input): """Create a bitmask from a column of boolean elements @@ -58,7 +95,7 @@ cpdef tuple[gpumemoryview, int] bools_to_mask(Column input): cdef pair[unique_ptr[device_buffer], size_type] c_result with nogil: - c_result = move(cpp_transform.bools_to_mask(input.view())) + c_result = cpp_transform.bools_to_mask(input.view()) return ( gpumemoryview(DeviceBuffer.c_from_unique_ptr(move(c_result.first))), @@ -87,7 +124,7 @@ cpdef Column mask_to_bools(Py_ssize_t bitmask, int begin_bit, int end_bit): cdef bitmask_type * bitmask_ptr = int_to_bitmask_ptr(bitmask) with nogil: - c_result = move(cpp_transform.mask_to_bools(bitmask_ptr, begin_bit, end_bit)) + c_result = cpp_transform.mask_to_bools(bitmask_ptr, begin_bit, end_bit) return Column.from_libcudf(move(c_result)) @@ -118,10 +155,8 @@ cpdef Column transform(Column input, str unary_udf, DataType output_type, bool i cdef bool c_is_ptx = is_ptx with nogil: - c_result = move( - cpp_transform.transform( - input.view(), c_unary_udf, output_type.c_obj, c_is_ptx - ) + c_result = cpp_transform.transform( + input.view(), c_unary_udf, output_type.c_obj, c_is_ptx ) return Column.from_libcudf(move(c_result)) @@ -143,7 +178,7 @@ cpdef tuple[Table, Column] encode(Table input): cdef pair[unique_ptr[table], unique_ptr[column]] c_result with nogil: - c_result = move(cpp_transform.encode(input.view())) + c_result = cpp_transform.encode(input.view()) return ( Table.from_libcudf(move(c_result.first)), @@ -171,7 +206,7 @@ cpdef Table one_hot_encode(Column input, Column categories): cdef Table owner_table with nogil: - c_result = 
move(cpp_transform.one_hot_encode(input.view(), categories.view())) + c_result = cpp_transform.one_hot_encode(input.view(), categories.view()) owner_table = Table( [Column.from_libcudf(move(c_result.first))] * c_result.second.num_columns() diff --git a/python/pylibcudf/pylibcudf/transpose.pyi b/python/pylibcudf/pylibcudf/transpose.pyi new file mode 100644 index 00000000000..a84ab8a60ea --- /dev/null +++ b/python/pylibcudf/pylibcudf/transpose.pyi @@ -0,0 +1,4 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from pylibcudf.table import Table + +def transpose(input_table: Table) -> Table: ... diff --git a/python/pylibcudf/pylibcudf/transpose.pyx b/python/pylibcudf/pylibcudf/transpose.pyx index a708f6cc37f..5eb3e58cebc 100644 --- a/python/pylibcudf/pylibcudf/transpose.pyx +++ b/python/pylibcudf/pylibcudf/transpose.pyx @@ -9,6 +9,7 @@ from pylibcudf.libcudf.table.table_view cimport table_view from .column cimport Column from .table cimport Table +__all__ = ["transpose"] cpdef Table transpose(Table input_table): """Transpose a Table. @@ -29,7 +30,7 @@ cpdef Table transpose(Table input_table): cdef Table owner_table with nogil: - c_result = move(cpp_transpose.transpose(input_table.view())) + c_result = cpp_transpose.transpose(input_table.view()) owner_table = Table( [Column.from_libcudf(move(c_result.first))] * c_result.second.num_columns() diff --git a/python/pylibcudf/pylibcudf/types.pyi b/python/pylibcudf/pylibcudf/types.pyi new file mode 100644 index 00000000000..c91a95414bd --- /dev/null +++ b/python/pylibcudf/pylibcudf/types.pyi @@ -0,0 +1,86 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from enum import IntEnum +from typing import Final + +class Interpolation(IntEnum): + LINEAR = ... + LOWER = ... + HIGHER = ... + MIDPOINT = ... + NEAREST = ... + +class MaskState(IntEnum): + UNALLOCATED = ... + UNINITIALIZED = ... + ALL_VALID = ... + ALL_NULL = ... + +class NanEquality(IntEnum): + ALL_EQUAL = ... + UNEQUAL = ... + +class NanPolicy(IntEnum): + NAN_IS_NULL = ... + NAN_IS_VALID = ... + +class NullEquality(IntEnum): + EQUAL = ... + UNEQUAL = ... + +class NullOrder(IntEnum): + AFTER = ... + BEFORE = ... + +class NullPolicy(IntEnum): + EXCLUDE = ... + INCLUDE = ... + +class Order(IntEnum): + ASCENDING = ... + DESCENDING = ... + +class Sorted(IntEnum): + NO = ... + YES = ... + +class TypeId(IntEnum): + EMPTY = ... + INT8 = ... + INT16 = ... + INT32 = ... + INT64 = ... + UINT8 = ... + UINT16 = ... + UINT32 = ... + UINT64 = ... + FLOAT32 = ... + FLOAT64 = ... + BOOL8 = ... + TIMESTAMP_DAYS = ... + TIMESTAMP_SECONDS = ... + TIMESTAMP_MILLISECONDS = ... + TIMESTAMP_MICROSECONDS = ... + TIMESTAMP_NANOSECONDS = ... + DURATION_DAYS = ... + DURATION_SECONDS = ... + DURATION_MILLISECONDS = ... + DURATION_MICROSECONDS = ... + DURATION_NANOSECONDS = ... + DICTIONARY32 = ... + STRING = ... + LIST = ... + DECIMAL32 = ... + DECIMAL64 = ... + DECIMAL128 = ... + STRUCT = ... + NUM_TYPE_IDS = ... + +class DataType: + def __init__(self, type_id: TypeId, scale: int = 0): ... + def id(self) -> TypeId: ... + def scale(self) -> int: ... + +def size_of(t: DataType) -> int: ... 
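+# size_of is defined only for fixed-width types, e.g.
+# size_of(DataType(TypeId.INT32)) == 4; variable-width types such as
+# STRING are not supported.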
+ +SIZE_TYPE: Final[DataType] +SIZE_TYPE_ID: Final[TypeId] diff --git a/python/pylibcudf/pylibcudf/types.pyx b/python/pylibcudf/pylibcudf/types.pyx index 58c7d97e9bc..afa1b56f38a 100644 --- a/python/pylibcudf/pylibcudf/types.pyx +++ b/python/pylibcudf/pylibcudf/types.pyx @@ -20,6 +20,22 @@ from pylibcudf.libcudf.types import null_order as NullOrder # no-cython-lint, i from pylibcudf.libcudf.types import order as Order # no-cython-lint, isort:skip from pylibcudf.libcudf.types import sorted as Sorted # no-cython-lint, isort:skip +__all__ = [ + "DataType", + "Interpolation", + "MaskState", + "NanEquality", + "NanPolicy", + "NullEquality", + "NullOrder", + "NullPolicy", + "Order", + "SIZE_TYPE", + "SIZE_TYPE_ID", + "Sorted", + "TypeId", + "size_of" +] cdef class DataType: """Indicator for the logical data type of an element in a column. @@ -79,6 +95,16 @@ cpdef size_type size_of(DataType t): Only fixed-width types are supported. For details, see :cpp:func:`size_of`. + + Parameters + ---------- + t : DataType + The DataType to get the size of. + + Returns + ------- + int + Size in bytes of an element of the specified type. """ with nogil: return cpp_size_of(t.c_obj) diff --git a/python/pylibcudf/pylibcudf/unary.pyi b/python/pylibcudf/pylibcudf/unary.pyi new file mode 100644 index 00000000000..7aa23b618f4 --- /dev/null +++ b/python/pylibcudf/pylibcudf/unary.pyi @@ -0,0 +1,38 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from enum import IntEnum + +from pylibcudf.column import Column +from pylibcudf.types import DataType + +class UnaryOperator(IntEnum): + SIN = ... + COS = ... + TAN = ... + ARCSIN = ... + ARCCOS = ... + ARCTAN = ... + SINH = ... + COSH = ... + TANH = ... + ARCSINH = ... + ARCCOSH = ... + ARCTANH = ... + EXP = ... + LOG = ... + SQRT = ... + CBRT = ... + CEIL = ... + FLOOR = ... + ABS = ... + RINT = ... + BIT_INVERT = ... + NOT = ... + +def unary_operation(input: Column, op: UnaryOperator) -> Column: ... +def is_null(input: Column) -> Column: ... +def is_valid(input: Column) -> Column: ... +def cast(input: Column, data_type: DataType) -> Column: ... +def is_nan(input: Column) -> Column: ... +def is_not_nan(input: Column) -> Column: ... +def is_supported_cast(from_: DataType, to: DataType) -> bool: ... diff --git a/python/pylibcudf/pylibcudf/unary.pyx b/python/pylibcudf/pylibcudf/unary.pyx index 839360ef406..b738ab53d1b 100644 --- a/python/pylibcudf/pylibcudf/unary.pyx +++ b/python/pylibcudf/pylibcudf/unary.pyx @@ -13,6 +13,16 @@ from pylibcudf.libcudf.unary import \ from .column cimport Column from .types cimport DataType +__all__ = [ + "UnaryOperator", + "cast", + "is_nan", + "is_not_nan", + "is_null", + "is_supported_cast", + "is_valid", + "unary_operation", +] cpdef Column unary_operation(Column input, unary_operator op): """Perform a unary operation on a column. 
@@ -34,7 +44,7 @@ cpdef Column unary_operation(Column input, unary_operator op): cdef unique_ptr[column] result with nogil: - result = move(cpp_unary.unary_operation(input.view(), op)) + result = cpp_unary.unary_operation(input.view(), op) return Column.from_libcudf(move(result)) @@ -57,7 +67,7 @@ cpdef Column is_null(Column input): cdef unique_ptr[column] result with nogil: - result = move(cpp_unary.is_null(input.view())) + result = cpp_unary.is_null(input.view()) return Column.from_libcudf(move(result)) @@ -80,7 +90,7 @@ cpdef Column is_valid(Column input): cdef unique_ptr[column] result with nogil: - result = move(cpp_unary.is_valid(input.view())) + result = cpp_unary.is_valid(input.view()) return Column.from_libcudf(move(result)) @@ -105,7 +115,7 @@ cpdef Column cast(Column input, DataType data_type): cdef unique_ptr[column] result with nogil: - result = move(cpp_unary.cast(input.view(), data_type.c_obj)) + result = cpp_unary.cast(input.view(), data_type.c_obj) return Column.from_libcudf(move(result)) @@ -128,7 +138,7 @@ cpdef Column is_nan(Column input): cdef unique_ptr[column] result with nogil: - result = move(cpp_unary.is_nan(input.view())) + result = cpp_unary.is_nan(input.view()) return Column.from_libcudf(move(result)) @@ -151,7 +161,7 @@ cpdef Column is_not_nan(Column input): cdef unique_ptr[column] result with nogil: - result = move(cpp_unary.is_not_nan(input.view())) + result = cpp_unary.is_not_nan(input.view()) return Column.from_libcudf(move(result)) diff --git a/python/pylibcudf/pyproject.toml b/python/pylibcudf/pyproject.toml index 143e538091f..e83db47830c 100644 --- a/python/pylibcudf/pyproject.toml +++ b/python/pylibcudf/pyproject.toml @@ -18,12 +18,13 @@ authors = [ license = { text = "Apache 2.0" } requires-python = ">=3.10" dependencies = [ - "cuda-python>=11.7.1,<12.0a0", - "libcudf==24.10.*", + "cuda-python>=11.7.1,<12.0a0,<=11.8.3", + "libcudf==24.12.*,>=0.0.0a0", "nvtx>=0.2.1", "packaging", - "pyarrow>=14.0.0,<18.0.0a0", - "rmm==24.10.*", + "pyarrow>=14.0.0,<19.0.0a0,!=17.0.0; platform_machine=='aarch64'", + "pyarrow>=14.0.0,<19.0.0a0; platform_machine=='x86_64'", + "rmm==24.12.*,>=0.0.0a0", "typing_extensions>=4.0.0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ @@ -52,48 +53,58 @@ test = [ Homepage = "https://github.com/rapidsai/cudf" Documentation = "https://docs.rapids.ai/api/cudf/stable/" -[tool.isort] -line_length = 79 -multi_line_output = 3 -include_trailing_comma = true -force_grid_wrap = 0 -combine_as_imports = true -order_by_type = true -known_dask = [ - "dask", - "distributed", - "dask_cuda", -] -known_rapids = [ - "rmm", +[tool.ruff] +extend = "../../pyproject.toml" + +[tool.ruff.lint] +extend-select = [ + "TCH", # flake8-type-checking + "TID", # flake8-tidy-imports + "PT", # flake8-pytest-style ] -known_first_party = [ - "cudf", +extend-ignore = [ + "PT011", # pytest.raises(...) 
is too broad
 ]
+
+[tool.ruff.lint.flake8-pytest-style]
+# https://docs.astral.sh/ruff/settings/#lintflake8-pytest-style
+fixture-parentheses = false
+mark-parentheses = false
+parametrize-names-type = "csv"
+parametrize-values-type = "list"
+parametrize-values-row-type = "tuple"
+
+[tool.ruff.lint.isort]
+combine-as-imports = true
+known-first-party = ["pylibcudf"]
+section-order = ["future", "standard-library", "third-party", "rapids", "first-party", "local-folder"]
+
+[tool.ruff.lint.isort.sections]
+rapids = ["rmm"]
+
+[tool.ruff.lint.per-file-ignores]
+"__init__.py" = ["E402", "F401"]
+
+[tool.pydistcheck]
+select = [
+    "distro-too-large-compressed",
 ]
-skip = [
-    "thirdparty",
-    ".eggs",
-    ".git",
-    ".hg",
-    ".mypy_cache",
-    ".tox",
-    ".venv",
-    "_build",
-    "buck-out",
-    "build",
-    "dist",
-    "__init__.py",
+
+# The PyPI limit is 100 MiB; fail CI before we get too close to it
+max_allowed_size_compressed = '75M'
+
+[tool.pytest.ini_options]
+# --import-mode=importlib because two test_json.py files exist and the tests directory is not a structured module
+addopts = "--tb=native --strict-config --strict-markers --import-mode=importlib"
+empty_parameter_set_mark = "fail_at_collect"
+filterwarnings = [
+    "error",
+    # https://github.com/rapidsai/build-planning/issues/116
+    "ignore:.*cuda..* module is deprecated.*:DeprecationWarning",
+    "ignore:::.*xdist.*",
+    "ignore:::.*pytest.*"
 ]
+xfail_strict = true
 
 [tool.rapids-build-backend]
 build-backend = "scikit_build_core.build"
@@ -102,10 +113,10 @@ matrix-entry = "cuda_suffixed=true"
 requires = [
     "cmake>=3.26.4,!=3.30.0",
     "cython>=3.0.3",
-    "libcudf==24.10.*",
-    "librmm==24.10.*",
+    "libcudf==24.12.*,>=0.0.0a0",
+    "librmm==24.12.*,>=0.0.0a0",
     "ninja",
-    "rmm==24.10.*",
+    "rmm==24.12.*,>=0.0.0a0",
 ]  # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 
 [tool.scikit-build]
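To tie the transform changes together, a minimal sketch of the new
compute_column entry point; the expressions API used here (ColumnReference,
Operation, ASTOperator from pylibcudf.expressions) is assumed from the
existing codebase rather than shown in this diff:

    import pyarrow as pa

    import pylibcudf as plc

    # Evaluate a + b over a two-column table.
    table = plc.interop.from_arrow(
        pa.table({"a": [1, 2, 3], "b": [10, 20, 30]})
    )
    expr = plc.expressions.Operation(
        plc.expressions.ASTOperator.ADD,
        plc.expressions.ColumnReference(0),  # column 0 ("a")
        plc.expressions.ColumnReference(1),  # column 1 ("b")
    )
    result = plc.transform.compute_column(table, expr)
    assert plc.interop.to_arrow(result).to_pylist() == [11, 22, 33]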